In [None]:
import pandas as pd
import numpy as np
import yfinance as yf
from datetime import datetime
import matplotlib.pyplot as plt

In [None]:
# Download window (ISO dates) and the Yahoo Finance symbols to fetch.
start_date, end_date = '2020-01-01', '2025-05-01'
tickers = [f'{coin}-USD' for coin in ('BTC', 'ETH')]

In [None]:
# Fetch the price history for every ticker over the configured window.
# start/end are passed by keyword so the call does not rely on argument order.
data = yf.download(tickers, start=start_date, end=end_date)

In [None]:
# Turn the current index into an ordinary column so it can be renamed and
# exported with the rest of the data (presumably the 'Date' index that the
# rename in the next cell targets -- confirm against the download output).
# NOTE: not idempotent -- re-running this cell adds another index column.
data = data.reset_index()
data.head()

In [None]:
# Shorten / lower-case the labels that come back from the download:
# the date column, the two ticker symbols and the price fields.
column_aliases = {
    'Date': 'date',
    'BTC-USD': 'btc',
    'ETH-USD': 'eth',
    **{field: field.lower() for field in ('Close', 'Open', 'High', 'Low', 'Volume')},
}
data = data.rename(columns=column_aliases)
data.head()

In [None]:
# Collapse the two-level column labels into single strings by joining the
# levels with '_' (e.g. ('close', 'btc') -> 'close_btc'). A label whose second
# level is empty keeps a trailing underscore, e.g. ('date', '') -> 'date_'.
# The MultiIndex guard keeps the cell safe to re-run: joining already-flat
# string labels would split them into single characters.
if isinstance(data.columns, pd.MultiIndex):
    data.columns = ['_'.join(col).strip() for col in data.columns]
data.head()

In [None]:
# Inspect the dtype of every column after flattening.
data.dtypes

In [None]:
# Parse the flattened date column into real datetimes.
# The column deliberately keeps the name 'date_': the cells that reload the
# exported CSV parse and index on 'date_', so renaming it to 'date' here made
# those cells fail with a KeyError on a fresh Restart & Run All.
data['date_'] = pd.to_datetime(data['date_'])
data.head(6)

In [None]:
# Persist the prepared frame to disk, leaving out the positional index.
csv_path = 'eth_btc_5_year.csv'
data.to_csv(csv_path, index=False)
print('Saved to csv file')

In [None]:
# Reload the exported CSV and convert the 'date_' column back to datetimes
# (read_csv does not parse dates unless asked to).
df = (
    pd.read_csv('eth_btc_5_year.csv')
    .assign(date_=lambda frame: pd.to_datetime(frame['date_']))
)
df.head()

In [None]:
# Overwrite a few early rows with NaN / 0 -- apparently to give the cleaning
# cells below something to repair. .loc slices are label-based and inclusive.
missing_close_rows = slice(1, 5)
zero_volume_rows = slice(2, 4)
df.loc[missing_close_rows, 'close_btc'] = np.nan
df.loc[zero_volume_rows, 'volume_eth'] = 0
df.head(6)

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Treat every 0 in the frame as a missing value, then recount the gaps per column.
df = df.replace(to_replace=0, value=np.nan)
df.isna().sum()

In [None]:
# Fill the gaps by linear interpolation (pandas' default method, spelled out here).
df = df.interpolate(method='linear')
df.head(6)

In [None]:
# Recount the missing values per column to confirm what interpolation filled.
df.isna().sum()

In [None]:
# Inspect the dtype of every column of the cleaned frame.
df.dtypes

In [None]:
# Make the parsed date column the index; the plotting cells use df.index as the x-axis.
# NOTE: not idempotent -- a second run fails because 'date_' is no longer a column.
df = df.set_index('date_')

In [None]:
# Summary statistics for the numeric columns. The parentheses matter:
# a bare `df.describe` only displays the bound method, not the statistics.
df.describe()

In [None]:
# Draw both volume series on one set of axes.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(df.index, df['volume_btc'], label='BTC Volume', color='blue')
ax.plot(df.index, df['volume_eth'], label='ETH Volume', color='orange')

ax.set_title('BTC and ETH Trading Volumes (2020 - 01) after interpolation')
ax.set_xlabel('Date')
ax.set_ylabel('Volume')
ax.legend()
ax.grid(True)

# Write the figure to disk before showing it, then release it to free memory.
fig.savefig('btc_eth_volumes_interpolated.png')
plt.show()
plt.close(fig)

In [None]:
# Largest BTC trading volume in the frame.
max_volume = df['volume_btc'].max()
max_volume

In [None]:
# First index label whose BTC volume equals max_volume.
is_peak = df['volume_btc'].eq(max_volume)
max_volume_date = df.index[is_peak][0]
max_volume_date

In [None]:
# Use IQR to detect outliers in volume_btc: a row is kept when its volume lies
# within 1.5 * IQR of the first and third quartiles.
Q1 = df['volume_btc'].quantile(0.25)
Q3 = df['volume_btc'].quantile(0.75)
IQR = Q3 - Q1

# The singular names are the ones read by the filter below and by the summary
# cell; the earlier plural-only spelling left them undefined (NameError).
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
# Plural aliases are kept in case other cells still refer to them.
lower_bounds, upper_bounds = lower_bound, upper_bound

df_no_outliers = df[(df['volume_btc'] >= lower_bound) & (df['volume_btc'] <= upper_bound)]
df_no_outliers.head()

In [None]:
# Report the frame shape before and after the IQR filter, then list every flagged row.
print("Original DataFrame shape:", df.shape)
print("DataFrame shape after removing outliers:", df_no_outliers.shape)
print("\nOutliers removed (if any):")

below_range = df['volume_btc'].lt(lower_bound)
above_range = df['volume_btc'].gt(upper_bound)
outliers = df[below_range | above_range]

if outliers.empty:
    print("No outliers detected.")
else:
    for date, row in outliers.iterrows():
        print(f"Date: {date}, Volume: {row['volume_btc']}")

In [None]:
# Overlay the original and the outlier-free BTC volume, plus ETH volume for context.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(df.index, df['volume_btc'], label='Original BTC Volume', color='blue', alpha=0.5)
ax.plot(
    df_no_outliers.index,
    df_no_outliers['volume_btc'],
    label='BTC Volume (No Outliers)',
    color='green',
)
ax.plot(df.index, df['volume_eth'], label='ETH Volume', color='orange', alpha=0.5)

# Mark the flagged points on top of the lines (zorder) when there are any.
if not outliers.empty:
    ax.scatter(outliers.index, outliers['volume_btc'], color='red', label='Outliers', zorder=5)

ax.set_title('BTC and ETH Trading Volumes (2020-01) with Outlier Analysis')
ax.set_xlabel('Date')
ax.set_ylabel('Volume')
ax.legend()
ax.grid(True)

fig.savefig('btc_eth_volumes_outlier_analysis.png')
plt.show()
plt.close(fig)

In [None]:
# Structure (row count, dtypes, non-null counts) of the outlier-free frame.
df_no_outliers.info()

In [None]:
# Structure (row count, dtypes, non-null counts) of the full frame, for comparison.
df.info()