In [None]:
# We import necessary libraries for data handling, preprocessing, modeling, evaluation, and visualization.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Project title and team roster, emitted as markdown-formatted text.
# Lines are kept as separate literals so the trailing double spaces
# (markdown hard line breaks) stay visible and cannot be stripped by an editor.
title_lines = [
    "",
    "# Bitcoin Price Prediction Project",
    "**Team Members**:  ",
    "- John Doe (Data Analyst)  ",
    "- Jane Smith (ML Engineer)",
    "",
]
print("\n".join(title_lines))

In [None]:
# Introduction and problem statement, emitted as markdown-formatted text.
# Each source line is its own literal; the trailing spaces are part of the output.
intro_lines = [
    "",
    "## Introduction & Problem Statement",
    "Bitcoin is a highly volatile cryptocurrency, and predicting its price can help investors make better decisions. ",
    "This project uses simple machine learning models to predict Bitcoin's daily closing price using historical data ",
    "from 2014 to 2024. As beginners, we focus on easy-to-understand algorithms like Linear Regression, Decision Tree, ",
    "and Random Forest to learn the basics of machine learning.",
    "",
]
print("\n".join(intro_lines))

In [None]:
# Load the price history from disk and print a short markdown-style description of it.
df = pd.read_csv('BTC-USD.csv')

n_rows, n_cols = df.shape
column_list = ', '.join(df.columns)
description_lines = [
    "## Dataset Description",
    "- **Source**: BTC-USD.csv (historical Bitcoin prices in USD)",
    f"- **Rows**: {n_rows}, **Columns**: {n_cols}",
    f"- **Columns**: {column_list}",
    "- **Target Variable**: Close (daily closing price in USD)",
    "\nFirst 5 rows:",
]
print("\n".join(description_lines))
print(df.head())

In [None]:
print("## Data Preprocessing")
# Build the cleaned frame in a single chain so the cell can be re-run safely:
#   * dropna()            - discard rows containing any missing value
#   * assign(Date=...)    - parse the date strings without writing into the dropna() result in place
#   * drop(errors=ignore) - 'Adj Close' is already gone on a second run; do not raise KeyError
#   * sort_values('Date') - the rolling averages computed later assume chronological row order
df = (
    df.dropna()
      .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
      .drop(columns=['Adj Close'], errors='ignore')
      .sort_values('Date')
      .reset_index(drop=True)
)
print(f"After cleaning, dataset has {df.shape[0]} rows.")
print("Missing values:", df.isnull().sum().sum())

In [None]:
print("## Feature Engineering")
# Calendar components of each row's date.
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day

# Simple moving averages of the closing price over the PREVIOUS 7 and 14 rows.
# The one-row shift matters: 'Close' is the prediction target, and an unshifted
# rolling window contains the current row's Close, which would leak the answer
# into the features and inflate every model's score.
prior_close = df['Close'].shift(1)
df['SMA7'] = prior_close.rolling(window=7).mean()
df['SMA14'] = prior_close.rolling(window=14).mean()

# The first rows have incomplete windows (NaN averages) and are removed here.
# NOTE(review): re-running this cell trims further rows from the already-trimmed
# frame; re-run the preprocessing cell first when iterating.
df = df.dropna()
print("New features added: Year, Month, Day, SMA7, SMA14")
print("Dataset shape:", df.shape)

In [None]:
print("## Visualization: Closing Price Over Time")
# Line chart of the closing price against date, drawn on an explicit figure/axes
# pair and written to closing_price.png before being shown.
price_fig, price_ax = plt.subplots(figsize=(10, 6))
price_ax.plot(df['Date'], df['Close'], color='blue', label='Closing Price')
price_ax.set_title('Bitcoin Closing Price (2014-2024)')
price_ax.set_xlabel('Date')
price_ax.set_ylabel('Price (USD)')
price_ax.legend()
price_ax.grid()
price_fig.savefig('closing_price.png')
plt.show()

In [None]:
print("## Visualization: Yearly Average Closing Price")
# Mean closing price per calendar year, kept as a two-column frame (Year, Close).
yearly_avg = df.groupby('Year', as_index=False)['Close'].mean()

# Bar chart of those yearly means, written to yearly_avg_price.png before being shown.
yearly_fig, yearly_ax = plt.subplots(figsize=(10, 6))
yearly_ax.bar(yearly_avg['Year'], yearly_avg['Close'], color='green')
yearly_ax.set_title('Yearly Average Bitcoin Closing Price')
yearly_ax.set_xlabel('Year')
yearly_ax.set_ylabel('Average Price (USD)')
yearly_ax.grid()
yearly_fig.savefig('yearly_avg_price.png')
plt.show()

In [None]:
print("## Model Selection & Training Process")
# Predictor columns; the target is the same row's closing price.
# NOTE(review): 'High' and 'Low' are values from the same row as the target 'Close';
# confirm they are meant to be available at prediction time.
features = ['Open', 'High', 'Low', 'Volume', 'Year', 'Month', 'Day', 'SMA7', 'SMA14']

# Time-ordered data must be split chronologically. A shuffled split scatters future
# days into the training set, so the model is evaluated on days surrounded by rows it
# has already seen (look-ahead leakage). Sorting by date and disabling the shuffle
# makes the test set the most recent 20% of rows.
df_by_date = df.sort_values('Date')
X = df_by_date[features]
y = df_by_date['Close']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
print("Features used:", features)
print("Training set size:", X_train.shape[0])
print("Testing set size:", X_test.shape[0])

In [None]:
# Candidate regressors, keyed by the display name used in printed output and plots.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
}

# predictions: display name -> test-set predictions
# metrics:     one {'Model', 'R2', 'RMSE'} record per model, in training order
predictions, metrics = {}, []
for label, estimator in models.items():
    estimator.fit(X_train, y_train)
    held_out_pred = estimator.predict(X_test)
    predictions[label] = held_out_pred

    score_r2 = r2_score(y_test, held_out_pred)
    # RMSE is the square root of the mean squared error.
    score_rmse = np.sqrt(mean_squared_error(y_test, held_out_pred))
    metrics.append(dict(Model=label, R2=score_r2, RMSE=score_rmse))
    print(f"{label} - R2: {score_r2:.4f}, RMSE: {score_rmse:.2f}")

In [None]:
print("## Visualization: Actual vs Predicted Prices")
# End points of the reference diagonal depend only on the test targets,
# so they are computed once rather than on every iteration.
actual_lo, actual_hi = y_test.min(), y_test.max()

for model_name, predicted in predictions.items():
    # File-name fragment: lower-cased model name with spaces turned into underscores.
    slug = model_name.lower().replace(" ", "_")

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(y_test, predicted, alpha=0.5, color='purple')
    # Dashed red y = x line: a point on it is a prediction equal to the actual value.
    ax.plot([actual_lo, actual_hi], [actual_lo, actual_hi], 'r--')
    ax.set_title(f'Actual vs Predicted Prices ({model_name})')
    ax.set_xlabel('Actual Price (USD)')
    ax.set_ylabel('Predicted Price (USD)')
    ax.grid()
    fig.savefig(f'actual_vs_pred_{slug}.png')
    plt.show()

In [None]:
print("## Results & Model Comparison")
# One row per model with its R2 and RMSE, built from the records collected during training.
metrics_df = pd.DataFrame(metrics)

# Both comparison charts share the same layout; only these five settings differ.
# (metric column, bar colour, title, y-axis label, output file)
comparison_charts = (
    ('R2', 'skyblue', 'Model Performance (R² Score)', 'R² Score', 'model_r2_comparison.png'),
    ('RMSE', 'salmon', 'Model Performance (RMSE)', 'RMSE (USD)', 'model_rmse_comparison.png'),
)
for metric_col, bar_color, chart_title, y_label, out_file in comparison_charts:
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(metrics_df['Model'], metrics_df[metric_col], color=bar_color)
    ax.set_title(chart_title)
    ax.set_ylabel(y_label)
    ax.grid()
    fig.savefig(out_file)
    plt.show()

In [None]:
# Every number and model name printed here is computed from `df` and `metrics`,
# so the written conclusions cannot drift away from what the notebook actually
# measured (hard-coded findings go stale as soon as the data, features or split change).
insight_lines = ["", "## Insights & Conclusion"]

# Price trend: earliest row in the working frame versus the row with the highest close.
if not df.empty:
    earliest = df.loc[df['Date'].idxmin()]
    peak = df.loc[df['Close'].idxmax()]
    insight_lines.append(
        f"- **Price Trends**: Bitcoin's closing price grew from ~${earliest['Close']:,.0f} in "
        f"{earliest['Date'].year} to a peak of ~${peak['Close']:,.0f} in {peak['Date'].year}, "
        "with notable volatility."
    )

# Model ranking. All models are scored on the same test set, so ordering by
# descending R2 is the same ordering as ascending RMSE.
ranked = sorted(metrics, key=lambda entry: entry['R2'], reverse=True)
if ranked:
    leaderboard = ", followed by ".join(
        f"{entry['Model']} (R² {entry['R2']:.4f}, RMSE {entry['RMSE']:,.2f})"
        for entry in ranked
    )
    insight_lines.append(
        f"- **Model Performance**: Best to worst on the test set: {leaderboard}."
    )

# Highest close within calendar year 2023; skipped when the data has no 2023 rows.
rows_2023 = df[df['Year'] == 2023]
if not rows_2023.empty:
    top_2023 = rows_2023.loc[rows_2023['Close'].idxmax()]
    insight_lines.append(
        f"- **Interesting Fact**: The highest closing price in 2023 was "
        f"${top_2023['Close']:,.0f} on {top_2023['Date']:%B} {top_2023['Date'].day}."
    )

if ranked:
    insight_lines.append(
        "- **Conclusion**: Simple ML models can predict Bitcoin prices with reasonable accuracy; "
        f"{ranked[0]['Model']} scored best in this run."
    )

insight_lines.append("")
print("\n".join(insight_lines))

In [None]:
# Closing section: possible follow-up work, emitted as markdown-formatted text.
future_scope_lines = [
    "",
    "## Future Scope & Possible Improvements",
    "- **More Features**: Include external factors like market sentiment or macroeconomic indicators.",
    "- **Hyperparameter Tuning**: Optimize Random Forest parameters (e.g., number of trees) for better accuracy.",
    "- **Advanced Models**: Try gradient boosting algorithms like XGBoost after mastering basics.",
    "- **Real-Time Data**: Use live data for more practical predictions.",
    "",
]
print("\n".join(future_scope_lines))