<a href="https://colab.research.google.com/github/Ritikkumar9828155803/assignments/blob/main/primetradeai%20assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import libraries

In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from datetime import datetime

In [13]:
import warnings
# Suppress every warning for the rest of the session so cell output stays short.
# NOTE(review): this is a blanket filter with no category or module restriction,
# so warnings raised by pandas / scikit-learn in later cells are hidden as well.
warnings.filterwarnings('ignore')

Uploading dataset and data cleaning

In [14]:
def load_and_clean_data(trader_path='historical data.csv', sentiment_path='fear_greed index.csv'):
    """Build one row per trading day from BUY trades, joined with market sentiment.

    Parameters
    ----------
    trader_path : str, optional
        CSV with the columns 'Timestamp IST' (``%d-%m-%Y %H:%M``), 'Size USD',
        'Execution Price', 'Fee' and 'Side'.
    sentiment_path : str, optional
        CSV with the columns 'date' (``%Y-%m-%d``) and 'classification'.

    Returns
    -------
    pandas.DataFrame
        Columns: 'Date', 'total_buy_volume', 'avg_execution_price',
        'total_fees', 'trade_count', 'sentiment_value'. Days without a usable
        sentiment value are imputed as Neutral (3).
    """
    trader_data = pd.read_csv(trader_path)

    trader_data['Timestamp IST'] = pd.to_datetime(trader_data['Timestamp IST'], format='%d-%m-%Y %H:%M')

    # Unparseable numbers become 0 so the daily sums stay defined.
    # NOTE(review): for 'Execution Price' a 0 also enters the daily mean and
    # pulls it down; consider dropping such rows instead.
    for column in ('Size USD', 'Execution Price', 'Fee'):
        trader_data[column] = pd.to_numeric(trader_data[column], errors='coerce').fillna(0)

    # .copy() makes the filtered frame independent, so adding 'Date' below does
    # not write into a slice of trader_data (SettingWithCopy).
    buy_trades = trader_data[trader_data['Side'] == 'BUY'].copy()
    buy_trades['Date'] = buy_trades['Timestamp IST'].dt.date

    aggregated_data = buy_trades.groupby('Date').agg({
        'Size USD': 'sum',
        'Execution Price': 'mean',
        'Fee': 'sum',
        'Timestamp IST': 'count'
    }).rename(columns={
        'Size USD': 'total_buy_volume',
        'Execution Price': 'avg_execution_price',
        'Fee': 'total_fees',
        'Timestamp IST': 'trade_count'
    }).reset_index()

    sentiment_data = pd.read_csv(sentiment_path)
    sentiment_data['date'] = pd.to_datetime(sentiment_data['date'], format='%Y-%m-%d')
    sentiment_data['Date'] = sentiment_data['date'].dt.date

    # Ordinal encoding of the index label; labels outside this map become NaN.
    sentiment_map = {
        'Extreme Fear': 1,
        'Fear': 2,
        'Neutral': 3,
        'Greed': 4,
        'Extreme Greed': 5
    }
    sentiment_data['sentiment_value'] = sentiment_data['classification'].map(sentiment_map)

    # Keep a single sentiment row per date; otherwise the left merge below
    # would emit one output row per duplicate and inflate the daily data.
    daily_sentiment = sentiment_data[['Date', 'sentiment_value']].drop_duplicates(subset='Date', keep='last')

    merged_data = pd.merge(aggregated_data, daily_sentiment, on='Date', how='left')

    trader_dates = set(aggregated_data['Date'])
    sentiment_dates = set(daily_sentiment['Date'])
    if not trader_dates.intersection(sentiment_dates):
        print("Warning: No overlapping dates between trader data and sentiment data. All sentiment values will be imputed as Neutral (3).")
    else:
        # Partial gaps are imputed too, so report how many days are affected.
        missing_days = int(merged_data['sentiment_value'].isna().sum())
        if missing_days:
            print(f"Warning: {missing_days} of {len(merged_data)} trading days have no usable sentiment value and will be imputed as Neutral (3).")

    merged_data['sentiment_value'] = merged_data['sentiment_value'].fillna(3)  # Default to Neutral

    return merged_data

In [15]:
def engineer_features(df):
    """Add one-row lag features and return the standardized feature matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'sentiment_value', 'avg_execution_price', 'total_fees',
        'trade_count' and 'total_buy_volume'. The lags are taken from the
        previous row, so rows are assumed to be in chronological order.
        The frame is not modified.

    Returns
    -------
    X_scaled : numpy.ndarray
        Features in the order given by ``features``, standardized with a
        ``StandardScaler`` fitted on all rows of ``df``.
    y : pandas.Series
        The 'total_buy_volume' column (target).
    features : list of str
        Column names matching the columns of ``X_scaled``.
    """
    features = ['sentiment_value', 'avg_execution_price', 'total_fees', 'trade_count', 'prev_day_volume', 'prev_day_price']
    target = 'total_buy_volume'

    # Work on a copy so the lag columns are not written into the caller's frame.
    df = df.copy()

    # The first row has no predecessor: volume falls back to 0, price to the
    # mean price over all rows.
    df['prev_day_volume'] = df['total_buy_volume'].shift(1).fillna(0)
    df['prev_day_price'] = df['avg_execution_price'].shift(1).fillna(df['avg_execution_price'].mean())

    X = df[features]
    y = df[target]

    # NOTE(review): the scaler is fitted before any train/test split happens
    # downstream, so test rows contribute to the scaling statistics.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    return X_scaled, y, features

Model selection

In [16]:
def train_and_evaluate_models(X, y, features, test_size=0.2, random_state=42, cv=5,
                              output_path='predictions.csv'):
    """Fit three regressors, report their metrics and save the best model's predictions.

    Linear Regression is fitted directly; Random Forest and Gradient Boosting
    are tuned with ``GridSearchCV`` on the training split.

    Parameters
    ----------
    X : array-like
        Feature matrix whose columns correspond to ``features``.
    y : array-like
        Target values.
    features : list of str
        Feature names, used to label the feature-importance tables.
    test_size : float, optional
        Fraction of rows held out by ``train_test_split``.
    random_state : int, optional
        Seed for the split and for the tree-based models.
    cv : int, optional
        Number of folds for the grid search and for ``cross_val_score``.
    output_path : str, optional
        CSV file receiving the 'Actual' and 'Predicted' columns of the model
        with the highest held-out R2.

    Returns
    -------
    results : dict
        ``{model name: {'MSE', 'R2', 'CV R2 Mean', 'CV R2 Std'}}``.
    best_model : estimator or None
        The fitted model with the highest R2 on the held-out split; ``None``
        if no model produced an R2 greater than negative infinity.

    Notes
    -----
    The cross-validation scores are computed on the full ``X`` / ``y``, which
    includes the held-out rows, and the best model is chosen by held-out R2
    rather than by the cross-validation score.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    models = {
        'Linear Regression': LinearRegression(),
        'Random Forest': RandomForestRegressor(random_state=random_state),
        'Gradient Boosting': GradientBoostingRegressor(random_state=random_state)
    }

    # Hyper-parameter grids by model name; a model without an entry is fitted as-is.
    param_grids = {
        'Random Forest': {
            'n_estimators': [100, 200],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5]
        },
        'Gradient Boosting': {
            'n_estimators': [100, 200],
            'learning_rate': [0.01, 0.1],
            'max_depth': [3, 5]
        }
    }

    results = {}
    best_model = None
    best_score = float('-inf')
    predictions = None

    for name, model in models.items():
        print(f"\nTraining {name}...")

        param_grid = param_grids.get(name)
        if param_grid is not None:
            grid_search = GridSearchCV(model, param_grid, cv=cv, scoring='r2', n_jobs=-1)
            grid_search.fit(X_train, y_train)
            # GridSearchCV refits the winning configuration on the whole
            # training split by default, so no further fit is needed.
            model = grid_search.best_estimator_
            print(f"Best parameters for {name}: {grid_search.best_params_}")
        else:
            model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        # cross_val_score clones the estimator, so the fitted model is untouched.
        cv_scores = cross_val_score(model, X, y, cv=cv, scoring='r2')

        results[name] = {
            'MSE': mse,
            'R2': r2,
            'CV R2 Mean': cv_scores.mean(),
            'CV R2 Std': cv_scores.std()
        }

        print(f"{name} - MSE: {mse:.2f}, R2: {r2:.4f}, CV R2 Mean: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

        # Only the tree-based models expose feature importances.
        if hasattr(model, 'feature_importances_'):
            importance = pd.DataFrame({
                'Feature': features,
                'Importance': model.feature_importances_
            }).sort_values(by='Importance', ascending=False)
            print(f"\nFeature Importance for {name}:\n{importance}")

        if r2 > best_score:
            best_score = r2
            best_model = model
            predictions = y_pred

    pd.DataFrame({
        'Actual': y_test,
        'Predicted': predictions
    }).to_csv(output_path, index=False)

    return results, best_model


if __name__ == "__main__":
    # Run the pipeline end to end: load -> feature engineering -> model comparison.
    data = load_and_clean_data()
    X, y, features = engineer_features(data)
    results, best_model = train_and_evaluate_models(X, y, features)

    # Recap the metrics collected for each model during training.
    print("\nSummary of Model Performance:")
    for name, metrics in results.items():
        summary_lines = [
            f"{name}:",
            f"  MSE: {metrics['MSE']:.2f}",
            f"  R2: {metrics['R2']:.4f}",
            f"  CV R2 Mean: {metrics['CV R2 Mean']:.4f} ± {metrics['CV R2 Std']:.4f}",
        ]
        print("\n".join(summary_lines))

    print("\nBest model saved predictions to 'predictions.csv'")


Training Linear Regression...
Linear Regression - MSE: 422141274615.43, R2: 0.9752, CV R2 Mean: -1.8309 ± 3.2578

Training Random Forest...
Best parameters for Random Forest: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 100}
Random Forest - MSE: 1733133719528.23, R2: 0.8980, CV R2 Mean: -2.0185 ± 5.3627

Feature Importance for Random Forest:
               Feature  Importance
2           total_fees    0.942594
3          trade_count    0.032790
1  avg_execution_price    0.009583
4      prev_day_volume    0.007958
0      sentiment_value    0.005371
5       prev_day_price    0.001703

Training Gradient Boosting...
Best parameters for Gradient Boosting: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
Gradient Boosting - MSE: 2287541342814.39, R2: 0.8654, CV R2 Mean: -0.9718 ± 2.1649

Feature Importance for Gradient Boosting:
               Feature  Importance
2           total_fees    0.937612
3          trade_count    0.032416
1  avg_execution_price    0.011808


In [17]:
# Offer the predictions file as a browser download when running on Google Colab.
# Outside Colab the google.colab package does not exist; the CSV is already on
# disk, so report its location instead of failing the cell.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; 'predictions.csv' is in the current working directory.")
else:
    files.download('predictions.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Model Performance:
### Linear Regression’s Success:
The high test-set R² (0.9752) suggests that total_buy_volume has a strong linear relationship with the features, particularly total_fees.

This makes Linear Regression a surprisingly effective model on the held-out split despite its simplicity.


### Overfitting Concerns:
Negative CV R² scores (e.g., -1.8309 for Linear Regression) indicate that none of the models generalizes across folds, likely because the dataset is small (possibly only a few days of data).

The large standard deviation of the CV R² for Random Forest (5.3627) shows that its performance is unstable from fold to fold.

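### High MSE:
The large MSE values (e.g., 422.14B for Linear Regression) are expressed in squared USD and reflect the scale of total_buy_volume (likely in millions or billions USD); the corresponding RMSE for Linear Regression is roughly $650K. Log-transforming the target could normalize this.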

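## Feature Importance:
### Total Fees Dominance:
The ~94% importance of total_fees in both Random Forest and Gradient Boosting indicates that fees are a proxy for trade size, as larger trades incur higher fees. This is expected in trading data. Note that total_fees is summed over the same day's trades as the target, so it is only known once the day's volume is known.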
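### Low Sentiment Impact:
The minimal importance of sentiment_value (~0.5%) is likely due to the 2018 sentiment data not aligning with 2024 trader data, so that unmatched days default to Neutral (3). Because a constant feature would receive exactly zero importance, the small non-zero value implies that at least some dates did receive a non-Neutral sentiment value.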
### Other Features:
trade_count (~3%) and avg_execution_price (~1%) have minor contributions, suggesting that trading frequency and price levels play secondary roles.

## Actionable Insights:
### Fee Optimization:
Traders should consolidate trades to reduce fees, especially during high-volume periods.
### Contrarian Opportunities:
The high buy volume on February 12, 2024 (a Fear day) suggests traders were buying aggressively, possibly anticipating a rebound. This could be a strategy to explore.
### Sentiment Data Fix:
Acquiring sentiment data that covers the 2024 trading dates is critical to assess its true impact on trading behavior.