In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [2]:
# 1. Load Dataset
df = pd.read_csv('../dataset/crypto_sentiment_prediction_dataset.csv')

In [3]:
# 1. Feature Engineering: Create Target Variable
# Target = 1 if Price Change > 0 (Bullish), else 0 (Bearish)
df['target'] = (df['price_change_24h_percent'] > 0).astype(int)

# 2. Select Features and Drop Leakage/Redundant Columns
X = df.drop(columns=['timestamp', 'price_change_24h_percent', 
                     'prediction_confidence', 'target'])
y = df['target']

# 3. Define Preprocessing Pipeline
# Scale numerical features and One-Hot Encode categorical features
categorical_features = ['cryptocurrency']
numeric_features = [col for col in X.columns if col not in categorical_features]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

In [13]:
# Safely compute Pearson correlations using numeric columns only
num_df = df.select_dtypes(include=[np.number])
correlations = num_df.corr()['price_change_24h_percent'].sort_values(ascending=False)

# Debug: show available correlation keys
print("Correlation keys:", list(correlations.index))

# Safely print specific sentiment correlations
for key in ['social_sentiment_score', 'fear_greed_index']:
    val = correlations.get(key)
    if val is None or pd.isna(val):
        print(f"{key}: missing or non-numeric (NaN)")
    else:
        # format float safely
        print(f"{key} Correlation: {val:.4f}")

Correlation keys: ['price_change_24h_percent', 'target', 'market_cap_usd', 'current_price_usd', 'rsi_technical_indicator', 'volatility_index', 'news_sentiment_score', 'social_sentiment_score', 'prediction_confidence', 'trading_volume_24h', 'news_impact_score', 'fear_greed_index', 'social_mentions_count']
social_sentiment_score Correlation: 0.0106
fear_greed_index Correlation: -0.0152


In [5]:
# 5. Train-Test Split (80/20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [6]:
# Define the four baseline models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'XGBoost': GradientBoostingClassifier(random_state=42),
    'SVM': SVC(random_state=42)
}

# Training Loop using the Pipeline
for name, model in models.items():
    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', model)])
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(f"{name} Accuracy: {accuracy_score(y_test, y_pred):.4f}")

Logistic Regression Accuracy: 0.4964
Random Forest Accuracy: 0.4964
XGBoost Accuracy: 0.4673
SVM Accuracy: 0.4576


In [7]:
# 6. Training Loop & Initial Evaluation
print("--- Baseline Model Results ---")
for name, model in models.items():
    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', model)])
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(f"\nModel: {name}")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")

--- Baseline Model Results ---

Model: Logistic Regression
Accuracy: 0.4964

Model: Random Forest
Accuracy: 0.4964

Model: XGBoost
Accuracy: 0.4673

Model: SVM
Accuracy: 0.4576


In [14]:
# Define hyperparameter grids for all models
# Note: 'classifier__' prefix is required to access model params inside the Pipeline
param_grids = {
    'Logistic Regression': {
        'classifier__C': [0.01, 0.1, 1, 10, 100],
        'classifier__solver': ['liblinear']  # Good for small datasets
    },
    'Random Forest': {
        'classifier__n_estimators': [50, 100, 200],
        'classifier__max_depth': [None, 10, 20],
        'classifier__min_samples_split': [2, 5]
    },
    'XGBoost': {
        'classifier__n_estimators': [50, 100],
        'classifier__learning_rate': [0.01, 0.1, 0.2],
        'classifier__max_depth': [3, 5, 7]
    },
    'SVM': {
        'classifier__C': [0.1, 1, 10],
        'classifier__kernel': ['rbf', 'linear'],
        'classifier__gamma': ['scale', 'auto']
    }
}

print("--- Comprehensive Hyperparameter Tuning ---")
best_models = {}

# Loop through each model and perform GridSearchCV
for name, model in models.items():
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', model)])
    
    # Skip tuning if model not in our grid definition (safety check)
    if name in param_grids:
        print(f"\nTuning {name}...")
        grid = GridSearchCV(pipeline, param_grids[name], cv=3, scoring='accuracy', n_jobs=-1)
        grid.fit(X_train, y_train)
        
        best_score = grid.best_score_
        best_params = grid.best_params_
        test_acc = accuracy_score(y_test, grid.best_estimator_.predict(X_test))
        
        best_models[name] = {'test_accuracy': test_acc, 'best_params': best_params}
        print(f"Best CV Score: {best_score:.4f}")
        print(f"Test Set Accuracy: {test_acc:.4f}")
        print(f"Best Params: {best_params}")

--- Comprehensive Hyperparameter Tuning ---

Tuning Logistic Regression...
Best CV Score: 0.5067
Test Set Accuracy: 0.4939
Best Params: {'classifier__C': 0.01, 'classifier__solver': 'liblinear'}

Tuning Random Forest...
Best CV Score: 0.5055
Test Set Accuracy: 0.5400
Best Params: {'classifier__max_depth': 20, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 50}

Tuning XGBoost...
Best CV Score: 0.5133
Test Set Accuracy: 0.4673
Best Params: {'classifier__learning_rate': 0.1, 'classifier__max_depth': 3, 'classifier__n_estimators': 100}

Tuning SVM...
Best CV Score: 0.5152
Test Set Accuracy: 0.4625
Best Params: {'classifier__C': 10, 'classifier__gamma': 'auto', 'classifier__kernel': 'rbf'}
