# Model Training

In [1]:
import os

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the CSV into a DataFrame. The path is relative, so it resolves against
# the kernel's current working directory.
data = pd.read_csv('../Data/stock_sentiment_combined_data.csv')

In [3]:
# Columns removed before modelling. `errors='ignore'` lets the drop succeed
# even when some of these names are not present in the loaded frame.
columns_to_drop = [
    'title', 'id', 'author', 'score', 'upvote_ratio', 'num_comments',
    'url', 'selftext', 'sentiment', 'sentiment_label', 'tickers', 'ticker_',
]
data = data.drop(columns_to_drop, axis=1, errors='ignore')

In [4]:
# Sanity check: report the (rows, columns) shape of `data` after the column drop.
print('Shape of data after dropping irrelevant columns : ', data.shape)

Shape of data after dropping irrelevant columns :  (6932, 24)


In [5]:
# Impute missing values with each column's mean.
# - `numeric_only=True` keeps the mean computation from failing if a
#   non-numeric column is still present (the earlier drop ignores absent names,
#   so the surviving column set is not guaranteed).
# - Reassigning instead of `inplace=True` avoids mutating the frame in place.
# NOTE(review): the means are computed over all rows, i.e. before any
# train/test split is made.
data = data.fillna(data.mean(numeric_only=True))
print('Remaining missing values per column :\n',data.isnull().sum())


Remaining missing values per column :
 Adj Close_AI      0
Close_AI          0
High_AI           0
Low_AI            0
Open_AI           0
Volume_AI         0
Adj Close_MSTR    0
Close_MSTR        0
High_MSTR         0
Low_MSTR          0
Open_MSTR         0
Volume_MSTR       0
Adj Close_NVDA    0
Close_NVDA        0
High_NVDA         0
Low_NVDA          0
Open_NVDA         0
Volume_NVDA       0
Adj Close_VOO     0
Close_VOO         0
High_VOO          0
Low_VOO           0
Open_VOO          0
Volume_VOO        0
dtype: int64


In [6]:
# Show the column names that remain in `data`.
print('Columns in the dataset : ',data.columns)

Columns in the dataset :  Index(['Adj Close_AI', 'Close_AI', 'High_AI', 'Low_AI', 'Open_AI', 'Volume_AI',
       'Adj Close_MSTR', 'Close_MSTR', 'High_MSTR', 'Low_MSTR', 'Open_MSTR',
       'Volume_MSTR', 'Adj Close_NVDA', 'Close_NVDA', 'High_NVDA', 'Low_NVDA',
       'Open_NVDA', 'Volume_NVDA', 'Adj Close_VOO', 'Close_VOO', 'High_VOO',
       'Low_VOO', 'Open_VOO', 'Volume_VOO'],
      dtype='object')


In [7]:
# Target label: 'up' when Close_AI is strictly greater than Open_AI on the same
# row, otherwise 'down' (ties count as 'down'). The comparison is vectorised
# over the whole column instead of calling a Python lambda once per row.
# NOTE(review): the label is derived from same-row Close_AI / Open_AI, and
# Open_AI plus other same-row *_AI price columns are used as model features.
# That is a likely source of target leakage given the perfect test scores this
# notebook reports - confirm whether a next-period target was intended.
data['Movement'] = (data['Close_AI'] > data['Open_AI']).map({True: 'up', False: 'down'})

In [8]:
# Model inputs: the same five fields for each of the four tickers, ordered
# ticker by ticker. The Close_* columns are not part of the feature set.
features = [
    f'{field}_{ticker}'
    for ticker in ('AI', 'MSTR', 'NVDA', 'VOO')
    for field in ('Adj Close', 'High', 'Low', 'Open', 'Volume')
]

In [9]:
# Fail fast if any expected feature column is absent from `data`.
# Raising an exception stops execution at this cell with a clear error;
# calling `exit()` would instead end the interpreter/kernel session.
missing_columns = [col for col in features if col not in data.columns]
if missing_columns:
    raise KeyError(f'Missing required columns : {missing_columns}')

# Feature matrix (columns in `features` order) and the 'up'/'down' target.
X = data[features]
y = data['Movement']

In [10]:
# Sanity check: X and y must have the same number of rows.
print('Shape of features (X) :',X.shape)
print('Shape of target (y) : ',y.shape)

Shape of features (X) : (6932, 20)
Shape of target (y) :  (6932,)


In [11]:
# Hold out 20% of the rows first, then fit the scaler on the training rows
# only, so that test-set statistics never influence the transform that is
# applied to either split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix under the same train-fitted scaling; the later
# cross-validation cell reads `X_scaled`.
X_scaled = scaler.transform(X)

In [13]:
# Exhaustive search over 3 x 3 x 3 x 3 = 81 random-forest settings, each scored
# by 5-fold cross-validated accuracy on the training split, run on one worker.
param_grid = dict(
    n_estimators=[100, 150, 200],
    max_depth=[10, 15, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=1,
)
grid_search.fit(X_train, y_train)

  _data = np.array(data, dtype=dtype, copy=copy,


In [14]:
# Keep the estimator the grid search selected as best, and show the
# hyperparameter combination it corresponds to.
best_model = grid_search.best_estimator_
print('Best Hyperparameters : ', grid_search.best_params_)

Best Hyperparameters :  {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}


In [15]:
# Predict labels for the held-out test rows with the selected model.
y_pred = best_model.predict(X_test)

In [16]:
# Test-set metrics. Precision and recall are computed with 'up' as the
# positive class.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
precision = precision_score(y_true=y_test, y_pred=y_pred, pos_label='up')
recall = recall_score(y_true=y_test, y_pred=y_pred, pos_label='up')

In [17]:
# Print the three test-set metrics computed in the previous cell.
print('Model Performance')
print(f'Accuracy : {accuracy}')
print(f'Precision : {precision}')
print(f'Recall : {recall}')

Model Performance
Accuracy : 1.0
Precision : 1.0
Recall : 1.0


In [18]:
# Per-class precision, recall, F1 and support for the test-set predictions.
print('Classification Report')
print(classification_report(y_test, y_pred))

Classification Report
              precision    recall  f1-score   support

        down       1.00      1.00      1.00       182
          up       1.00      1.00      1.00      1205

    accuracy                           1.00      1387
   macro avg       1.00      1.00      1.00      1387
weighted avg       1.00      1.00      1.00      1387



In [19]:
# Confusion matrix for the test set; in scikit-learn's convention rows are the
# true labels and columns are the predicted labels.
print('Confusion Matrix')
print(confusion_matrix(y_test, y_pred))

Confusion Matrix
[[ 182    0]
 [   0 1205]]


In [20]:
# 5-fold cross-validated accuracy of the selected configuration over every row
# in X_scaled (training and held-out rows alike).
# NOTE(review): X_scaled is scaled before the folds are formed, so scaling
# statistics are not recomputed inside each fold.
cv_scores = cross_val_score(estimator=best_model, X=X_scaled, y=y, scoring='accuracy', cv=5)
print('Cross-Validated Accuracy : ',cv_scores.mean())

Cross-Validated Accuracy :  0.9819754866618602


In [21]:
# Persist the selected model and the fitted scaler. The target directory is
# created first so that a missing folder cannot make the dump fail after the
# (expensive) training cells have already run.
os.makedirs('../Models', exist_ok=True)
joblib.dump(best_model, '../Models/stock_movement_classifier.pkl')
joblib.dump(scaler, '../Models/scaler.pkl')
print('Model and Scaler have been saved in the Models Folder')

Model and Scaler have been saved in the Models Folder
