In [1]:
import pandas as pd
import numpy as np
import pickle
import joblib
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, GridSearchCV

In [2]:
# Load the precomputed LSA-transformed feature matrix from a pickle file.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# load artifacts produced by this project's own pipeline.
# Assumes the rows of X_lsa are in the same order as the rows of the CSV loaded
# below (they are paired with `y` by position) — TODO confirm.
with open('final_models/X_lsa.pkl', 'rb') as f:
    X_lsa = pickle.load(f)

In [3]:
# Read the cleaned (non-lemmatized) Twitter dataset; the path is relative to the
# notebook's working directory.
df = pd.read_csv('../datasets/cleaned_datasets/cleaned_twitter_dataset_without_lemmatization.csv')
# Target column as a NumPy array. Holds the same values as `y`, which is
# assigned in the next cell.
labels = df['target'].values

In [4]:
# Separate the raw text column and the regression target from the dataframe.
# X holds the raw text values; the model itself is trained on X_lsa, not on X.
X = df['text'].values
# y is the target passed to the grid search and indexed per fold.
y = df['target'].values

In [5]:
# Number of folds for cross-validation; used to build the KFold splitter and
# reported in the averaged-metric printout.
n_folds = 5

In [6]:
# Create the KFold splitter shared by the grid search and the per-fold evaluation.
# shuffle=True with a fixed random_state makes the shuffled splits reproducible
# across runs.
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

In [7]:
# Hyperparameter grid explored by the grid search: every combination of
# forest size and maximum tree depth (3 x 3 = 9 candidates).
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 20],
)

In [8]:
# Per-fold evaluation metrics (MSE, RMSE, R-squared); each starts as its own
# empty list and receives one entry per fold.
mse_scores, rmse_scores, r2_scores = [], [], []

In [9]:
# Base Random Forest regressor handed to GridSearchCV as the estimator template.
# The fixed random_state keeps the forest construction reproducible.
model = RandomForestRegressor(random_state=42)

In [10]:
# Grid search over `param_grid` using the shared KFold splitter; candidates are
# ranked by negated mean squared error (higher is better).
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=kf,
    scoring='neg_mean_squared_error',
)

In [None]:
# Run the grid search on the LSA-transformed features against the target y.
# This trains a forest for every parameter combination on every fold, so it is
# the expensive cell of the notebook.
grid_search.fit(X_lsa, y)

In [None]:
# Get the best estimator found by the grid search.
# This is a reference to the object held by grid_search, not a copy, so fitting
# best_model again also changes grid_search.best_estimator_.
best_model = grid_search.best_estimator_

In [None]:
# Report the hyperparameter combination that achieved the best mean CV score
print(f"Best hyperparameters: {grid_search.best_params_}")

In [None]:
# Evaluate the tuned hyperparameters fold by fold with the shared KFold splitter.
# The metric lists are reset here so that re-running this cell does not append
# duplicate scores and skew the averages.
mse_scores, rmse_scores, r2_scores = [], [], []

for fold, (train_index, test_index) in enumerate(kf.split(X_lsa)):

    # Slice the LSA features and the targets for the current fold
    X_train_lsa, X_test_lsa = X_lsa[train_index], X_lsa[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Fit a fresh, unfitted estimator carrying the tuned hyperparameters.
    # Calling best_model.fit(...) here would overwrite the estimator that
    # GridSearchCV refit on the full dataset with one trained only on this
    # fold's training subset.
    fold_model = RandomForestRegressor(**best_model.get_params())
    fold_model.fit(X_train_lsa, y_train)

    # Predict on the held-out part of the fold
    y_pred = fold_model.predict(X_test_lsa)

    # Calculate evaluation metrics for the current fold
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    # Record the metrics so they can be averaged afterwards
    mse_scores.append(mse)
    rmse_scores.append(rmse)
    r2_scores.append(r2)

    print(f"Fold {fold+1}: MSE = {mse:.4f}, RMSE = {rmse:.4f}, R-squared = {r2:.4f}")

In [None]:
# Summarise cross-validation performance as the mean of each per-fold metric
avg_mse = np.mean(mse_scores)
avg_rmse = np.mean(rmse_scores)
avg_r2 = np.mean(r2_scores)

print(f"Average MSE over {n_folds} folds = {avg_mse:.4f}")
print(f"Average RMSE over {n_folds} folds = {avg_rmse:.4f}")
print(f"Average R-squared over {n_folds} folds = {avg_r2:.4f}")

In [None]:
# Persist the tuned, fitted estimator selected by the grid search.
# `model` is only the template passed to GridSearchCV, which clones it and never
# fits the original object, so dumping `model` would save an untrained forest
# that cannot make predictions.
joblib.dump(grid_search.best_estimator_, 'final_models/lsa_random_forest_regression_model.joblib')