In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error
from scipy.sparse import hstack
import numpy as np

# Load dataset
df = pd.read_csv('reddit_historical_data.csv')

# Drop irrelevant columns
df_clean = df.drop(columns=["Post_ID", "Platform", "Hashtag", "Created_At", "Text"])

# Fill missing values
df_clean.fillna("Unknown", inplace=True)

# Encode categorical variables
label_encoders = {}
for col in ["Content_Type", "Subreddit", "Keyword"]:
    le = LabelEncoder()
    df_clean[col] = le.fit_transform(df_clean[col])
    label_encoders[col] = le

# Encode target variable
le_target = LabelEncoder()
df_clean["Engagement_Level"] = le_target.fit_transform(df_clean["Engagement_Level"])

# TF-IDF vectorization for Title
tfidf = TfidfVectorizer(max_features=300, stop_words='english', ngram_range=(1,2))
title_tfidf = tfidf.fit_transform(df_clean["Title"])

df_clean.drop(columns=["Title"], inplace=True)

# Split data into train and test sets
X = df_clean.drop(columns=["Engagement_Level"])
y = df_clean["Engagement_Level"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Apply TF-IDF transformation to train and test sets
X_train_tfidf = tfidf.transform(df.loc[X_train.index, "Title"])
X_test_tfidf = tfidf.transform(df.loc[X_test.index, "Title"])

# Combine TF-IDF features with numerical features
X_train_combined = hstack([X_train, X_train_tfidf])
X_test_combined = hstack([X_test, X_test_tfidf])

# Standardize features
scaler = StandardScaler(with_mean=False)
X_train_scaled = scaler.fit_transform(X_train_combined)
X_test_scaled = scaler.transform(X_test_combined)

# Apply PCA for dimensionality reduction
n_components = min(100, X_train_scaled.shape[1])  # Limit to 100 or total available features
pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Train RandomForestClassifier on PCA-transformed data
rf_model = RandomForestClassifier(n_estimators=200, max_depth=20, random_state=42)
rf_model.fit(X_train_pca, y_train)

# Predict on the test set
y_pred = rf_model.predict(X_test_pca)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred, target_names=le_target.classes_)

print("Accuracy:", accuracy)
print("Mean Squared Error:", mse)
print("Classification Report:\n", classification_rep)

# Predict future engagement levels (random example for demonstration)
future_data = np.random.rand(1, X_train_scaled.shape[1])  # Generate random data
future_data_pca = pca.transform(future_data)
future_prediction = rf_model.predict(future_data_pca)
print("Predicted future engagement level:", le_target.inverse_transform(future_prediction))


Accuracy: 0.9811422413793104
Mean Squared Error: 0.023706896551724137
Classification Report:
               precision    recall  f1-score   support

        High       1.00      0.99      1.00       616
         Low       0.98      0.99      0.98      2052
      Medium       0.98      0.95      0.97      1044

    accuracy                           0.98      3712
   macro avg       0.98      0.98      0.98      3712
weighted avg       0.98      0.98      0.98      3712

Predicted future engagement level: ['Medium']
