In [1]:
# Block 1: Import Libraries
import os
import json
import pickle
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
import mord
from sklearn.feature_selection import SelectKBest, chi2


In [2]:
# Block 2: Define Paths for Data and Output

# Define the root path of your project on Google Drive
project_root = '../../'

# Define the path to your processed data
processed_dir = os.path.join(project_root, 'Data', 'Historical Reddit', 'ML_Data')

# Define the path to the golden dataset
golden_path = os.path.join(project_root, 'Data', 'Historical Reddit', 'golden_dataset_sentiment.csv')

# Define the base output directory structure within the project root
base_output_dir = os.path.join(project_root, 'outputs', 'sentiment_analysis', 'ordinal_logistic_regression')
model_output_dir = os.path.join(base_output_dir, 'models')      # For saving the model and preprocessors
results_output_dir = os.path.join(base_output_dir, 'results')   # For saving per-subreddit predictions
# evaluation_output_dir = os.path.join(base_output_dir, 'evaluations') # Optional: If you save evaluations

print(f"Project root (Google Drive): {project_root}")
print(f"Looking for data in: {processed_dir}")
print(f"Using golden dataset from: {golden_path}")
print(f"Saving outputs to: {base_output_dir}") # Base output path for Ordinal Logistic Regression Sentiment

# Create all necessary output directories if they don't exist
os.makedirs(model_output_dir, exist_ok=True)
os.makedirs(results_output_dir, exist_ok=True)
# os.makedirs(evaluation_output_dir, exist_ok=True) # Uncomment if needed later


Project root (Google Drive): ../../
Looking for data in: ../../Data\Historical Reddit\ML_Data
Using golden dataset from: ../../Data\Historical Reddit\golden_dataset_sentiment_non_batch.csv
Saving outputs to: ../../outputs\sentiment_analysis\ordinal_logistic_regression


In [3]:
# Block 3: Load and Merge Data from JSON Files and the Golden Dataset
dfs = []
for fname in os.listdir(processed_dir):
    if fname.startswith('ml_r_') and fname.endswith('.json'):
        file_path = os.path.join(processed_dir, fname)
        with open(file_path, 'r', encoding='utf-8') as f:
            data_json = json.load(f)
        df = pd.DataFrame(data_json)
        dfs.append(df)
all_posts = pd.concat(dfs, ignore_index=True)

# Load the golden dataset that contains sentiment labels
golden = pd.read_csv(golden_path)
data = all_posts.merge(golden[['id', 'sentiment']], on='id', how='inner')

# Add basic text statistics as features
data['text_length'] = data['processed_tokens_ml'].apply(len)
data['unique_words'] = data['processed_tokens_ml'].apply(lambda x: len(set(x)))
data['avg_word_length'] = data['processed_tokens_ml'].apply(lambda x: np.mean([len(w) for w in x]) if x else 0)


In [4]:
# Block 4: Preprocess Data for Machine Learning
# Join tokens to form text for vectorization
data['text'] = data['processed_tokens_ml'].apply(lambda toks: ' '.join(toks))
X = data['text']
y = data['sentiment']

# Add numerical features
X_num = data[['text_length', 'unique_words', 'avg_word_length']].values


In [5]:
# Block 5: Split Data into Training, Validation, and Test Sets
# First split: 80% train+val, 20% test
X_trainval, X_test, X_num_trainval, X_num_test, y_trainval, y_test = train_test_split(
    X, X_num, y, stratify=y, test_size=0.2, random_state=42
)

# Second split: Split train+val into train and val
X_train, X_val, X_num_train, X_num_val, y_train, y_val = train_test_split(
    X_trainval, X_num_trainval, y_trainval, stratify=y_trainval, test_size=0.2, random_state=42
)

In [6]:
# Block 6a: Tune max_features and k parameters together (Focus on MSE)
# Define parameter grids for tuning
max_features_values = [10000, 15000, 20000]
k_values = [8000, 10000, 12000] # Adjust k based on max_features if needed

# Initialize best parameters
best_max_features = None
best_k = None
best_f1 = -1.0 # F1 is secondary
best_mse = float('inf') # Primary metric is MSE

print("--- Tuning max_features and k (using ngram=(1,2), alpha=1.0) ---")

# Tune max_features and k together
for max_features in max_features_values:
    for k in k_values:
        # Ensure k is not greater than max_features
        current_k = min(k, max_features)

        print(f"\nTrying max_features={max_features}, k={current_k}")

        # Initialize vectorizer with current parameters
        vectorizer = TfidfVectorizer(
            max_features=max_features,
            ngram_range=(1,2), # Keep ngram range fixed for now (based on previous best guess)
            min_df=2,
            max_df=0.95,
            sublinear_tf=True
        )

        # Transform training data
        X_train_vec = vectorizer.fit_transform(X_train)

        # Feature selection
        # Note: Need y_train which has original indices corresponding to X_train
        selector = SelectKBest(chi2, k=current_k)
        X_train_selected = selector.fit_transform(X_train_vec, y_train)

        # Train a simple model for evaluation (using a fixed alpha for this stage)
        model = mord.LogisticAT(alpha=1.0) # Use a default alpha, tune later
        model.fit(X_train_selected, y_train)

        # Evaluate on validation set
        # Note: Need y_val which has original indices corresponding to X_val
        X_val_vec = vectorizer.transform(X_val)
        X_val_selected = selector.transform(X_val_vec)
        y_val_pred = model.predict(X_val_selected)
        f1 = f1_score(y_val, y_val_pred, average='weighted') # Calculate F1 for info
        mse = mean_squared_error(y_val, y_val_pred)

        print(f"Validation F1-score: {f1:.4f}")
        print(f"Validation MSE: {mse:.4f}")

        # --- Select based purely on best MSE ---
        if mse < best_mse:
            best_f1 = f1 # Update F1 for reporting if needed
            best_mse = mse
            best_max_features = max_features
            best_k = current_k
            print(f"New best parameters! max_features={max_features}, k={current_k}")

print(f"\nBest parameters based on MSE (fixed alpha, ngram):")
print(f"max_features: {best_max_features}")
print(f"k: {best_k}")
print(f"Best Validation F1-score (corresponding): {best_f1:.4f}")
print(f"Best Validation MSE: {best_mse:.4f}")


--- Tuning max_features and k (using ngram=(1,3), alpha=1.0) ---

Trying max_features=10000, k=8000
Validation F1-score: 0.4511
Validation MSE: 0.8573
New best parameters! max_features=10000, k=8000

Trying max_features=10000, k=10000
Validation F1-score: 0.4504
Validation MSE: 0.8664

Trying max_features=10000, k=10000
Validation F1-score: 0.4504
Validation MSE: 0.8664

Trying max_features=15000, k=8000
Validation F1-score: 0.4503
Validation MSE: 0.8564
New best parameters! max_features=15000, k=8000

Trying max_features=15000, k=10000
Validation F1-score: 0.4506
Validation MSE: 0.8529
New best parameters! max_features=15000, k=10000

Trying max_features=15000, k=12000
Validation F1-score: 0.4504
Validation MSE: 0.8564

Trying max_features=20000, k=8000
Validation F1-score: 0.4515
Validation MSE: 0.8551

Trying max_features=20000, k=10000
Validation F1-score: 0.4505
Validation MSE: 0.8522
New best parameters! max_features=20000, k=10000

Trying max_features=20000, k=12000
Validation F

In [7]:
# --- Block 6b: Tune ngram parameter (using best max_features and k) ---
# Define parameter grid for ngram tuning
ngram_values = [(1,1), (1,2), (1,3)] # (2,2) and (2,3) performed poorly before

# Initialize best parameters (using results from 6a if available, else defaults)
best_ngram = None
# Use the best MSE found in 6a as the initial benchmark if available
best_mse_ngram = best_mse if 'best_mse' in locals() and best_mse != float('inf') else float('inf')
# best_f1_ngram = -1.0

print("\n--- Tuning ngram_range (using best max_features and k, alpha=1.0) ---")

# Use best parameters found in Block 6a (or defaults if tuning failed/skipped)
current_max_features = best_max_features if best_max_features is not None else 20000
current_k = best_k if best_k is not None else 10000

print(f"Using max_features={current_max_features}, k={current_k}")

# Tune ngram parameter
for ngram in ngram_values:
    print(f"\nTrying ngram={ngram}")

    # Initialize vectorizer with current parameters
    vectorizer_ngram = TfidfVectorizer(
        max_features=current_max_features,
        ngram_range=ngram,
        min_df=2,
        max_df=0.95,
        sublinear_tf=True
    )

    # Transform training data
    X_train_vec_ngram = vectorizer_ngram.fit_transform(X_train)

    # Feature selection with best k
    selector_ngram = SelectKBest(chi2, k=current_k)
    X_train_selected_ngram = selector_ngram.fit_transform(X_train_vec_ngram, y_train)

    # Train a simple model for evaluation (fixed alpha)
    model_ngram = mord.LogisticAT(alpha=1.0)
    model_ngram.fit(X_train_selected_ngram, y_train)

    # Evaluate on validation set
    X_val_vec_ngram = vectorizer_ngram.transform(X_val)
    X_val_selected_ngram = selector_ngram.transform(X_val_vec_ngram)
    y_val_pred_ngram = model_ngram.predict(X_val_selected_ngram)
    f1_ngram = f1_score(y_val, y_val_pred_ngram, average='weighted')
    mse_ngram = mean_squared_error(y_val, y_val_pred_ngram)

    print(f"Validation F1-score: {f1_ngram:.4f}")
    print(f"Validation MSE: {mse_ngram:.4f}")

    # --- Select based purely on best MSE ---
    if mse_ngram < best_mse_ngram:
        # best_f1_ngram = f1_ngram
        best_mse_ngram = mse_ngram
        best_ngram = ngram
        print(f"New best parameters! ngram={ngram}")

# Use the best ngram found, or default if tuning failed
final_ngram = best_ngram if best_ngram is not None else (1, 2)

print(f"\nBest ngram parameters based on MSE:")
print(f"ngram: {final_ngram}")
# print(f"Best Validation F1-score (corresponding): {best_f1_ngram:.4f}")
print(f"Best Validation MSE: {best_mse_ngram:.4f}")



--- Tuning ngram_range (using best max_features and k, alpha=1.0) ---
Using max_features=20000, k=12000

Trying ngram=(1, 1)
Validation F1-score: 0.4456
Validation MSE: 0.8699

Trying ngram=(1, 2)
Validation F1-score: 0.4507
Validation MSE: 0.8522

Trying ngram=(1, 3)
Validation F1-score: 0.4483
Validation MSE: 0.8490

Best ngram parameters based on MSE:
ngram: (1, 3)
Best Validation MSE: 0.8490


In [8]:
# Block 7: Prepare Data with FINAL Best TF-IDF/KBest/Ngram Parameters
print("\n--- Applying best TF-IDF, KBest, Ngram parameters found --- ")
print(f"Using: max_features={current_max_features}, k={current_k}, ngram_range={final_ngram}")

vectorizer = TfidfVectorizer(
    max_features=current_max_features,
    ngram_range=final_ngram,
    min_df=2,
    max_df=0.95,
    sublinear_tf=True
)
# Fit on training data ONLY
X_train_vec = vectorizer.fit_transform(X_train)

selector = SelectKBest(chi2, k=current_k)
# Fit selector on training data ONLY
X_train_selected = selector.fit_transform(X_train_vec, y_train)

# Transform validation data using FITTED vectorizer and selector
X_val_vec = vectorizer.transform(X_val)
X_val_selected = selector.transform(X_val_vec)

# Scale numerical features (Fit on train, transform train and val)
scaler = StandardScaler()
X_num_train_scaled = scaler.fit_transform(X_num_train)
X_num_val_scaled = scaler.transform(X_num_val)

# Combine features (No combined features used in Block 8's training loop)
# X_train_combined = hstack([X_train_selected, X_num_train_scaled])
# X_val_combined = hstack([X_val_selected, X_num_val_scaled])



--- Applying best TF-IDF, KBest, Ngram parameters found --- 
Using: max_features=20000, k=12000, ngram_range=(1, 3)


In [9]:
# Block 8: Hyperparameter Tuning for Ordinal Logistic Regression (alpha)
print("\n--- Tuning alpha using cross-validation on TF-IDF features ---")
# Note: Using ONLY the selected TF-IDF features (X_train_selected) for alpha tuning
# as combining with scaled numerical features wasn't used in the original alpha tuning logic.
# If you intended to use combined features, the loop below needs adjustment.

alpha_values = [0.01, 0.1, 1.0, 5.0, 10.0] # Adjusted range slightly
kf = KFold(n_splits=5, shuffle=True, random_state=42)
tuning_results = []

# Use KFold splits on the training data indices for alpha tuning
train_indices = list(range(len(X_train))) # Get indices for the original training set

for alpha in alpha_values:
    fold_acc = []
    fold_f1 = []
    fold_mse = []
    # Use KFold splits on the training set indices
    for fold_train_idx, fold_val_idx in kf.split(train_indices):
        # Select corresponding data from the *selected* training features
        X_fold_train = X_train_selected[fold_train_idx]
        X_fold_val = X_train_selected[fold_val_idx]
        # Select corresponding labels from y_train using original indices
        y_fold_train = y_train.iloc[fold_train_idx]
        y_fold_val = y_train.iloc[fold_val_idx]

        model = mord.LogisticAT(alpha=alpha)
        model.fit(X_fold_train, y_fold_train)
        y_val_pred = model.predict(X_fold_val) # Predict on the validation part of the fold

        fold_acc.append(accuracy_score(y_fold_val, y_val_pred))
        fold_f1.append(f1_score(y_fold_val, y_val_pred, average='weighted'))
        fold_mse.append(mean_squared_error(y_fold_val, y_val_pred))

    tuning_results.append({
        'alpha': alpha,
        'mean_accuracy': np.mean(fold_acc),
        'mean_f1': np.mean(fold_f1),
        'mean_mse': np.mean(fold_mse)
    })

# Print tuning results
for res in tuning_results:
    print(f"Alpha: {res['alpha']}, Mean Accuracy: {res['mean_accuracy']:.4f}, "
          f"Mean F1: {res['mean_f1']:.4f}, Mean MSE: {res['mean_mse']:.4f}")

# Select best alpha based on MSE from the cross-validation on the training set
best_alpha = min(tuning_results, key=lambda x: x['mean_mse'])['alpha']
print(f"\nBest alpha selected based on min MSE from training CV: {best_alpha}")

# --- Train final model using best_alpha on the full training set ---
print("\n--- Training final model on full training set with best alpha ---")
ord_model = mord.LogisticAT(alpha=best_alpha)
# Use the X_train_selected derived from the full training data with final vectorizer/selector
ord_model.fit(X_train_selected, y_train) # Train on the full X_train_selected



--- Tuning alpha using cross-validation on TF-IDF features ---
Alpha: 0.01, Mean Accuracy: 0.4248, Mean F1: 0.4252, Mean MSE: 1.1204
Alpha: 0.1, Mean Accuracy: 0.4549, Mean F1: 0.4533, Mean MSE: 0.9256
Alpha: 1.0, Mean Accuracy: 0.4672, Mean F1: 0.4550, Mean MSE: 0.8284
Alpha: 5.0, Mean Accuracy: 0.4404, Mean F1: 0.4133, Mean MSE: 0.8396
Alpha: 10.0, Mean Accuracy: 0.4151, Mean F1: 0.3842, Mean MSE: 0.8682

Best alpha selected based on min MSE from training CV: 1.0

--- Training final model on full training set with best alpha ---


In [10]:
# Block 9: Save the Trained Models and Preprocessors
with open(os.path.join(model_output_dir, 'ordinal_vectorizer.pkl'), 'wb') as f:
    pickle.dump(vectorizer, f)
with open(os.path.join(model_output_dir, 'ordinal_selector.pkl'), 'wb') as f:
    pickle.dump(selector, f)
# with open(os.path.join(model_output_dir, 'ordinal_scaler.pkl'), 'wb') as f:  # Scaler wasn't used in final combined features block
#     pickle.dump(scaler, f)
with open(os.path.join(model_output_dir, 'ordinal_classifier.pkl'), 'wb') as f:
    pickle.dump(ord_model, f)


In [11]:
# Block 10: Evaluate the Ordinal Logistic Regression Model on the Test Set
X_test_vec = vectorizer.transform(X_test)
X_test_selected = selector.transform(X_test_vec)
y_pred = ord_model.predict(X_test_selected)

print("\nOrdinal Logistic Regression Sentiment Analysis Evaluation:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Add MSE evaluation
mse = mean_squared_error(y_test, y_pred)
print(f"\nMean Squared Error: {mse:.4f}")

# Add per-class MSE
for i in range(1, 6):
    class_mask = y_test == i
    if any(class_mask):
        class_mse = mean_squared_error(y_test[class_mask], y_pred[class_mask])
        print(f"MSE for class {i}: {class_mse:.4f}")



Ordinal Logistic Regression Sentiment Analysis Evaluation:
              precision    recall  f1-score   support

           1       0.62      0.15      0.23       731
           2       0.56      0.56      0.56      2962
           3       0.31      0.46      0.37      1997
           4       0.51      0.59      0.55      3029
           5       0.57      0.17      0.26      1116

    accuracy                           0.47      9835
   macro avg       0.51      0.38      0.39      9835
weighted avg       0.50      0.47      0.46      9835


Confusion Matrix:
[[ 106  510  101   14    0]
 [  50 1644  951  317    0]
 [  10  452  921  606    8]
 [   4  285  829 1774  137]
 [   2   42  145  738  189]]

Mean Squared Error: 0.8448
MSE for class 1: 1.4227
MSE for class 2: 0.7660
MSE for class 3: 0.5658
MSE for class 4: 0.7072
MSE for class 5: 1.5484


In [12]:
# # Block 11: Perform Inference on All Processed Files and Save Results using Ordinal Logistic Regression
# for fname in os.listdir(processed_dir):
#     if fname.startswith('processed_r_') and fname.endswith('.json'):
#         file_path = os.path.join(processed_dir, fname)
#         with open(file_path, 'r', encoding='utf-8') as f:
#             df = pd.DataFrame(json.load(f))
#         if 'processed_tokens_ml' not in df.columns:
#             continue

#         # Prepare text features
#         texts = df['processed_tokens_ml'].apply(lambda toks: ' '.join(toks))
#         X_vec = vectorizer.transform(texts)
#         X_selected = selector.transform(X_vec)

#         # Get predictions
#         df['ordinal_sentiment'] = ord_model.predict(X_selected)

#         # Calculate prediction probabilities (if available)
#         try:
#             df['ordinal_sentiment_prob'] = np.max(ord_model.predict_proba(X_selected), axis=1)
#         except:
#             df['ordinal_sentiment_prob'] = None

#         # Save results
#         out_file = fname.replace('.json', '_ordinal_sentiment.csv')
#         out_path = os.path.join(results_output_dir, out_file)
#         df[['id', 'title', 'ordinal_sentiment']].to_csv(out_path, index=False) # Save relevant columns
#         print(f"Saved: {out_file} to {results_output_dir}")
