In [1]:
# Block 1: Import Libraries
import os
import json
import pickle
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score, mean_squared_error


In [2]:
# Block 2: Define Paths for Data and Output

# Define the root path of your project on Google Drive
project_root = '../../'

# Define the path to your processed data
processed_dir = os.path.join(project_root, 'Data', 'Historical Reddit', 'ML_Data')

# Define the path to the golden dataset
golden_path = os.path.join(project_root, 'Data', 'Historical Reddit', 'golden_dataset_sentiment.csv')

# Define the base output directory structure within the project root
base_output_dir = os.path.join(project_root, 'outputs', 'sentiment_analysis', 'logistic_regression')
model_output_dir = os.path.join(base_output_dir, 'models')
results_output_dir = os.path.join(base_output_dir, 'results')   # For saving per-subreddit predictions
# evaluation_output_dir = os.path.join(base_output_dir, 'evaluations') # Optional: If you save evaluations

print(f"Project root (Google Drive): {project_root}")
print(f"Looking for data in: {processed_dir}")
print(f"Using golden dataset from: {golden_path}")
print(f"Saving outputs to: {base_output_dir}") # Base output path for Logistic Regression Sentiment

# Create all necessary output directories if they don't exist
os.makedirs(model_output_dir, exist_ok=True)
os.makedirs(results_output_dir, exist_ok=True)
# os.makedirs(evaluation_output_dir, exist_ok=True) # Uncomment if needed later


Project root (Google Drive): ../../
Looking for data in: ../../Data\Historical Reddit\ML_Data
Using golden dataset from: ../../Data\Historical Reddit\golden_dataset_sentiment_non_batch.csv
Saving outputs to: ../../outputs\sentiment_analysis\logistic_regression


In [3]:
# Block 3: Load and Merge Data from JSON Files and the Golden Dataset
dfs = []
for fname in os.listdir(processed_dir):
    if fname.startswith('ml_r_') and fname.endswith('.json'):
        file_path = os.path.join(processed_dir, fname)
        with open(file_path, 'r', encoding='utf-8') as f:
            data_json = json.load(f)
        df = pd.DataFrame(data_json)
        dfs.append(df)
all_posts = pd.concat(dfs, ignore_index=True)

# Load the golden dataset that contains sentiment labels
golden = pd.read_csv(golden_path)
data = all_posts.merge(golden[['id', 'sentiment']], on='id', how='inner')


In [4]:
# Block 4: Preprocess Data for Machine Learning
data['text'] = data['processed_tokens_ml'].apply(lambda toks: ' '.join(toks))
X = data['text']
y = data['sentiment']


In [5]:
# Block 5: Split Data into Training and Test Sets
# First split: 80% train+val, 20% test
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Second split: Split train+val into train and val
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, stratify=y_trainval, test_size=0.2, random_state=42
)


In [6]:
# Block 6: Tune ngram parameter
# Define parameter grid for ngram tuning
ngram_values = [(1,1), (1,2), (1,3), (2,2), (2,3)]

# Initialize best parameters
best_ngram = None # Initialize to None
best_f1 = -1.0 # Initialize to a value lower than any possible F1 score
best_mse = float('inf') # MSE is no longer the primary driver for selection

# Tune ngram parameter
for ngram in ngram_values:
    print(f"\nTrying ngram={ngram}")

    # Initialize vectorizer with current parameters
    vectorizer = TfidfVectorizer(
        max_features=10000,  # Keep your current max_features
        ngram_range=ngram,
        min_df=2,
        max_df=0.95,
        sublinear_tf=True
    )

    # Transform training data
    X_train_vec = vectorizer.fit_transform(X_train)

    # Train a simple model for evaluation
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train_vec, y_train)

    # Evaluate on validation set
    X_val_vec = vectorizer.transform(X_val)
    y_val_pred = clf.predict(X_val_vec)
    f1 = f1_score(y_val, y_val_pred, average='weighted')
    mse = mean_squared_error(y_val, y_val_pred) # Still calculate MSE for info

    print(f"Validation F1-score: {f1:.4f}")
    print(f"Validation MSE: {mse:.4f}") # Keep printing MSE for comparison

    # --- Select based purely on best F1 score ---
    if f1 > best_f1:
        best_f1 = f1
        best_mse = mse # Update mse too, just for reporting the corresponding MSE if needed
        best_ngram = ngram
        print(f"New best parameters! ngram={ngram}")

print(f"\nBest parameters based on Weighted F1:")
print(f"ngram: {best_ngram}")
print(f"Best Validation F1-score: {best_f1:.4f}")
print(f"Validation MSE for best F1: {best_mse:.4f}") # Optional: report corresponding MSE



Trying ngram=(1, 1)
Validation F1-score: 0.4959
Validation MSE: 1.0572
New best parameters! ngram=(1, 1)

Trying ngram=(1, 2)
Validation F1-score: 0.5004
Validation MSE: 1.0352
New best parameters! ngram=(1, 2)

Trying ngram=(1, 3)
Validation F1-score: 0.5034
Validation MSE: 1.0330
New best parameters! ngram=(1, 3)

Trying ngram=(2, 2)
Validation F1-score: 0.4002
Validation MSE: 1.4867

Trying ngram=(2, 3)
Validation F1-score: 0.4004
Validation MSE: 1.4933

Best parameters based on Weighted F1:
ngram: (1, 3)
Best Validation F1-score: 0.5034
Validation MSE for best F1: 1.0330


In [7]:
# Block 7: Vectorize Text Data and Train the Logistic Regression Classifier
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 3))
clf = LogisticRegression(max_iter=1000)
X_train_vec = vectorizer.fit_transform(X_train)
clf.fit(X_train_vec, y_train)


In [8]:
# Block 8: Save the Trained Model and Vectorizer
with open(os.path.join(model_output_dir, 'logreg_classifier.pkl'), 'wb') as f:
    pickle.dump(clf, f)

with open(os.path.join(model_output_dir, 'logreg_vectorizer.pkl'), 'wb') as f:
    pickle.dump(vectorizer, f)

print("\nModel and vectorizer saved successfully.")



Model and vectorizer saved successfully.


In [9]:
# Block 9: Evaluate the Model on the Test Set
X_test_vec = vectorizer.transform(X_test)
preds = clf.predict(X_test_vec)
print("\nLogistic Regression Sentiment Analysis Evaluation:")
print(classification_report(y_test, preds))
print("Confusion Matrix:")
print(confusion_matrix(y_test, preds))

# Add MSE evaluation
mse = mean_squared_error(y_test, preds)
print("\nMean Squared Error:", mse)



Logistic Regression Sentiment Analysis Evaluation:
              precision    recall  f1-score   support

           1       0.63      0.25      0.36       731
           2       0.53      0.67      0.59      2962
           3       0.48      0.35      0.41      1997
           4       0.51      0.63      0.56      3029
           5       0.57      0.30      0.40      1116

    accuracy                           0.52      9835
   macro avg       0.54      0.44      0.46      9835
weighted avg       0.52      0.52      0.50      9835

Confusion Matrix:
[[ 184  482    9   47    9]
 [  76 1975  340  530   41]
 [  11  585  695  684   22]
 [  14  576  346 1910  183]
 [   7  127   43  600  339]]

Mean Squared Error: 1.05510930350788


In [10]:
# # Block 10: Perform Inference on All Processed Files and Save Results
# for fname in os.listdir(processed_dir):
#     if fname.startswith('processed_r_') and fname.endswith('.json'):
#         file_path = os.path.join(processed_dir, fname)
#         with open(file_path, 'r', encoding='utf-8') as f:
#             df = pd.DataFrame(json.load(f))
#         if 'processed_tokens_ml' not in df.columns:
#             continue

#         texts = df['processed_tokens_ml'].apply(lambda toks: ' '.join(toks))
#         X_vec = vectorizer.transform(texts)
#         df['ml_sentiment'] = clf.predict(X_vec)
#         df['ml_score'] = clf.predict_proba(X_vec).max(axis=1)

#         out_file = fname.replace('.json', '_logreg_sentiment.csv')
#         out_path = os.path.join(results_output_dir, out_file)
#         df.to_csv(out_path, index=False)
#         print(f"Saved: {out_file} to {results_output_dir}")
