In [1]:
# Block 1: Import Libraries
import os
import json
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix, f1_score, mean_squared_error


In [2]:
# Block 2: Define Paths for Data and Output

# Project root, given relative to the notebook's working directory (two levels up).
# Every other path in this cell is derived from it.
project_root = '../../'

# Directory holding the processed lexicon token files (the JSON files scanned in Block 5)
processed_dir = os.path.join(project_root, 'Data', 'Historical Reddit', 'Lexicon_Data')

# CSV with the golden dataset (sentiment labels used for calibration and evaluation)
golden_path = os.path.join(project_root, 'Data', 'Historical Reddit', 'golden_dataset_sentiment.csv')

# Output directory tree for the lexicon-based sentiment analysis
base_output_dir = os.path.join(project_root, 'outputs', 'sentiment_analysis', 'lexicon_based')
results_output_dir = os.path.join(base_output_dir, 'results') # Target for per-subreddit result CSVs
# evaluation_output_dir = os.path.join(base_output_dir, 'evaluations') # Optional: If you save evaluations later

# Echo the resolved locations so a wrong working directory is easy to spot in the cell output
print(f"Project root: {project_root}")
print(f"Looking for lexicon data in: {processed_dir}")
print(f"Using golden dataset from: {golden_path}")
print(f"Saving outputs to: {base_output_dir}")

# Create the results directory (including missing parents); exist_ok makes re-running this cell safe
os.makedirs(results_output_dir, exist_ok=True)
# os.makedirs(evaluation_output_dir, exist_ok=True) # Uncomment if needed later


Project root: ../../
Looking for lexicon data in: ../../Data\Historical Reddit\Lexicon_Data
Using golden dataset from: ../../Data\Historical Reddit\golden_dataset_sentiment.csv
Saving outputs to: ../../outputs\sentiment_analysis\lexicon_based


In [3]:
# Block 3: Load the Loughran-McDonald Master Dictionary and Prepare Lexicons
lexicon_path = os.path.join(project_root, 'Documents', 'Dictionaries', 'Loughran-McDonald_MasterDictionary_1993-2024.csv')
lexicon_df = pd.read_csv(lexicon_path)

# Binary sentiment flags: 1 when the 'Negative' / 'Positive' column holds a value
# greater than zero, 0 otherwise. Missing values compare False against 0 and so
# become 0. Vectorised instead of a per-row lambda; non-numeric cells still raise
# ValueError, as float() did before.
lexicon_df['Negative_flag'] = pd.to_numeric(lexicon_df['Negative']).gt(0).astype('int64')
lexicon_df['Positive_flag'] = pd.to_numeric(lexicon_df['Positive']).gt(0).astype('int64')

# Build sets of negative and positive words (lowercase for consistency).
# dropna() keeps a missing 'Word' cell from landing in the sets as a float NaN:
# read_csv's default NA parsing converts literal entries such as "NULL" or "NA"
# into missing values.
# NOTE(review): whether this dictionary file contains such rows is not verified here.
negative_words = set(lexicon_df.loc[lexicon_df['Negative_flag'] == 1, 'Word'].dropna().str.lower())
positive_words = set(lexicon_df.loc[lexicon_df['Positive_flag'] == 1, 'Word'].dropna().str.lower())

# Community jargon added on top of the dictionary. Multi-word expressions are
# written with underscores.
# NOTE(review): this assumes the tokenizer emits them in that joined form — confirm.
positive_jargons = [
    # 🚀 Bullish Sentiment / Positive
    "diamond_hands", "hodl", "hodler", "wagmi", "ape_in", "btfd", "buy_the_dip",
    "go_long", "long_it", "going_long", "bull_run", "bullish", "long_call",
    "long_position", "long_stock", "to_the_moon", "moonshot", "ath", "stonks",
    "dca", "flippening", "wen_lambo", "bear_trap"
]
# Backward-compatible alias for the original, misspelled variable name.
postive_jargons = positive_jargons

negative_jargons = [
    # 😬 Bearish Sentiment / Negative
    "paper_hands", "bagholder", "ngmi", "rekt", "gn", "bearish", "long_put",
    "exit_scam", "shill", "rug_pull", "pump_and_dump", "margin_call", "liquidated",
    "overleveraged", "capitulate", "bull_trap", "short_position", "short_sell",
    "shorting", "shorts", "shorted", "short_interest", "short_call", "short_it",
    "short_selling"
]

positive_words.update(positive_jargons)
negative_words.update(negative_jargons)

print("Number of negative words:", len(negative_words))
print("Number of positive words:", len(positive_words))


Number of negative words: 2369
Number of positive words: 370


In [4]:
# Block 4: Define Functions for Lexicon-Based Sentiment Analysis with Calibration

# Tokens that flip the polarity of a nearby sentiment word.
# NOTE(review): the contracted forms only match if tokenization keeps the apostrophe — confirm.
negation_words = {
    # plain negators
    "not", "no", "never", "without", "against", "cannot",
    # contracted auxiliaries
    "don't", "doesn't", "didn't",
    "isn't", "wasn't",
    "haven't", "hasn't", "hadn't",
    "won't", "wouldn't", "shan't", "shouldn't", "couldn't",
}

def raw_lexicon_score(tokens, pos_set, neg_set):
    """
    Plain lexicon score: number of positive hits minus number of negative hits.

    Tokens are lower-cased before the membership test, so the sets are expected
    to hold lower-case entries. A token present in both sets counts once on
    each side and therefore contributes zero.
    """
    positive_hits = len([word for word in tokens if word.lower() in pos_set])
    negative_hits = len([word for word in tokens if word.lower() in neg_set])
    return positive_hits - negative_hits

def enhanced_raw_lexicon_score(tokens, pos_set, neg_set, negation_words, negation_window=3):
    """
    Calculates a negation-aware raw sentiment score for a token sequence.

    Each token found in pos_set contributes +1 and each token found in neg_set
    contributes -1 (pos_set is checked first, so a token in both sets counts
    as positive). The contribution is inverted when an odd number of negation
    words occurs among the `negation_window` tokens immediately preceding it;
    an even number cancels out (simple double-negative handling).

    Args:
        tokens: Sequence of string tokens. Matching is case-insensitive.
        pos_set: Lower-case positive words.
        neg_set: Lower-case negative words.
        negation_words: Lower-case words that flip polarity.
        negation_window: How many preceding tokens to inspect for negation.
            Defaults to 3; zero or a negative value disables negation handling.

    Returns:
        int: Sum of the (possibly inverted) contributions; 0 for no tokens.
    """
    # Lower-case once up front: every token is otherwise re-lowered for each
    # later token whose look-back window covers it.
    lowered = [token.lower() for token in tokens]

    score = 0
    for i, word in enumerate(lowered):
        if word in pos_set:
            polarity = 1
        elif word in neg_set:
            polarity = -1
        else:
            # Non-sentiment tokens never change the score, so the look-back
            # scan is skipped for them.
            continue

        # Window covers indices max(0, i - negation_window) .. i-1; an empty
        # slice (window <= 0, or first token) means "no negation".
        preceding = lowered[max(0, i - negation_window):i]
        negation_count = sum(1 for prior in preceding if prior in negation_words)

        # Odd number of negators flips the polarity, even number restores it.
        score += -polarity if negation_count % 2 else polarity

    return score

def lexicon_sentiment(tokens, pos_set, neg_set, negation_words, min_score, max_score):
    """
    Turn a token list into a sentiment rating plus its raw lexicon score.

    The negation-aware raw score from enhanced_raw_lexicon_score is limited to
    the calibration range [min_score, max_score], rescaled to 0..1 and spread
    over five levels, giving a rating of 1-5 when min_score <= max_score.

    Returns:
        tuple: (rating, raw_score), where raw_score is the unclamped score.
    """
    raw_score = enhanced_raw_lexicon_score(tokens, pos_set, neg_set, negation_words)

    # Clamp into the calibration range: cap at max_score, then lift to min_score.
    bounded = raw_score
    if max_score < bounded:
        bounded = max_score
    if min_score > bounded:
        bounded = min_score

    if max_score == min_score:
        # Degenerate range: avoid dividing by zero and use the midpoint.
        fraction = 0.5
    else:
        fraction = (bounded - min_score) / (max_score - min_score)

    # Spread 0..1 over the rating levels. Note that round() sends exact .5 ties
    # to the nearest even integer.
    rating = 1 + round(fraction * 4)
    return rating, raw_score


In [5]:
# Block 5: Evaluation on Golden Dataset and Tuning Calibration Hyperparameters

# Load the golden dataset with 5-class sentiment labels
golden = pd.read_csv(golden_path)

# Load and combine all relevant processed JSON files.
# sorted() gives a reproducible load (and row) order; os.listdir returns
# directory entries in arbitrary order.
dfs = []
for fname in sorted(os.listdir(processed_dir)):
    if not (fname.startswith('lexicon_r_') and fname.endswith('.json')):
        continue
    file_path = os.path.join(processed_dir, fname)
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = json.load(f)
        if not (isinstance(content, list) and len(content) > 0):
            print(f"Skipping {fname}: expected a non-empty JSON list of records")
            continue
        df = pd.DataFrame(content)
    except Exception as exc:
        # Loading stays best-effort (one bad file must not abort the run), but
        # the skip is reported instead of being silently swallowed.
        print(f"Skipping {fname}: could not be loaded ({type(exc).__name__}: {exc})")
        continue

    missing_cols = {'id', 'processed_tokens_lexicon'} - set(df.columns)
    if missing_cols:
        print(f"Skipping {fname}: missing column(s) {sorted(missing_cols)}")
        continue
    dfs.append(df)

if not dfs:
    raise ValueError("No valid processed dataframes found.")

all_posts = pd.concat(dfs, ignore_index=True)
# Compare ids as strings on both sides so the merge keys share one dtype.
all_posts['id'] = all_posts['id'].astype(str)
golden['id'] = golden['id'].astype(str)

# Merge with golden sentiment labels (inner join: only labelled posts are kept)
data = all_posts.merge(golden[['id', 'sentiment']], on='id', how='inner')
if data.empty:
    # Quantile-based calibration and the metrics below are meaningless on zero rows.
    raise ValueError("No posts matched the golden dataset ids.")

# Anything that is not a list (e.g. a missing value) is treated as "no tokens".
data['processed_tokens_lexicon'] = data['processed_tokens_lexicon'].apply(lambda x: x if isinstance(x, list) else [])

# Compute raw lexicon score using enhanced scoring function
data['raw_score'] = data['processed_tokens_lexicon'].apply(
    lambda toks: enhanced_raw_lexicon_score(toks, positive_words, negative_words, negation_words)
)

def tune_calibration_percentiles(data, pos_set, neg_set, negation_words, lowers, uppers):
    """
    Exhaustively try (lower, upper) percentile pairs and keep the best one.

    For every pair with lower < upper, the matching quantiles of
    data['raw_score'] become the calibration range, each row of
    data['processed_tokens_lexicon'] is rated with lexicon_sentiment, and the
    ratings are compared with data['sentiment'] using the weighted F1 score.
    Ties keep the pair that was tried first.

    Returns:
        tuple: (best_lower, best_upper, best_f1). The percentiles are None and
        the score is -1 when no pair was accepted.
    """
    top_lower, top_upper, top_f1 = None, None, -1

    for lo_pct in lowers:
        for hi_pct in uppers:
            # Only pairs that describe a non-empty percentile range are scored.
            if lo_pct >= hi_pct:
                continue

            floor_score = data['raw_score'].quantile(lo_pct / 100)
            ceiling_score = data['raw_score'].quantile(hi_pct / 100)

            predicted = data['processed_tokens_lexicon'].apply(
                lambda toks: lexicon_sentiment(
                    toks, pos_set, neg_set, negation_words, floor_score, ceiling_score
                )[0]
            )
            candidate_f1 = f1_score(data['sentiment'], predicted, average='weighted')

            # Strict comparison: an equal score does not replace the incumbent.
            if candidate_f1 > top_f1:
                top_lower, top_upper, top_f1 = lo_pct, hi_pct, candidate_f1

    return top_lower, top_upper, top_f1

# Percentile grids searched for the lower / upper calibration bounds
candidate_lowers = [1, 3, 5, 7, 10]
candidate_uppers = [90, 93, 95, 97, 99]

# NOTE: the grid search scores candidates on the same `data` frame that the
# final evaluation reports on, so the reported metrics are not held-out.
best_lower, best_upper, best_f1 = tune_calibration_percentiles(
    data,
    positive_words,
    negative_words,
    negation_words,
    candidate_lowers,
    candidate_uppers,
)

# The search returns None percentiles when it accepted no grid point; in that
# case score a fixed 5% / 95% range instead.
if not (best_lower is not None and best_upper is not None):
    print("Tuning failed. Using fallback percentiles: 5% and 95%")
    best_lower, best_upper = 5, 95
    min_val = data['raw_score'].quantile(best_lower / 100)
    max_val = data['raw_score'].quantile(best_upper / 100)
    preds = data['processed_tokens_lexicon'].apply(
        lambda toks: lexicon_sentiment(toks, positive_words, negative_words, negation_words, min_val, max_val)[0]
    )
    best_f1 = f1_score(data['sentiment'], preds, average='weighted')

print(f"Best lower percentile: {best_lower}%")
print(f"Best upper percentile: {best_upper}%")
print(f"Best weighted F1 Score: {best_f1:.4f}")

# Raw-score values at the chosen percentiles; these bound the clamping range
# used for every rating computed from here on.
min_score_calib, max_score_calib = (
    data['raw_score'].quantile(pct / 100) for pct in (best_lower, best_upper)
)
print("Calibration Parameters:")
print(f"Min Score (calibrated): {min_score_calib:.4f}")
print(f"Max Score (calibrated): {max_score_calib:.4f}")

def apply_lexicon_sentiment(row, min_score, max_score):
    """
    Row-wise adapter around lexicon_sentiment for DataFrame.apply(axis=1).

    Reads row['processed_tokens_lexicon'] (anything that is not a list is
    treated as an empty token list), rates it against the module-level
    positive_words / negative_words / negation_words, and returns a Series
    with the keys 'lex_sentiment' and 'lex_raw_score'.
    """
    candidate = row['processed_tokens_lexicon']
    if not isinstance(candidate, list):
        candidate = []

    rating, raw_score = lexicon_sentiment(
        candidate, positive_words, negative_words, negation_words, min_score, max_score
    )
    return pd.Series({'lex_sentiment': rating, 'lex_raw_score': raw_score})

# Rate every labelled post with the calibrated range; the two calibration
# bounds are forwarded to apply_lexicon_sentiment as extra positional arguments.
data[['lex_sentiment', 'lex_raw_score']] = data.apply(
    apply_lexicon_sentiment,
    axis=1,
    args=(min_score_calib, max_score_calib),
)

# Final evaluation: golden labels (data['sentiment']) vs. lexicon ratings
print("\nLexicon-Based Sentiment Analysis Evaluation:")
print(classification_report(data['sentiment'], data['lex_sentiment']))
print("Confusion Matrix:")
print(confusion_matrix(data['sentiment'], data['lex_sentiment']))
print("\nMean Squared Error:", mean_squared_error(data['sentiment'], data['lex_sentiment']))


Best lower percentile: 7%
Best upper percentile: 95%
Best weighted F1 Score: 0.3234
Calibration Parameters:
Min Score (calibrated): -6.0000
Max Score (calibrated): 5.0000

Lexicon-Based Sentiment Analysis Evaluation:
              precision    recall  f1-score   support

           1       0.21      0.31      0.25      3613
           2       0.45      0.26      0.33     14914
           3       0.25      0.46      0.32      9968
           4       0.40      0.33      0.36     15114
           5       0.30      0.21      0.25      5566

    accuracy                           0.32     49175
   macro avg       0.32      0.32      0.30     49175
weighted avg       0.36      0.32      0.32     49175

Confusion Matrix:
[[1117 1092 1019  341   44]
 [2495 3927 5730 2370  392]
 [ 768 1516 4608 2622  454]
 [ 895 1824 5556 4980 1859]
 [ 155  419 1635 2172 1185]]

Mean Squared Error: 1.5643924758515506


In [6]:
# # Block 6: Process Each JSON File and Save Calibrated Lexicon Sentiment Results

# for fname in os.listdir(processed_dir):
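#     if fname.startswith('processed_r_') and fname.endswith('.json'):  # NOTE(review): Block 5 reads 'lexicon_r_*.json' from this same directory — confirm the prefix before re-enabling this cell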
#         file_path = os.path.join(processed_dir, fname)
#         with open(file_path, 'r', encoding='utf-8') as infile:
#             df = pd.DataFrame(json.load(infile))
#         if 'processed_tokens_lexicon' not in df.columns:
#             continue

#         # Apply the calibrated lexicon sentiment function to each row.
#         df[['lex_sentiment', 'lex_raw_score']] = df.apply(lambda row: apply_lexicon_sentiment(row, min_score_calib, max_score_calib), axis=1)

#         # --- Use the new output directory ---
#         out_file = fname.replace('.json', '_lexicon_sentiment.csv')
#         out_path = os.path.join(results_output_dir, out_file) # Use results_output_dir defined in Block 2
#         df.to_csv(out_path, index=False)
#         print(f"Saved: {out_file} to {results_output_dir}")

# print("\nFinished saving lexicon sentiment results per subreddit.")
