<a href="https://colab.research.google.com/github/Srija-Lattala/Threads_Sentiment_Analysis/blob/main/threads_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# sentiment_training.py
import pandas as pd
import numpy as np
import re
import string
import emoji
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
import joblib

# Fetch the NLTK data packages used below: the tokenizer models
# ('punkt', 'punkt_tab'), the stop-word lists and the WordNet corpus.
for _resource in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(_resource)


# Patterns are compiled once so the per-row call only has to apply them.
_URL_RE = re.compile(r'http\S+|www\S+|https\S+')
_MENTION_HASH_RE = re.compile(r'\@\w+|\#')
_PUNCTUATION_RE = re.compile(r'[%s]' % re.escape(string.punctuation))
_DIGIT_WORD_RE = re.compile(r'\w*\d\w*')
_WHITESPACE_RE = re.compile(r'\s+')

# Filled on first use by _get_nlp_resources(); kept lazy so that importing
# this module does not require the NLTK corpora to be readable yet.
_NLP_CACHE = {}


def _get_nlp_resources():
    """Return the ``(stop_words, lemmatizer)`` pair, building it on first use."""
    if not _NLP_CACHE:
        _NLP_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _NLP_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _NLP_CACHE['stop_words'], _NLP_CACHE['lemmatizer']


def preprocess_text(text):
    """Normalize one comment into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase, replace emoji via ``emoji.demojize``, strip
    URLs, strip ``@mentions`` and ``#`` characters, turn punctuation into
    spaces, drop any word containing a digit, collapse whitespace, tokenize
    with NLTK, remove English stop words and lemmatize what is left.

    Args:
        text: The raw comment. Anything that is not a ``str`` (for example a
            missing value read as NaN) is treated as empty.

    Returns:
        The cleaned text, or ``''`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = emoji.demojize(text)
    text = _URL_RE.sub('', text)
    text = _MENTION_HASH_RE.sub('', text)
    text = _PUNCTUATION_RE.sub(' ', text)
    text = _DIGIT_WORD_RE.sub('', text)
    text = _WHITESPACE_RE.sub(' ', text).strip()

    # The stop-word set and lemmatizer are shared across calls: this function
    # runs once per row, and rebuilding them each time is pure overhead.
    stop_words, lemmatizer = _get_nlp_resources()
    tokens = [
        lemmatizer.lemmatize(token)
        for token in nltk.word_tokenize(text)
        if token not in stop_words
    ]

    return ' '.join(tokens)


def score_to_sentiment(score):
    """Map a numeric review score onto a sentiment label.

    Scores of 4 and above are ``'positive'``, scores of 2 and below are
    ``'negative'``, and everything else is ``'neutral'``.
    """
    positive_floor = 4
    negative_ceiling = 2

    if score >= positive_floor:
        return 'positive'
    if score <= negative_ceiling:
        return 'negative'
    return 'neutral'

def main():
    """Train three sentiment classifiers on Threads comments and save the best.

    Reads ``threads_comments.csv`` (uses its ``score`` and ``content``
    columns), derives a sentiment label from the score, cleans the text,
    and fits XGBoost, Logistic Regression and Random Forest pipelines on a
    stratified 80/20 split. Each pipeline is evaluated on the held-out 20%;
    the one with the highest accuracy is written to
    ``best_sentiment_model.pkl`` and the fitted label encoder to
    ``label_encoder.pkl``.
    """
    # Load data
    df = pd.read_csv('threads_comments.csv')

    # Convert scores to sentiment
    df['sentiment'] = df['score'].apply(score_to_sentiment)

    # Preprocess comments
    print("Preprocessing comments...")
    df['cleaned_content'] = df['content'].apply(preprocess_text)

    # Encode labels
    le = LabelEncoder()
    df['sentiment_encoded'] = le.fit_transform(df['sentiment'])
    # Keep the id -> name mapping so the reports show class names, not 0/1/2.
    class_ids = list(range(len(le.classes_)))
    class_names = [str(label) for label in le.classes_]

    # Split data
    X = df['cleaned_content']
    y = df['sentiment_encoded']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.2,
        random_state=42,
        stratify=y  # Maintain class distribution
    )

    # TF-IDF settings. A fresh vectorizer is built from these for every
    # pipeline so that no fitted state is shared between models and the
    # pipeline that gets saved owns its own vectorizer.
    tfidf_params = {
        'max_features': 25000,
        'ngram_range': (1, 3),
        'min_df': 5,
        'max_df': 0.85,
        'stop_words': 'english',
    }

    # Initialize models. The tree ensembles get a fixed seed so repeated runs
    # give the same result. `use_label_encoder` is intentionally not passed:
    # XGBoost reports it as an unused parameter.
    models = {
        'XGBoost': XGBClassifier(
            n_estimators=300,
            max_depth=5,
            learning_rate=0.1,
            subsample=0.8,
            colsample_bytree=0.8,
            eval_metric='mlogloss',
            random_state=42,
            n_jobs=-1
        ),
        'Logistic Regression': LogisticRegression(
            C=0.5,
            max_iter=1000,
            class_weight='balanced',
            n_jobs=-1
        ),
        'Random Forest': RandomForestClassifier(
            n_estimators=200,
            max_depth=10,
            class_weight='balanced',
            random_state=42,
            n_jobs=-1
        )
    }

    # Start below any reachable accuracy so the first model is always kept,
    # even if it scores exactly 0; otherwise `None` could end up pickled.
    best_accuracy = -1.0
    best_model = None

    for name, model in models.items():
        print(f"\nTraining {name}...")
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(**tfidf_params)),
            ('clf', model)
        ])

        pipeline.fit(X_train, y_train)

        # Evaluate
        y_pred = pipeline.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        print(f"{name} Accuracy: {accuracy:.4f}")
        print(classification_report(
            y_test, y_pred,
            labels=class_ids,
            target_names=class_names
        ))

        # Track best model
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model = pipeline

    # Save best model and label encoder
    print(f"\nSaving best model with accuracy: {best_accuracy:.4f}")
    joblib.dump(best_model, 'best_sentiment_model.pkl')
    joblib.dump(le, 'label_encoder.pkl')

# Run the training workflow only when this file is executed as a script.
if __name__ == '__main__':
    main()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Preprocessing comments...

Training XGBoost...


Parameters: { "use_label_encoder" } are not used.



XGBoost Accuracy: 0.7948
              precision    recall  f1-score   support

           0       0.81      0.60      0.69      1837
           1       0.25      0.02      0.03       417
           2       0.79      0.96      0.87      4146

    accuracy                           0.79      6400
   macro avg       0.62      0.53      0.53      6400
weighted avg       0.76      0.79      0.76      6400


Training Logistic Regression...
Logistic Regression Accuracy: 0.7228
              precision    recall  f1-score   support

           0       0.73      0.66      0.69      1837
           1       0.13      0.29      0.18       417
           2       0.86      0.80      0.83      4146

    accuracy                           0.72      6400
   macro avg       0.57      0.58      0.57      6400
weighted avg       0.78      0.72      0.75      6400


Training Random Forest...
Random Forest Accuracy: 0.7325
              precision    recall  f1-score   support

           0       0.68      0