In [None]:
# ============================================
# ⚙️ BLOCK 1: SETUP & LIBRARIES
# ============================================
import nltk

# Fetch only the NLTK resources this notebook uses rather than the entire
# 'all' collection, which pulls in every corpus, tagger and model NLTK ships.
#   - stopwords          -> stopwords.words('english')
#   - wordnet, omw-1.4   -> WordNetLemmatizer
#   - punkt, punkt_tab   -> word_tokenize (recent NLTK releases look up
#                           'punkt_tab'; older ones use 'punkt')
NLTK_RESOURCES = ('stopwords', 'wordnet', 'omw-1.4', 'punkt', 'punkt_tab')

print("Downloading required NLTK resources...")
for resource in NLTK_RESOURCES:
    nltk.download(resource)

import pandas as pd
import re
import joblib
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Import scikit-learn modules for all four models
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB # ADDED NAIVE BAYES
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

# Mount Google Drive; the dataset CSVs and the saved model all live under
# /content/drive/MyDrive.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

print("‚úÖ Setup complete. Libraries imported and Drive mounted.")


# ============================================
# 🧹 BLOCK 2: DATA LOADING & PREPARATION
# ============================================
# --- Reddit dataset ---
# Its text column is called 'clean_comment'; rename to the shared schema
# ('text', 'sentiment') so both sources can be stacked.
reddit_path = "/content/drive/MyDrive/Dataset/Reddit_Data.csv"
print(f"\nLoading Reddit dataset from: {reddit_path}")
df_reddit = pd.read_csv(reddit_path).rename(
    columns={'clean_comment': 'text', 'category': 'sentiment'}
)

# --- Twitter dataset ---
# Same idea, but the text column is called 'clean_text' here.
twitter_path = "/content/drive/MyDrive/Dataset/Twitter_Data.csv"
print(f"Loading Twitter dataset from: {twitter_path}")
df_twitter = pd.read_csv(twitter_path).rename(
    columns={'clean_text': 'text', 'category': 'sentiment'}
)


# --- Stack both sources into a single frame with a fresh 0..n-1 index ---
df = pd.concat((df_reddit, df_twitter), ignore_index=True)

# Translate the numeric labels into readable class names. Any value that is
# not a key of the mapping becomes NaN and is removed by the dropna below.
sentiment_map = {-1: 'Negative', 0: 'Neutral', 1: 'Positive'}
df['sentiment'] = df['sentiment'].map(sentiment_map)

# Discard rows missing a label or text, then keep only the first occurrence
# of each distinct raw text.
df.dropna(subset=['sentiment', 'text'], inplace=True)
df.drop_duplicates(subset=["text"], inplace=True)


# Shared NLP resources, built once and reused by every clean_text call
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Normalize one raw document into a space-joined string of lemmas.

    The input is coerced to ``str`` and lowercased, then stripped (in this
    order) of URLs, @mentions / #hashtags, and every character that is
    neither a word character nor whitespace. The result is tokenized with
    ``word_tokenize``; tokens found in ``stop_words`` are dropped and the
    rest are lemmatized with ``lemmatizer``.
    """
    normalized = str(text).lower()
    # Order matters: URLs and @/# tags must go before the generic symbol
    # strip removes the characters that identify them.
    for pattern in (r"http\S+|www\S+", r"@\w+|#\w+", r"[^\w\s]"):
        normalized = re.sub(pattern, "", normalized)

    lemmas = []
    for token in word_tokenize(normalized):
        if token in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return " ".join(lemmas)

print("Cleaning combined text data... (This may take a moment)")
# Run clean_text over every raw document and keep the result alongside the
# original text in a new column.
raw_texts = df['text']
df['cleaned_text'] = raw_texts.apply(clean_text)

total_samples = len(df)
print(f"‚úÖ Data loaded and cleaned. Total samples: {total_samples}")


# ============================================
# 🚀 BLOCK 3: TRAIN & COMPARE ALL MODELS
# ============================================
# Features are the cleaned documents; targets are the sentiment class names.
X, y = df['cleaned_text'], df['sentiment']

# Hold out 20% for evaluation, stratified on the label, with a fixed seed so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# --- Define the models ---
# Template carrying the TF-IDF configuration shared by every model. It is
# never fitted itself; each pipeline gets its own copy (see _build_pipeline).
vectorizer = TfidfVectorizer(max_features=15000, ngram_range=(1, 2))


def _build_pipeline(classifier):
    """Return a TF-IDF -> classifier Pipeline with its own vectorizer.

    Pipeline fits the step objects it is given in place. Putting the same
    TfidfVectorizer instance in several pipelines therefore means fitting one
    pipeline refits the vectorizer inside all the others, including a
    pipeline that was already trained and may later be saved. Building a new
    vectorizer from the template's parameters keeps every pipeline
    self-contained.
    """
    return Pipeline([
        ('tfidf', TfidfVectorizer(**vectorizer.get_params())),
        ('classifier', classifier),
    ])


# Model 1: Logistic Regression
logreg_model = _build_pipeline(
    LogisticRegression(random_state=42, max_iter=1000)
)

# Model 2: Random Forest
rf_model = _build_pipeline(
    RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
)

# Model 3: Gradient Boosting
gb_model = _build_pipeline(
    GradientBoostingClassifier(n_estimators=100, random_state=42)
)

# Model 4: Naive Bayes
nb_model = _build_pipeline(MultinomialNB())


# Display name -> pipeline. Insertion order is the order the models are
# trained and compared in.
models = {
    "Logistic Regression": logreg_model,
    "Random Forest": rf_model,
    "Gradient Boosting": gb_model,
    "Naive Bayes": nb_model,
}

# Running record of the top scorer seen so far.
best_model, best_model_name, best_accuracy = None, "", 0

# --- Fit each pipeline on the training split and score it on the test split ---
for name in models:
    model = models[name]
    print(f"\n--- Training {name} ---")
    y_pred = model.fit(X_train, y_train).predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"‚úÖ {name} Accuracy: {accuracy:.4f}")

    # Only a strictly higher score replaces the current leader, so on a tie
    # the earlier model in `models` stays the winner.
    if accuracy <= best_accuracy:
        continue
    best_model_name, best_model, best_accuracy = name, model, accuracy

print(f"\nüèÜ The Best Model is: {best_model_name} with an accuracy of {best_accuracy:.4f}")

# ============================================
# 📊 BLOCK 4: DETAILED REPORT FOR THE WINNER
# ============================================
print(f"\n--- Detailed Report for {best_model_name} ---")
# Re-predict the held-out split with the winning pipeline and print the
# per-class precision / recall / F1 table. zero_division=0 reports 0 for
# undefined ratios.
final_predictions = best_model.predict(X_test)
winner_report = classification_report(
    y_test,
    final_predictions,
    zero_division=0,
)
print(winner_report)


# ============================================
# 💾 BLOCK 5: SAVE THE CHAMPION MODEL
# ============================================
# Persist the whole winning pipeline (vectorizer + classifier) to Drive so it
# can be reloaded later with joblib.load.
save_path = "/content/drive/MyDrive/best_sentiment_model_final.joblib"
joblib.dump(best_model, save_path)
print(f"\n‚úÖ Champion model saved successfully to your Google Drive at: {save_path}")

Downloading all NLTK packages... (This may take a few minutes)


[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_rus to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |  

Mounted at /content/drive
‚úÖ Setup complete. Libraries imported and Drive mounted.

Loading Reddit dataset from: /content/drive/MyDrive/Dataset/Reddit_Data.csv
Loading Twitter dataset from: /content/drive/MyDrive/Dataset/Twitter_Data.csv
Cleaning combined text data... (This may take a moment)
‚úÖ Data loaded and cleaned. Total samples: 199708

--- Training Logistic Regression ---
‚úÖ Logistic Regression Accuracy: 0.8836

--- Training Random Forest ---
‚úÖ Random Forest Accuracy: 0.8469

--- Training Gradient Boosting ---
‚úÖ Gradient Boosting Accuracy: 0.7198

--- Training Naive Bayes ---
‚úÖ Naive Bayes Accuracy: 0.7209

üèÜ The Best Model is: Logistic Regression with an accuracy of 0.8836

--- Detailed Report for Logistic Regression ---
              precision    recall  f1-score   support

    Negative       0.87      0.76      0.81      8751
     Neutral       0.86      0.96      0.91     13592
    Positive       0.91      0.89      0.90     17599

    accuracy                   