In [None]:
# Sentiment Analysis using AdaBoostClassifier

# This script implements sentiment analysis using AdaBoost with optimized hyperparameters.
# Key optimizations: Pipeline for tuning DecisionTreeClassifier parameters (max_depth), comprehensive hyperparameter grid, enhanced TF-IDF with bigrams, stratified split, cross-validation with parallel processing.

# Section 1: Setup and Library Imports
# ------------------------------------------- 
# Install required packages: pip install pandas numpy matplotlib seaborn wordcloud nltk scikit-learn ipython

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, confusion_matrix, classification_report,
                             roc_curve, auc)
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.pipeline import Pipeline
from IPython.display import display

# Fetch the NLTK corpora used later by the preprocessing cell:
# the stop-word list and the WordNet data behind the lemmatizer.
for nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(nltk_resource, quiet=True)

print("✅ All libraries imported successfully.")









In [None]:
# Section 2: Load Dataset
# ------------------------------------------------
# Load the training CSV into `df`. The path lives in a single variable so the
# status messages always name the file that was actually attempted.
DATA_PATH = '/content/train.csv'

try:
    # Only the read itself is guarded; a failure while displaying the frame
    # should surface normally rather than be reported as a missing file.
    df = pd.read_csv(DATA_PATH, encoding='ISO-8859-1')
except FileNotFoundError:
    # `df` stays undefined, which the later `if 'df' in locals()` guards rely on.
    print(f"\n❌ '{DATA_PATH}' not found. Please ensure the file is in the correct directory.")
else:
    print(f"\n✅ Dataset '{DATA_PATH}' loaded successfully.")
    display(df.head())
    print("\nDataset Info:")
    df.info()

In [None]:
# Section 3: Exploratory Data Analysis (EDA)
# -------------------------------------------
if 'df' in locals():
    # Discard rows that lack either the text or its label, then derive two
    # simple size features from the raw text.
    df.dropna(subset=['text', 'sentiment'], inplace=True)

    raw_text = df['text'].astype(str)
    df['text_length'] = raw_text.map(len)
    # A "word" here is a maximal run of \w characters.
    df['word_count'] = raw_text.map(lambda s: len(re.findall(r'\w+', s)))

    print("\nStatistical Summary of New Features:")
    display(df[['text_length', 'word_count']].describe())

In [None]:
# Section 4: Advanced Visualizations (5 Required Types)
# ------------------------------------------------------
# Draws five kinds of plots from `df` and writes each one to a PNG in the
# current working directory.
if 'df' in locals():
    def _save_show_close(filename):
        """Save the current figure to `filename`, display it, then close it.

        The file is written BEFORE plt.show(): with the notebook inline
        backend, show() renders and closes the figure, so calling savefig()
        afterwards writes an empty canvas. bbox_inches='tight' keeps titles
        placed outside the axes area (e.g. the pair-plot suptitle at y=1.02)
        inside the saved image.
        """
        plt.savefig(filename, bbox_inches='tight')
        plt.show()
        plt.close()
        print(f"Generated {filename}")

    print("\nGenerating visualizations...")
    sentiment_colors = ['skyblue', 'salmon', 'lightgray']

    # 1. Pie Chart: Sentiment Distribution
    plt.figure(figsize=(7, 7))
    df['sentiment'].value_counts().plot(kind='pie', autopct='%1.1f%%',
                                        colors=sentiment_colors,
                                        wedgeprops={'edgecolor': 'black'})
    plt.title('Pie Chart: Sentiment Distribution', fontsize=14)
    plt.ylabel('')
    _save_show_close('sentiment_distribution.png')

    # 2. Violin Plot: Text Length by Sentiment
    plt.figure(figsize=(10, 6))
    sns.violinplot(x='sentiment', y='text_length', data=df, palette=sentiment_colors)
    plt.title('Violin Plot: Text Length by Sentiment', fontsize=14)
    plt.xlabel('Sentiment')
    plt.ylabel('Text Length')
    _save_show_close('text_length_violin.png')

    # 3. Pair Plot: Correlations Between Features
    # pairplot creates its own figure, which becomes the current figure.
    sns.pairplot(df[['text_length', 'word_count', 'sentiment']], hue='sentiment',
                 palette=sentiment_colors)
    plt.suptitle('Pair Plot: Text Features by Sentiment', y=1.02)
    _save_show_close('pair_plot.png')

    # 4. KDE Plot: Density of Word Counts by Sentiment
    plt.figure(figsize=(12, 7))
    for sent in df['sentiment'].unique():
        sns.kdeplot(data=df[df['sentiment'] == sent], x='word_count', label=sent, fill=True, alpha=0.5)
    plt.title('KDE Plot: Density of Word Counts by Sentiment', fontsize=14)
    plt.legend()
    _save_show_close('word_count_kde.png')

    # 5. Word Clouds: Most Frequent Words per Sentiment
    for sent in df['sentiment'].unique():
        text = ' '.join(df[df['sentiment'] == sent]['text'].astype(str))
        wordcloud = WordCloud(width=800, height=400, background_color='white',
                              stopwords=STOPWORDS, collocations=False).generate(text)
        plt.figure(figsize=(10, 5))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        # str() keeps the title working if the labels are not plain strings.
        plt.title(f'Word Cloud for: {str(sent).capitalize()} Sentiment', fontsize=16)
        _save_show_close(f'word_cloud_{sent}.png')

In [None]:

# Section 5: Data Preprocessing
# -----------------------------
if 'df' in locals():
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    def preprocess_text(text):
        """Normalize one raw text into a space-joined string of lemmas.

        Steps, in order:
          1. Convert the value to str.
          2. Remove tokens starting with http / www (URLs).
          3. Remove @mentions.
          4. Remove '#' characters (the tag word itself is kept).
          5. Remove every character that is not an ASCII letter or whitespace.
          6. Lowercase and split on whitespace.
          7. Drop stop words and tokens of two characters or fewer, then
             lemmatize the rest with the lemmatizer's default settings.

        Parameters:
            text: any value; it is passed through str() first.

        Returns:
            str: the cleaned tokens joined by single spaces (may be empty).
        """
        text = str(text)
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        text = re.sub(r'@\w+', '', text)
        text = re.sub(r'#', '', text)
        text = re.sub(r'[^A-Za-z\s]', '', text)
        tokens = text.lower().split()
        # NOTE(review): the stop-word filter runs before lemmatization, and
        # NLTK's English list contains negations such as "not" — confirm that
        # dropping them is intended for a sentiment task.
        clean_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words and len(word) > 2]
        return ' '.join(clean_tokens)

    print("\nPreprocessing text data...")
    df['cleaned_text'] = df['text'].apply(preprocess_text)
    print("✅ Preprocessing complete.")

    # Display a sample to verify. Row 5 is used when it exists; shorter
    # frames fall back to their last row, and an empty frame prints nothing
    # instead of raising IndexError.
    if len(df) > 0:
        sample_pos = min(5, len(df) - 1)
        print("\nSample Original Text:\n", df['text'].iloc[sample_pos])
        print("\nSample Cleaned Text:\n", df['cleaned_text'].iloc[sample_pos])

In [None]:

# Section 6: Feature Engineering and Data Splitting
# --------------------------------------------------
# Produces X_train / X_test (sparse TF-IDF matrices), y_train / y_test
# (integer-encoded labels), plus the fitted `vectorizer` and `le`.
if 'df' in locals():
    X_text = df['cleaned_text']
    y_labels = df['sentiment']

    le = LabelEncoder()
    y = le.fit_transform(y_labels)
    # Encoded labels are the positions of the classes in le.classes_.
    print("\nLabel mapping:", {label: code for code, label in enumerate(le.classes_)})

    # Stratified split on the cleaned text FIRST, so the vectorizer below never
    # sees the test documents. Same sample count, random_state and stratify
    # target as before, so the same rows land in each split.
    X_text_train, X_text_test, y_train, y_test = train_test_split(
        X_text, y, test_size=0.2, random_state=42, stratify=y)

    # TF-IDF vectorization with max_features and ngram_range. Vocabulary and
    # IDF weights are learned from the training texts only; fitting on the full
    # corpus would leak test-set statistics into the features.
    vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
    X_train = vectorizer.fit_transform(X_text_train)
    X_test = vectorizer.transform(X_text_test)

    # Full-corpus matrix kept only so code that still refers to `X` keeps
    # working; it is built with the train-fitted vectorizer.
    X = vectorizer.transform(X_text)

    print(f"\nData split into: \nTraining set shape: {X_train.shape} \nTesting set shape: {X_test.shape}")

In [None]:

# Section 7: Model Training - AdaBoost with Hyperparameter Tuning
# ----------------------------------------------------------------
if 'df' in locals():
    # Boosted decision trees wrapped in a one-step Pipeline, so the grid keys
    # below address the booster as 'classifier__...'.
    weak_learner = DecisionTreeClassifier()
    booster = AdaBoostClassifier(estimator=weak_learner, random_state=42)
    pipe = Pipeline(steps=[('classifier', booster)])

    # Search space: number of boosting rounds, learning rate, and the depth
    # of each base tree.
    param_grid = dict(
        classifier__n_estimators=[50, 100, 200],
        classifier__learning_rate=[0.1, 0.5, 1.0],
        classifier__estimator__max_depth=[1, 2, 3],
    )

    grid_search = GridSearchCV(
        estimator=pipe,
        param_grid=param_grid,
        scoring='accuracy',
        cv=3,
        n_jobs=-1,
        verbose=1,
    )
    print("\nStarting hyperparameter tuning with GridSearchCV...")
    grid_search.fit(X_train, y_train)
    print(f"\n✅ Tuning complete. Best parameters found: {grid_search.best_params_}")

    # Score the held-out set with the best pipeline found by the search.
    best_model = grid_search.best_estimator_
    y_pred_proba = best_model.predict_proba(X_test)
    y_pred = best_model.predict(X_test)


In [None]:


# Section 8: Model Evaluation
# -----------------------------------------------------------------
# Reports accuracy, macro precision/recall/F1 and one-vs-rest ROC-AUC on the
# test split, then saves and shows a confusion-matrix heatmap and per-class
# ROC curves.
if 'df' in locals():
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')

    # Binarize against the encoder's full class list (codes 0..n_classes-1)
    # instead of whatever labels happen to occur in y_test, so column i of
    # y_test_bin always refers to le.classes_[i].
    # Assumes every class was present in the training split, so predict_proba
    # has one column per encoder class — TODO confirm for other datasets.
    class_indices = np.arange(len(le.classes_))
    y_test_bin = label_binarize(y_test, classes=class_indices)
    roc_auc = roc_auc_score(y_test_bin, y_pred_proba, multi_class='ovr')

    print("\n--- Model Performance Metrics ---")
    print(f"1. Accuracy: {accuracy:.4f}")
    print(f"2. Precision (Macro): {precision:.4f}")
    print(f"3. Recall (Macro): {recall:.4f}")
    print(f"4. F1-Score (Macro): {f1:.4f}")
    print(f"5. ROC-AUC (One-vs-Rest): {roc_auc:.4f}")

    print("\n--- Classification Report ---")
    print(classification_report(y_test, y_pred, target_names=le.classes_))

    # Confusion Matrix Heatmap
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=le.classes_, yticklabels=le.classes_)
    plt.title('Confusion Matrix', fontsize=14)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    # Save BEFORE show(): with the notebook inline backend, show() closes the
    # figure, and a savefig() issued afterwards writes an empty image.
    plt.savefig('confusion_matrix.png', bbox_inches='tight')
    plt.show()
    plt.close()
    print("Generated confusion_matrix.png")

    # ROC Curves (One-vs-Rest)
    plt.figure(figsize=(10, 8))
    colors = ['aqua', 'darkorange', 'cornflowerblue']
    # zip stops at the shortest input, so at most len(colors) curves are drawn.
    for i, color, class_name in zip(range(len(le.classes_)), colors, le.classes_):
        fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_pred_proba[:, i])
        roc_auc_val = auc(fpr, tpr)
        plt.plot(fpr, tpr, color=color, lw=2,
                 label=f'ROC curve for {class_name} (area = {roc_auc_val:.2f})')
    plt.plot([0, 1], [0, 1], 'k--', lw=2)  # chance-level diagonal
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Multi-Class ROC Curves (One-vs-Rest)', fontsize=14)
    plt.legend(loc='lower right')
    plt.savefig('roc_curves.png', bbox_inches='tight')
    plt.show()
    plt.close()
    print("Generated roc_curves.png")

    print("\n--- End of Notebook ---")