In [1]:
from typing import List, Union

import nltk #Natural Language Toolkit
import numpy as np
import pandas as pd
from nltk.corpus import twitter_samples
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
## Process data
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
import re
import string
## model
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline



In [2]:
nltk.download('twitter_samples')
nltk.download('stopwords')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
all_positive_tweets_sen = twitter_samples.strings('positive_tweets.json')
all_negative_tweets_sen = twitter_samples.strings('negative_tweets.json')

In [4]:
def process_tweet(tweet: str) -> List[str]:
    """
    Processes a tweet by cleaning, tokenizing, and stemming the words.

    Parameters:
    - tweet: A string representing a tweet.

    Returns:
    - tweets_clean: A list of words containing the processed tweet.
    """
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')

    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?://[^\s\n\r]+', '', tweet)
    # remove hashtags (only removing the hash # sign)
    tweet = re.sub(r'#', '', tweet)

    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if word not in stopwords_english and word not in string.punctuation:  # remove stopwords and punctuation
            stem_word = stemmer.stem(word)
            tweets_clean.append(stem_word)

    return tweets_clean

# Assuming all_positive_tweets_sen and all_negative_tweets_sen are lists of tweet strings
all_positive_tweets = [process_tweet(tweet) for tweet in all_positive_tweets_sen]
all_negative_tweets = [process_tweet(tweet) for tweet in all_negative_tweets_sen]

In [5]:
len(all_positive_tweets)

5000

In [6]:
len(all_negative_tweets)

5000

In [7]:
test_pos = all_positive_tweets[4000:]
train_pos = all_positive_tweets[:4000]
test_neg = all_negative_tweets[4000:]
train_neg = all_negative_tweets[:4000]

train_x = train_pos + train_neg
test_x = test_pos + test_neg

In [8]:
y_train = np.append(np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)), axis=0)
y_test = np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)), axis=0)

In [9]:
# Print the shape train and test sets
print("train_y.shape = " + str(y_train.shape))
print("test_y.shape = " + str(y_test.shape))

train_y.shape = (8000, 1)
test_y.shape = (2000, 1)


In [10]:
X_train = [' '.join(tokens) for tokens in train_x]
X_test = [' '.join(tokens) for tokens in test_x]

In [11]:
def error_analysis(pipeline: Pipeline, 
                   X_test: Union[np.ndarray, List[str]], 
                   y_test: np.ndarray, 
                   sentences: List[str]) -> pd.DataFrame:
    """
    Perform error analysis on the model's predictions using the entire pipeline.

    Parameters:
    - pipeline: The trained pipeline containing vectorizer and model.
    - X_test: The test features (original text data).
    - y_test: The true labels for the test set.
    - sentences: List or array containing the original sentences or data points.

    Returns:
    - DataFrame with columns: ['sentence', 'predicted_class', 'real_class']
      showing the misclassified data points.
    """
    # Make predictions using the pipeline
    y_pred = pipeline.predict(X_test)

    # Create a DataFrame for error analysis
    df_errors = pd.DataFrame({
        'sentence': sentences,
        'predicted_class': y_pred,
        'real_class': y_test.flatten()  # Ensure this is flattened
    })

    # Filter rows where the prediction is incorrect
    df_errors = df_errors[df_errors['predicted_class'] != df_errors['real_class']]

    return df_errors

In [12]:
# Store error DataFrames for all models
all_errors = {}

#Model
models = {
    'Logistic Regression': LogisticRegression(),
    'SVC': SVC(),
    'Random Forest': RandomForestClassifier(),
    'Naive Bayes': MultinomialNB(),
    'LightGBM': lgb.LGBMClassifier()
}
for model_name, model in models.items():
    # Create the pipeline
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(tokenizer=lambda x: x.split(), sublinear_tf=True)),
        ('classifier', model)
    ])
    # Fit the model
    pipeline.fit(X_train, y_train)

    # Predictions and classification report
    y_pred = pipeline.predict(X_test)
    print(f"{model_name} Classification Report:")
    print(classification_report(y_test, y_pred))

    # Perform error analysis
    errors = error_analysis(pipeline, X_test, y_test, test_x)  # Pass test_x as the sentences
    all_errors[model_name] = errors  # Store the errors for each model


  y = column_or_1d(y, warn=True)


Logistic Regression Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      0.99      1.00      1000
         1.0       1.00      1.00      1.00      1000

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000



  y = column_or_1d(y, warn=True)


SVC Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      1000
         1.0       1.00      1.00      1.00      1000

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000



  return fit_method(estimator, *args, **kwargs)


Random Forest Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      0.99      1.00      1000
         1.0       0.99      1.00      1.00      1000

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000

Naive Bayes Classification Report:
              precision    recall  f1-score   support

         0.0       0.96      0.96      0.96      1000
         1.0       0.96      0.96      0.96      1000

    accuracy                           0.96      2000
   macro avg       0.96      0.96      0.96      2000
weighted avg       0.96      0.96      0.96      2000



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[LightGBM] [Info] Number of positive: 4000, number of negative: 4000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010174 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9218
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 407
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
LightGBM Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      0.99      1.00      1000
         1.0       0.99      1.00      1.00      1000

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000



In [13]:
#Loading the error
logistic_errors = all_errors['Logistic Regression']
logistic_errors 

Unnamed: 0,sentence,predicted_class,real_class
753,"[park, get, sunlight]",0.0,1.0
1298,"[u, prob, fun, david]",1.0,0.0
1544,"[pat, jay]",1.0,0.0
1756,"[belov, grandmoth]",1.0,0.0
1773,"[that, life, get, call, peopl, havent, seen, 2...",1.0,0.0
1853,"[sr, financi, analyst, expedia, inc, bellevu, ...",1.0,0.0


# Tuning

In [14]:
def error_analysis_tuning(model, 
                          X_test: Union[np.ndarray, List[str]], 
                          y_test: np.ndarray, 
                          sentences: List[str]) -> pd.DataFrame:
    """
    Perform error analysis on the model's predictions.

    Parameters:
    - model: The trained model to evaluate.
    - X_test: The test features.
    - y_test: The true labels for the test set.
    - sentences: List or array containing the original sentences or data points.

    Returns:
    - DataFrame with columns: ['sentence', 'predicted_class', 'real_class']
      showing the misclassified data points.
    """
    # Make predictions
    y_pred = model.predict(X_test).tolist()
    y_pred = [int(pred) for pred in y_pred]
    y_test = y_test.flatten().astype(int).tolist()
    
    # Create a DataFrame for error analysis
    df_errors = pd.DataFrame({
        'sentence': sentences,        # Sentences or data points
        'predicted_class': y_pred,    # Model predictions
        'real_class': y_test          # True labels
    })

    # Filter rows where the prediction is incorrect
    df_errors = df_errors[df_errors['predicted_class'] != df_errors['real_class']]

    return df_errors

In [15]:
rf_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(tokenizer=lambda x: x.split(), sublinear_tf=True)),
    ('classifier', RandomForestClassifier(random_state=42))
])
rf_param_grid = {
    'classifier__n_estimators': [100, 200, 300, 400, 500],
    'classifier__max_features': ['sqrt', 'log2', None],
    'classifier__max_depth': [None, 10, 20, 30, 40, 50],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__bootstrap': [True, False]
}

rf_grid_search = GridSearchCV(rf_pipeline, rf_param_grid, cv=3, scoring='accuracy', verbose=2, n_jobs=-1)
rf_grid_search.fit(X_train, y_train)
# Best parameters and score for Random Forest
print("Best parameters for Random Forest:", rf_grid_search.best_params_)
print("Best score for Random Forest:", rf_grid_search.best_score_)

# Predictions and classification report for Random Forest
rf_best_model = rf_grid_search.best_estimator_
rf_y_pred = rf_best_model.predict(X_test)
print("Random Forest Classification Report:")
print(classification_report(y_test, rf_y_pred))

error_analysis_tuning(rf_best_model, X_test, y_test, test_x)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits


KeyboardInterrupt: 

In [None]:
lgb_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(tokenizer=lambda x: x.split(), sublinear_tf=True)),
    ('classifier', lgb.LGBMClassifier(random_state=42))
])
lgb_param_grid = {
    'classifier__num_leaves': np.arange(20, 150, 5),
    'classifier__max_depth': np.arange(3, 15, 1),
    'classifier__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'classifier__n_estimators': np.arange(50, 300, 10),
    'classifier__boosting_type': ['gbdt', 'dart'],
    'classifier__min_child_samples': np.arange(1, 20, 1),
    'classifier__subsample': [0.6, 0.8, 1.0]
}
lgb_grid_search = GridSearchCV(lgb_pipeline, lgb_param_grid, cv=3, scoring='accuracy', verbose=1)
lgb_grid_search.fit(X_train, y_train)

print("Best parameters for Random Forest:", lgb_grid_search.best_params_)
print("Best score for Random Forest:", lgb_grid_search.best_score_)

lgb_best_model = lgb_grid_search.best_estimator_
lgb_y_pred = lgb_best_model.predict(X_test)
print("Random Forest Classification Report:")
print(classification_report(y_test, rf_y_pred))

error_analysis_tuning(lgb_best_model, X_test, y_test, test_x)