In [1]:
import os
import pandas as pd
import numpy as np
from collections import defaultdict, Counter
import matplotlib.pyplot as plt
import seaborn as sns

import string
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer, PorterStemmer
import re

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
import umap.umap_ as umap

from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN

from sklearn.cluster import KMeans, DBSCAN, SpectralClustering, OPTICS
from sklearn.mixture import GaussianMixture

from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split, cross_val_score
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder

from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [3]:
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides any deprecation or
# convergence warnings the libraries used below may raise — consider
# narrowing it to specific categories.
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Fetch the NLTK resources the preprocessing helpers rely on: the POS tagger
# (nltk.pos_tag), the tokenizer models (nltk.word_tokenize), WordNet (wn /
# WordNetLemmatizer) and the stop-word lists (stopwords.words).
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\utgoy\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\utgoy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\utgoy\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\utgoy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Load the data

In [6]:
# Read the train / validation / test splits from the local ./data folder.
# The trailing .dropna() calls are commented out, so rows with missing
# values are kept in all three frames.
train_data = pd.read_csv('./data/train.csv')#.dropna()
val_data = pd.read_csv('./data/val.csv')#.dropna()
test_data = pd.read_csv('./data/test.csv')#.dropna()

In [7]:
# Full training split: phrases and labels, unlabelled rows included.
X_train, y_train = train_data['Phrase'], train_data['Sentiment']

# Rows labelled -100 are the unlabelled ones; the mask keeps everything else.
mask = y_train.ne(-100)
train_data_clean = train_data.loc[mask]
X_train_clean, y_train_clean = X_train.loc[mask], y_train.loc[mask]

# Validation split: phrases and labels.
X_val, y_val = val_data['Phrase'], val_data['Sentiment']

# Test split: phrases only.
X_test = test_data['Phrase']

# Report how many rows each split holds.
print(f"Train Data Shape: {X_train.shape}")
print(f"Cleaned Train Data Shape: {train_data_clean['Phrase'].shape}")
print(f"Validation Data Shape: {X_val.shape}")
print(f"Test Data Shape: {X_test.shape}")

# Share of every label as a percentage of ALL rows in the split, so the train
# figures keep the unlabelled (-100) rows in the denominator. One loop replaces
# the twelve copy-pasted print statements; the printed text is unchanged.
for split_name, split_labels, split_rows in (
    ("train", y_train, X_train.shape[0]),
    ("val", y_val, X_val.shape[0]),
):
    print(" ")
    for label in (0, 1, 2, 3, 4, -100):
        share = (split_labels == label).sum() / split_rows * 100
        print(f"Number of labels = {label} in {split_name} dataset as percentage: {share:0.2f}%")

Train Data Shape: (59706,)
Cleaned Train Data Shape: (24758,)
Validation Data Shape: (23256,)
Test Data Shape: (23257,)
 
Number of labels = 0 in train dataset as percentage: 8.33%
Number of labels = 1 in train dataset as percentage: 8.95%
Number of labels = 2 in train dataset as percentage: 5.33%
Number of labels = 3 in train dataset as percentage: 9.60%
Number of labels = 4 in train dataset as percentage: 9.26%
Number of labels = -100 in train dataset as percentage: 58.53%
 
Number of labels = 0 in val dataset as percentage: 19.63%
Number of labels = 1 in val dataset as percentage: 20.27%
Number of labels = 2 in val dataset as percentage: 20.42%
Number of labels = 3 in val dataset as percentage: 19.81%
Number of labels = 4 in val dataset as percentage: 19.88%
Number of labels = -100 in val dataset as percentage: 0.00%


# Cleaning Data

## Preprocessing Helper Functions

In [10]:
def clean(text):
    """Normalise one raw text snippet into lowercase, space-separated ASCII words.

    Steps, in order:

    1. drop every line that starts with ``http://`` or ``https://``;
    2. turn ``<br />`` and newlines into spaces, decode the ``&quot;``,
       ``&#39;`` and ``&amp;`` entities, expand a stand-alone `` u `` to
       `` you `` and delete backticks;
    3. delete any remaining ``<...>`` markup;
    4. replace every non-word character with a space;
    5. blank out single ASCII letters that are followed by whitespace and
       either follow whitespace or open the string;
    6. replace digit runs with a space, lowercase, collapse whitespace and
       drop non-ASCII characters.

    Args:
        text: value to clean; it is converted with ``str()`` first.

    Returns:
        The cleaned string. It is not stripped, so a single leading or
        trailing space can remain where removed characters sat at either end.
    """
    cleaned = re.sub(r'^https?:\/\/.*[\r\n]*', '', str(text), flags=re.MULTILINE)

    # Markup, entities and chat shorthand.
    cleaned = re.sub(r"<br />", " ", cleaned)
    cleaned = re.sub(r"&quot;", "\"", cleaned)
    cleaned = re.sub('&#39;', "'", cleaned)      # &#39; is an apostrophe
    cleaned = re.sub('&amp;', 'and', cleaned)
    cleaned = re.sub('\n', " ", cleaned)
    cleaned = re.sub(' u ', " you ", cleaned)
    cleaned = re.sub('`', "", cleaned)

    # Tags must be removed while '<' and '>' still exist; after the \W pass
    # below only the tag names would be left behind as ordinary words.
    # NOTE(review): '<.*?>' also swallows text between a literal '<' and '>'.
    cleaned = re.sub(r'<.*?>', '', cleaned)

    # Everything that is not a letter, digit or underscore becomes a space.
    # This also covers '\r', brackets, quotes and repeated '!' / '?'.
    cleaned = re.sub(r'\W', ' ', cleaned)

    # Stand-alone single letters. The lookahead leaves the trailing whitespace
    # unconsumed so that adjacent single letters ("x y z") are all matched.
    cleaned = re.sub(r'\s+[a-zA-Z](?=\s)', ' ', cleaned)
    # A single letter opening the string ('^' is the anchor, not a literal).
    cleaned = re.sub(r'^[a-zA-Z]\s+', ' ', cleaned)

    cleaned = re.sub(r'\d+', ' ', cleaned)
    cleaned = cleaned.lower()
    # Lowercasing can emit combining marks that are not word characters.
    cleaned = re.sub(r'[^\w\s]', ' ', cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned)

    return cleaned.encode('ascii', 'ignore').decode('ascii')

def clean_dataset(dataset):
    """Run ``clean`` over the first column of a 2-D table, in place.

    NOTE(review): the ``[row, 0]`` item access suggests a 2-D NumPy array is
    expected — confirm against the callers.

    Args:
        dataset: object exposing ``.shape`` and ``[row, column]`` item access.

    Returns:
        The same ``dataset`` object, after its first column was overwritten.
    """
    text_column = 0
    n_rows = dataset.shape[0]
    for row_idx in range(n_rows):
        raw_cell = dataset[row_idx, text_column]
        dataset[row_idx, text_column] = clean(raw_cell)
    return dataset

def tokenize_lexicon(texts):
    """Tokenise and POS-tag every text.

    Args:
        texts: iterable of strings (list, tuple, generator, pandas Series...).
            Items are visited in iteration order rather than looked up as
            ``texts[0..n-1]``, so a Series whose index is not 0..n-1 (for
            example after boolean filtering) no longer raises ``KeyError``.

    Returns:
        A list with one entry per text; each entry is whatever
        ``nltk.pos_tag`` returns for the tokens of that text.
    """
    return [nltk.pos_tag(nltk.word_tokenize(text)) for text in texts]

def get_wordnet_pos(pos_tag):
    """Translate a POS tag into the matching WordNet part-of-speech constant.

    Tags starting with ``J`` map to ``wn.ADJ``, ``V`` to ``wn.VERB`` and
    ``R`` to ``wn.ADV``; every other tag, including those starting with
    ``N``, maps to ``wn.NOUN``.
    """
    prefix_to_pos = (
        ('J', wn.ADJ),
        ('V', wn.VERB),
        ('R', wn.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if pos_tag.startswith(prefix):
            return wordnet_pos
    # Nouns and anything unrecognised share the noun fallback.
    return wn.NOUN

def lemmatize_texts(texts):
    """Lemmatise POS-tagged texts.

    Args:
        texts: iterable of texts, each an iterable of items whose element 0
            is the token and element 1 its POS tag (the structure built by
            ``tokenize_lexicon``). Any iterable works; items are visited in
            iteration order instead of by ``0..n-1`` lookups.

    Returns:
        A list of lists holding the lemma of every token, with the tag
        translated through ``get_wordnet_pos`` before lemmatising.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return [
        [
            lemmatizer.lemmatize(tagged[0], pos=get_wordnet_pos(tagged[1]))
            for tagged in sentence
        ]
        for sentence in texts
    ]

def stem_texts(texts):
    """Porter-stem POS-tagged texts, discarding the tags.

    Args:
        texts: iterable of texts, each an iterable of items whose element 0
            is the token (the structure built by ``tokenize_lexicon``). Any
            iterable works; items are visited in iteration order instead of
            by ``0..n-1`` lookups.

    Returns:
        A list of lists holding the stem of every token.
    """
    stemmer = PorterStemmer()
    return [[stemmer.stem(tagged[0]) for tagged in sentence] for sentence in texts]


def backtostring(texts):
    """Join each token list back into one space-separated string.

    Args:
        texts: iterable of token sequences (strings). Any iterable works;
            items are visited in iteration order instead of by ``0..n-1``
            lookups.

    Returns:
        A list with one joined string per input sequence.
    """
    return [" ".join(tokens) for tokens in texts]

In [11]:
def pre_process(data):
    """Normalise a Series of raw phrases before vectorisation.

    Each phrase is lowercased, stripped of URLs and ``@mentions``, stripped of
    ``string.punctuation`` characters and finally of English stop words; the
    surviving words are re-joined with single spaces.

    Changes from the previous version:

    * URLs and mentions are removed *before* punctuation. Previously ``@``,
      ``:``, ``/`` and ``.`` were already gone, so ``@\\w+`` could never match
      and the URL pattern only saw mangled remnants.
    * The URL pattern now requires ``http://``, ``https://`` or ``www.``, so
      ordinary words that merely contain "www" or "http" are kept.
    * Missing phrases become empty strings instead of the literal word "nan".

    Args:
        data: pandas Series of phrases; the input is not modified.

    Returns:
        A new Series with the same index holding the normalised phrases.
    """
    stop_words = set(stopwords.words('english'))
    punctuation_table = str.maketrans("", "", string.punctuation)

    def _normalise(text):
        text = text.lower()
        # Must run while the URL / mention punctuation is still in place.
        text = re.sub(r'\b(?:https?://|www\.)\S+', '', text)
        text = re.sub(r'@\w+', '', text)
        text = text.translate(punctuation_table)
        return ' '.join(word for word in text.split() if word not in stop_words)

    # lemmatisation / stemming are intentionally not applied here; stemming is
    # done later by the TF-IDF tokenizer.
    return data.fillna('').astype(str).apply(_normalise)

# Run every split through the same normalisation.
(
    X_train_preproc,
    X_train_clean_preproc,
    X_val_preproc,
    X_test_preproc,
) = map(pre_process, (X_train, X_train_clean, X_val, X_test))

# TF-IDF

In [13]:
# Module-level Porter stemmer instance, read by tokenizer_porter below.
porter = PorterStemmer()

def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the Porter stem of each piece.

    Uses the module-level ``porter`` stemmer; the result is a list of stems
    in the original word order.
    """
    words = text.split()
    return list(map(porter.stem, words))

In [14]:
# Fit the TF-IDF vocabulary on ALL preprocessed training phrases (labelled and
# unlabelled) using the stemming tokenizer defined above.
# NOTE(review): stop_words='english' is combined with a stemming tokenizer;
# stems that differ from their dictionary form may not match the stop list —
# confirm this is acceptable (pre_process already removes NLTK stop words).
tfidf = TfidfVectorizer(strip_accents='unicode', lowercase=True, tokenizer=tokenizer_porter, stop_words='english')
X_train_preproc_tfidf = tfidf.fit_transform(X_train_preproc)
print(f"\nTF-IDF feature matrix shape: {X_train_preproc_tfidf.shape}")


TF-IDF feature matrix shape: (59706, 10576)


## Only Using Clean Data

In [16]:
# Project the labelled training phrases onto the already-fitted vocabulary
# (transform only, no refit).
X_train_clean_tfidf = tfidf.transform(X_train_clean_preproc)
print(f"\nTF-IDF feature matrix shape: {X_train_clean_tfidf.shape}")


TF-IDF feature matrix shape: (24758, 10576)


In [17]:
# Project the validation phrases onto the already-fitted vocabulary.
X_val_tfidf = tfidf.transform(X_val_preproc)
print(f"\nTF-IDF feature matrix shape: {X_val_tfidf.shape}")


TF-IDF feature matrix shape: (23256, 10576)


In [18]:
# Project the test phrases onto the already-fitted vocabulary.
X_test_tfidf = tfidf.transform(X_test_preproc)
print(f"\nTF-IDF feature matrix shape: {X_test_tfidf.shape}")


TF-IDF feature matrix shape: (23257, 10576)


# Modeling

## Hyperparameter Tuning

In [21]:
# Search spaces for the (currently commented-out) grid search below, keyed by
# a display name. Each entry holds the estimator to tune and its parameter grid.
model_params = {
    'XGBoost': {
        'model': XGBClassifier(n_jobs=-1),
        'params': {
            'n_estimators': [1, 10, 100, 500],
            'max_depth': [3, 4, 5, 6, 10],
        },
    },
    'LightGBM': {
        # `verbose_eval` is not an LGBMClassifier argument, so it is no longer
        # passed; verbose=-1 already silences LightGBM's logging.
        'model': LGBMClassifier(n_jobs=-1, verbose=-1),
        'params': {
            'n_estimators': [1, 5, 10, 100],
            'max_depth': [1, 2, 3, 4, 5, 6],
            # NOTE(review): a tree of depth d has at most 2**d leaves, so with
            # max_depth <= 6 every value below (>= 100) is non-binding and the
            # five candidates fit equivalent models — consider values < 64.
            'num_leaves': [100, 200, 300, 500, 1000],
        },
    },
}

In [22]:
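# scores = []

# for model_name, mp in model_params.items():
#     print(model_name)
#     # scoring is set explicitly so that best_score_ really is a weighted F1
#     # (GridSearchCV otherwise reports the estimator's default score, accuracy)
#     clf =  GridSearchCV(mp['model'], mp['params'], cv=3, scoring='f1_weighted', return_train_score=False)
#     clf.fit(X_train_clean_tfidf, y_train_clean)

#     scores.append({
#         'model': model_name,
#         'f1_score': clf.best_score_,
#         'best_params': clf.best_params_
#     }) 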

In [25]:
# df = pd.DataFrame(scores,columns=['model','f1_score','best_params']).round(4)
# df

## Logistic Regression

In [28]:
# Fit a logistic regression on the labelled TF-IDF features.
lr = LogisticRegression(C=5, solver='saga', max_iter=1000, random_state=42)
lr.fit(X_train_clean_tfidf, y_train_clean)

# Score it on the validation split; accuracy is derived from the predictions
# already computed instead of a second predict pass through lr.score().
y_val_pred_lr = lr.predict(X_val_tfidf)
val_accuracy_lr = accuracy_score(y_val, y_val_pred_lr)
val_f1_lr = f1_score(y_val, y_val_pred_lr, average='weighted')
val_report_lr = classification_report(y_val, y_val_pred_lr)

print("\n--- Logistic Regression ---")
print(f"Validation Accuracy: {val_accuracy_lr:.4f}")
print(f"Validation F1 Score: {val_f1_lr:.4f}")
print("Validation Classification Report:")
print(val_report_lr)


--- Logistic Regression ---
Validation Accuracy: 0.8668
Validation F1 Score: 0.8665
Validation Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.87      0.88      4564
           1       0.81      0.77      0.79      4714
           2       0.99      0.98      0.98      4748
           3       0.79      0.80      0.80      4606
           4       0.85      0.92      0.88      4624

    accuracy                           0.87     23256
   macro avg       0.87      0.87      0.87     23256
weighted avg       0.87      0.87      0.87     23256



## LightGBM


In [30]:
# LightGBM on the labelled TF-IDF features. `verbose_eval` is not an
# LGBMClassifier argument, so it is no longer passed; verbose=-1 already
# silences LightGBM's logging.
# NOTE(review): with max_depth=6 a tree has at most 64 leaves, so
# num_leaves=100 is never the binding limit.
lgbm = LGBMClassifier(max_depth=6, n_estimators=100, num_leaves=100, n_jobs=-1, verbose=-1)
lgbm.fit(X_train_clean_tfidf, y_train_clean)

In [31]:
# Score the fitted LightGBM model on the validation split; accuracy is derived
# from the predictions already computed instead of a second predict pass.
y_val_pred_lgb_labels = lgbm.predict(X_val_tfidf)
val_accuracy_lgb = accuracy_score(y_val, y_val_pred_lgb_labels)
val_f1_lgb = f1_score(y_val, y_val_pred_lgb_labels, average='weighted')
val_report_lgb = classification_report(y_val, y_val_pred_lgb_labels)

print("\n--- LightGBM ---")
print(f"Validation Accuracy: {val_accuracy_lgb:.4f}")
print(f"Validation F1 Score: {val_f1_lgb:.4f}")
print("Validation Classification Report:")
print(val_report_lgb)


--- LightGBM ---
Validation Accuracy: 0.8195
Validation F1 Score: 0.8201
Validation Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.79      0.82      4564
           1       0.80      0.70      0.75      4714
           2       0.98      0.93      0.96      4748
           3       0.78      0.79      0.79      4606
           4       0.71      0.89      0.79      4624

    accuracy                           0.82     23256
   macro avg       0.83      0.82      0.82     23256
weighted avg       0.83      0.82      0.82     23256



## XGBoost


In [33]:
# XGBoost on the labelled TF-IDF features only; the validation split is held
# out for the evaluation in the next cell.
xgbm = XGBClassifier(max_depth=10, n_estimators=500, n_jobs=-1)
xgbm.fit(X_train_clean_tfidf, y_train_clean)

In [34]:
# Score the fitted XGBoost model on the validation split; accuracy is derived
# from the predictions already computed instead of a second predict pass.
y_val_pred_xgb_labels = xgbm.predict(X_val_tfidf)
val_accuracy_xgb = accuracy_score(y_val, y_val_pred_xgb_labels)
val_f1_xgb = f1_score(y_val, y_val_pred_xgb_labels, average='weighted')
val_report_xgb = classification_report(y_val, y_val_pred_xgb_labels)

print("\n--- XGBoost ---")
print(f"Validation Accuracy: {val_accuracy_xgb:.4f}")
print(f"Validation F1 Score: {val_f1_xgb:.4f}")
print("Validation Classification Report:")
print(val_report_xgb)


--- XGBoost ---
Validation Accuracy: 0.8709
Validation F1 Score: 0.8707
Validation Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.87      0.88      4564
           1       0.83      0.78      0.80      4714
           2       0.98      0.97      0.97      4748
           3       0.82      0.83      0.82      4606
           4       0.84      0.91      0.87      4624

    accuracy                           0.87     23256
   macro avg       0.87      0.87      0.87     23256
weighted avg       0.87      0.87      0.87     23256



## Prediction

In [54]:
from scipy.sparse import hstack, vstack

In [56]:
# Final model: stack the labelled training rows and the validation rows so
# the submission model is trained on every labelled example.
X = vstack([X_train_clean_tfidf, X_val_tfidf])
y = pd.concat([y_train_clean, y_val])

# NOTE(review): this rebinds `xgbm`, replacing the model that was validated
# above. Re-running the validation cell after this one would score a model
# that has already seen the validation rows.
xgbm = XGBClassifier(max_depth=10, n_estimators=500, n_jobs=-1)
xgbm.fit(X, y)

In [57]:
# Quick look at the test frame's (rows, columns) before predicting.
test_data.shape

(23257, 2)

In [58]:
# Predict a label for every test phrase with the refitted model.
test_labels = xgbm.predict(X_test_tfidf)

# Attach the predictions to the test frame (adds/overwrites the 'Sentiment'
# column of `test_data` in place).
test_data['Sentiment'] = test_labels

In [59]:
# Write the submission file: one row per test phrase with its id and the
# predicted label, without the DataFrame index.
test_data[['PhraseID', 'Sentiment']].to_csv('submission_simple.csv', index=False)

In [None]:
# test_data[test_data['Phrase'].isnull()][['PhraseID', 'Sentiment']].head()