In [146]:
import os
import pandas as pd
import numpy as np
from collections import defaultdict, Counter
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm

import string
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer, PorterStemmer
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, f1_score, silhouette_score
from sklearn.preprocessing import LabelEncoder

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [25]:
# For handling warnings
# NOTE(review): filterwarnings('ignore') is called without a category or module
# filter, so every warning raised by later cells (including ones emitted by
# pandas / scikit-learn / LightGBM) is hidden for the rest of the session.
import warnings
warnings.filterwarnings('ignore')

In [27]:
# Fetch the NLTK resources used by the helpers below.
nltk.download('averaged_perceptron_tagger')  # presumably backs nltk.pos_tag (tokenize_lexicon) — confirm
nltk.download('punkt')  # presumably backs nltk.word_tokenize (tokenize_lexicon) — confirm
nltk.download('wordnet')  # corpus behind `wn` and WordNetLemmatizer
nltk.download('stopwords')  # corpus behind stopwords.words('english')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\utgoy\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\utgoy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\utgoy\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\utgoy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Load the data

In [30]:
# Read the train / validation / test splits from the local ./data directory.
# Rows with missing values are kept: the trailing .dropna() calls are commented out.
train_data = pd.read_csv('./data/train.csv')#.dropna()
val_data = pd.read_csv('./data/val.csv')#.dropna()
test_data = pd.read_csv('./data/test.csv')#.dropna()
# Source of the 'Sentiment_Assigned' column used as the training labels
# (presumably one row per row of train.csv, in the same order — confirm).
df_gmm = pd.read_csv('./data/best_clusters.csv')

In [111]:
# get all train data (labelled and unlabelled)
X_train = train_data['Phrase']
y_train = df_gmm['Sentiment_Assigned']

# Phrases come from train.csv and labels from best_clusters.csv; they are paired
# purely by row, so fail fast if the two files do not line up.
assert len(X_train) == len(y_train), (
    f"train.csv has {len(X_train)} rows but best_clusters.csv has {len(y_train)}"
)

# get only labelled train data (-100 is the sentinel for "no label assigned")
mask = (y_train != -100)
train_data_clean = train_data[mask]
X_train_clean = X_train[mask]
y_train_clean = y_train[mask]

# get val data
X_val = val_data['Phrase']
y_val = val_data['Sentiment']

# get test data
X_test = test_data['Phrase']

# Share of each sentiment class among ALL training rows (labelled or not).
print(" Final Count Of Labels ")
for label in range(5):
    share = ((y_train == label).sum() / (X_train.shape[0])) * 100
    print(f"Number of labels = {label} in train dataset as percentage: {share:0.2f}%")

 Final Count Of Labels 
Number of labels = 0 in train dataset as percentage: 17.96%
Number of labels = 1 in train dataset as percentage: 18.65%
Number of labels = 2 in train dataset as percentage: 13.01%
Number of labels = 3 in train dataset as percentage: 19.72%
Number of labels = 4 in train dataset as percentage: 30.65%


# Vectorization

## Define Preprocessing Helper Functions

In [38]:
def clean(text):
    """Normalise a raw phrase into lower-case, ASCII-only, space-separated words.

    Steps, in order: drop lines that start with an http(s) URL, decode a few
    HTML entities, strip HTML tags, replace every non-word character with a
    space, drop single-letter tokens, drop digits, lower-case, collapse
    whitespace and finally discard non-ASCII characters.

    Args:
        text: Any object; it is converted with ``str()`` before cleaning.

    Returns:
        The cleaned string. Leading/trailing single spaces left behind by the
        substitutions are not stripped.
    """
    # Remove any line that begins with a URL (MULTILINE makes ^ match per line).
    texter = re.sub(r'^https?:\/\/.*[\r\n]*', '', str(text), flags=re.MULTILINE)

    # HTML line breaks / entities and light chat-speak normalisation.
    texter = re.sub(r"<br />", " ", texter)
    texter = re.sub(r"&quot;", "\"", texter)
    texter = re.sub('&#39;', "\"", texter)
    texter = re.sub('\n', " ", texter)
    texter = re.sub(' u ', " you ", texter)
    texter = re.sub('`', "", texter)
    texter = re.sub(' +', ' ', texter)
    texter = re.sub(r"(!)\1+", r"!", texter)
    texter = re.sub(r"(\?)\1+", r"?", texter)
    texter = re.sub('&amp;', 'and', texter)
    texter = re.sub('\r', ' ', texter)

    # Strip remaining HTML tags. This has to happen BEFORE the \W pass below:
    # once '<' and '>' have been turned into spaces the tag pattern can never
    # match and tag names would leak into the text as ordinary words.
    texter = re.sub(r'<.*?>', '', texter)

    # remove all the special characters
    texter = re.sub(r'\W', ' ', texter)
    # remove all single characters surrounded by whitespace
    texter = re.sub(r'\s+[a-zA-Z]\s+', ' ', texter)
    # Remove a single character at the start of the text. The anchor must be an
    # unescaped '^'; an escaped one only matches a literal caret character.
    texter = re.sub(r'^[a-zA-Z]\s+', ' ', texter)
    # Remove numbers
    texter = re.sub(r'\d+', ' ', texter)
    # Converting to Lowercase
    texter = texter.lower()
    # Remove any punctuation that survived (or was produced by) lower-casing.
    # After this line only word characters and whitespace remain, so no
    # separate parentheses / quote handling is needed.
    texter = re.sub(r'[^\w\s]', ' ', texter)
    # Substituting multiple spaces with single space
    texter = re.sub(r'\s+', ' ', texter)

    # Drop every character that cannot be represented in ASCII.
    texter = texter.encode('ascii', 'ignore').decode('ascii')
    return texter

def clean_dataset(dataset):
    """Run ``clean`` over column 0 of every row of ``dataset``, in place.

    ``dataset`` must expose ``shape[0]`` and accept ``[row, 0]`` indexing for
    both reads and writes (presumably a 2-D NumPy array — confirm with callers).
    The same, now modified, object is returned.
    """
    n_rows = dataset.shape[0]
    for row_idx in range(n_rows):
        raw_value = dataset[row_idx, 0]
        dataset[row_idx, 0] = clean(raw_value)
    return dataset

def tokenize_lexicon(texts):
    """Tokenize and part-of-speech tag every text.

    Items are read as ``texts[0]`` .. ``texts[len(texts) - 1]``.

    Returns:
        A list with one entry per text: the result of ``nltk.pos_tag`` applied
        to ``nltk.word_tokenize(text)``.
    """
    return [
        nltk.pos_tag(nltk.word_tokenize(texts[idx]))
        for idx in range(len(texts))
    ]

def get_wordnet_pos(pos_tag):
    """Translate a POS tag into the matching WordNet part-of-speech constant.

    The decision is made on the tag's leading letter: 'J' -> adjective,
    'V' -> verb, 'N' -> noun, 'R' -> adverb. Any other tag falls back to noun.
    """
    prefix_table = (
        ('J', wn.ADJ),
        ('V', wn.VERB),
        ('N', wn.NOUN),
        ('R', wn.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if pos_tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns.
    return wn.NOUN

def lemmatize_texts(texts):
    """Lemmatize every tagged token of every text.

    Each ``texts[i][j]`` is indexed as ``(word, pos_tag)``: element 0 is the
    word and element 1 the tag that ``get_wordnet_pos`` converts for the
    lemmatizer.

    Returns:
        A list of lists of lemmas, parallel to ``texts``.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    lemmatized = []
    for text_idx in range(len(texts)):
        tagged_tokens = texts[text_idx]
        lemmas = [
            lemmatizer.lemmatize(
                tagged_tokens[tok_idx][0],
                pos=get_wordnet_pos(tagged_tokens[tok_idx][1]),
            )
            for tok_idx in range(len(tagged_tokens))
        ]
        lemmatized.append(lemmas)
    return lemmatized

def stem_texts(texts):
    """Porter-stem the first element of every token entry in every text.

    Each ``texts[i][j]`` is indexed with ``[0]`` to obtain the word, so entries
    are expected to be sequences whose first element is the token (for example
    ``(word, pos_tag)`` pairs).

    Returns:
        A list of lists of stems, parallel to ``texts``.
    """
    stemmer = PorterStemmer()
    stemmed = []
    for text_idx in range(len(texts)):
        entries = texts[text_idx]
        stemmed.append(
            [stemmer.stem(entries[tok_idx][0]) for tok_idx in range(len(entries))]
        )
    return stemmed


def backtostring(texts):
    """Join each token list back into a single space-separated string.

    Items are read as ``texts[0]`` .. ``texts[len(texts) - 1]``; the result is
    a new list of strings in the same order.
    """
    return [" ".join(texts[idx]) for idx in range(len(texts))]

In [40]:
def pre_process(data):
    """Normalise a Series of phrases for vectorisation.

    Per phrase: lower-case, remove URLs and @mentions, strip ASCII
    punctuation, drop English stop words and Porter-stem what is left.

    URL / mention removal runs BEFORE punctuation stripping. Once '@', ':',
    '/' and '.' have been deleted an ``@\\w+`` pattern can no longer match and
    a URL is reduced to one run-together token, so doing it afterwards (as the
    previous version did) silently left mentions in the text.

    Missing values are treated as empty phrases instead of being turned into
    the literal token "nan".

    Args:
        data: pandas Series of phrases.

    Returns:
        A new Series with the same index holding the space-joined stems.
    """
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    strip_punctuation = str.maketrans("", "", string.punctuation)

    def _normalise(text):
        text = text.lower()
        # Must precede punctuation stripping — see docstring.
        text = re.sub(r'http\S+|www\S+', '', text)
        text = re.sub(r'@\w+', '', text)
        text = text.translate(strip_punctuation)
        return ' '.join(
            stemmer.stem(word) for word in text.split() if word not in stop_words
        )

    # fillna/astype return new objects, so the caller's Series is never modified.
    return data.fillna('').astype(str).apply(_normalise)

# get the preprocessed data
# X_train holds every training phrase; X_train_clean only the rows whose
# assigned label is not the -100 sentinel.
# NOTE(review): these calls expect X_train / X_val / X_test to still be the raw
# 'Phrase' Series; a later cell rebinds the same names to TF-IDF matrices.
X_train_preproc   = pre_process(X_train)
X_train_clean_preproc   = pre_process(X_train_clean)
X_val_preproc = pre_process(X_val)
X_test_preproc = pre_process(X_test)

## TF-IDF

In [42]:
# Shared stemmer instance used by tokenizer_porter.
porter = PorterStemmer()


def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the Porter stem of each piece."""
    stems = []
    for token in text.split():
        stems.append(porter.stem(token))
    return stems

In [54]:
# Learn the TF-IDF vocabulary from the preprocessed training phrases only.
# NOTE(review): pre_process has already lower-cased, stop-word filtered and
# Porter-stemmed this text, and tokenizer_porter / stop_words='english' repeat
# stemming and stop-word filtering here — confirm the double processing is intended.
tfidf = TfidfVectorizer(strip_accents='unicode', lowercase=True, tokenizer=tokenizer_porter, stop_words='english')
X_train_preproc_tfidf = tfidf.fit_transform(X_train_preproc)
print(f"\nTF-IDF feature matrix shape: {X_train_preproc_tfidf.shape}")


TF-IDF feature matrix shape: (59706, 10456)


In [55]:
# Project the validation phrases onto the vocabulary fitted on the training
# phrases (transform only, so the vectorizer is not refitted here).
X_val_preproc_tfidf = tfidf.transform(X_val_preproc)
print(f"\nTF-IDF feature matrix shape: {X_val_preproc_tfidf.shape}")


TF-IDF feature matrix shape: (23256, 10456)


In [58]:
# Project the test phrases onto the same fitted vocabulary (transform only).
X_test_preproc_tfidf = tfidf.transform(X_test_preproc)
print(f"\nTF-IDF feature matrix shape: {X_test_preproc_tfidf.shape}")


TF-IDF feature matrix shape: (23257, 10456)


# Predictions

In [118]:
from scipy.sparse import hstack, vstack

In [120]:
# Switch the working names over to the TF-IDF matrices used for modelling.
# NOTE: X_train / X_val / X_test stop being raw phrase Series after this cell,
# so the preprocessing cell cannot be re-run without first re-running the cell
# that defines the raw Series.
y_train = df_gmm['Sentiment_Assigned']

# Keep only rows that actually carry a label: -100 is the "unlabelled" sentinel
# and must not be learned as if it were a sentiment class.
labelled_rows = np.flatnonzero((y_train != -100).to_numpy())
X_train = X_train_preproc_tfidf[labelled_rows]
y_train = y_train.iloc[labelled_rows]

X_val = X_val_preproc_tfidf
X_test = X_test_preproc_tfidf

# The submission models are fit on train + validation together; X and y are
# stacked in the same order so their rows stay aligned.
X = vstack([X_train, X_val])
y = pd.concat([y_train, y_val])

## Logistic Regression

In [123]:
# L2-regularised logistic regression on the combined train + validation matrix.
# n_jobs was dropped: scikit-learn ignores it for the 'liblinear' solver (it only
# triggers a warning), so the fitted model is unchanged.
lr = LogisticRegression(C=1.578, penalty='l2', solver='liblinear')
lr.fit(X, y)

In [124]:
# Predict a sentiment label for every test phrase with the fitted logistic regression.
test_labels = lr.predict(X_test)

# Attach the predictions to the test frame (replaces any earlier 'Sentiment' column).
test_data['Sentiment'] = test_labels

# to_csv does not create missing directories, so make sure ./results exists.
os.makedirs('./results', exist_ok=True)
test_data[['PhraseID', 'Sentiment']].to_csv('./results/submission_lr.csv', index=False)

# Scratch Pad

## Multinomial Naive Bayes (MNB)

In [148]:
# Multinomial Naive Bayes fit on the combined train + validation TF-IDF matrix.
# alpha=0.133 presumably comes from an earlier tuning run — confirm.
mnb = MultinomialNB(alpha=0.133)
mnb.fit(X, y)

In [150]:
# Predict a sentiment label for every test phrase with the fitted Naive Bayes model.
test_labels = mnb.predict(X_test)

# Attach the predictions to the test frame (replaces any earlier 'Sentiment' column).
test_data['Sentiment'] = test_labels

# to_csv does not create missing directories, so make sure ./results exists.
os.makedirs('./results', exist_ok=True)
test_data[['PhraseID', 'Sentiment']].to_csv('./results/submission_mnb.csv', index=False)

## Random Forest

In [165]:
# Random forest fit on the combined train + validation TF-IDF matrix.
# random_state is fixed at 42; the remaining hyper-parameters presumably come
# from an earlier tuning run — confirm.
rf = RandomForestClassifier(min_samples_leaf=5, min_samples_split=5, n_estimators=500, n_jobs=-1, random_state=42)
rf.fit(X, y)

In [166]:
# Predict a sentiment label for every test phrase with the fitted random forest.
test_labels = rf.predict(X_test)

# Attach the predictions to the test frame (replaces any earlier 'Sentiment' column).
test_data['Sentiment'] = test_labels

# to_csv does not create missing directories, so make sure ./results exists.
os.makedirs('./results', exist_ok=True)
test_data[['PhraseID', 'Sentiment']].to_csv('./results/submission_rf.csv', index=False)

## LightGBM

In [152]:
# LightGBM hyper-parameters (presumably from an earlier tuning run — confirm).
# 'verbose_eval' was removed: it is not a model parameter (it used to be an
# argument of lightgbm.train), so passing it to the estimator had no effect.
# NOTE(review): per the LightGBM docs, row subsampling ('subsample') only takes
# effect when 'subsample_freq' is set above 0 — confirm whether that was intended.
model_dict = {'colsample_bytree': 0.839, 'learning_rate': 0.058, 'max_depth': 20, 'min_child_samples': 20,
              'n_estimators': 500, 'num_leaves': 50, 'subsample': 0.985,
              'n_jobs': -1, 'verbose': -1}

lgbm = LGBMClassifier(**model_dict)
lgbm.fit(X, y)

In [153]:
# Predict a sentiment label for every test phrase with the fitted LightGBM model.
test_labels = lgbm.predict(X_test)

# Attach the predictions to the test frame (replaces any earlier 'Sentiment' column).
test_data['Sentiment'] = test_labels

# to_csv does not create missing directories, so make sure ./results exists.
os.makedirs('./results', exist_ok=True)
test_data[['PhraseID', 'Sentiment']].to_csv('./results/submission_lgbm.csv', index=False)