# Project 4 Notebook: NLP Classification

In [None]:
#Imports

import pandas as pd
import numpy as np
#for text pre-processing
import re, string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
#for model-building
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score
# bag of words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

## Data Cleaning

In [None]:
#Init Dataframe

df = pd.read_csv('./data/tweets.csv', encoding='unicode_escape')

In [None]:
#Filling n/a in 'directed at' column with 'Unknown' to be able to use most of data

# Assign the filled column back rather than calling fillna(inplace=True) on the
# column selection: the in-place form operates on an intermediate object, is
# deprecated in recent pandas, and leaves df unchanged under Copy-on-Write.
df['emotion_in_tweet_is_directed_at'] = df['emotion_in_tweet_is_directed_at'].fillna('Unknown')

In [None]:
#Getting rid of n/a in text column

# Keep only the rows that actually have tweet text.
has_tweet_text = df['tweet_text'].notna()
df = df[has_tweet_text]

In [None]:
#Getting rid of "I can't tell" results because they are an unnecessary classification

# Find the rows labelled "I can't tell" and drop them by index label.
cant_tell_mask = df['is_there_an_emotion_directed_at_a_brand_or_product'] == "I can't tell"
df = df.drop(index=df.index[cant_tell_mask])

## Pre-processing

In [None]:
#Functions base pulled and edited from another text classification project

#Convert to lowercase, strip and remove punctuation
# Patterns are compiled once at import time instead of on every call.
_HTML_TAG_RE = re.compile(r'<.*?>')
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))
_NON_WORD_RE = re.compile(r'[^\w\s]')
_DIGIT_RE = re.compile(r'\d')
_WHITESPACE_RE = re.compile(r'\s+')


def preprocess(text):
    """Normalise raw text into lowercase, space-separated word tokens.

    Steps, in order:
      1. Coerce to ``str``, lowercase and strip surrounding whitespace.
      2. Remove ``<...>`` spans (HTML-like tags).
      3. Replace every ASCII punctuation character with a space.
      4. Delete any remaining character that is neither a word character
         nor whitespace (e.g. non-ASCII punctuation).
      5. Replace digits with spaces.
      6. Collapse whitespace runs to a single space and strip the ends.

    Args:
        text: Value to clean. Non-string values are converted with ``str``.

    Returns:
        The cleaned string, with no leading or trailing whitespace.
    """
    text = str(text).lower().strip()
    text = _HTML_TAG_RE.sub('', text)
    text = _PUNCTUATION_RE.sub(' ', text)
    text = _NON_WORD_RE.sub('', text)
    text = _DIGIT_RE.sub(' ', text)
    # Digit removal can leave whitespace at the ends, so strip after collapsing.
    return _WHITESPACE_RE.sub(' ', text).strip()

 
#Removes Stopwords
# Lazily-built cache of English stop words (None until first use).
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        try:
            words = stopwords.words('english')
        except LookupError:
            # The corpus is not installed locally; fetch it and retry.
            nltk.download('stopwords')
            words = stopwords.words('english')
        _ENGLISH_STOPWORDS = frozenset(words)
    return _ENGLISH_STOPWORDS


def stopword(string):
    """Remove English stop words from a whitespace-separated string.

    Args:
        string: Text whose tokens are separated by whitespace.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Look the stop-word set up once per call instead of rebuilding the
    # full word list for every token.
    stop_set = _english_stopwords()
    return ' '.join(token for token in string.split() if token not in stop_set)


#Lemmatization
wl = WordNetLemmatizer()
 
#Map NLTK part-of-speech tags to WordNet POS constants
def get_wordnet_pos(tag):
    """Translate a POS tag into the matching WordNet POS constant.

    The tag is matched on its prefix: 'J' -> adjective, 'V' -> verb,
    'N' -> noun, 'R' -> adverb. Any other tag is treated as a noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    # Unrecognised tags default to noun.
    return wordnet.NOUN
#Tokenize the sentence
def lemmatizer(string):
    """Lemmatize every token of `string` using its POS tag.

    The text is tokenized and POS-tagged, each tag is mapped to a WordNet
    POS constant, and the lemmas are re-joined with single spaces.
    """
    tagged_tokens = nltk.pos_tag(word_tokenize(string))
    lemmas = []
    for token, pos_tag in tagged_tokens:
        lemmas.append(wl.lemmatize(token, get_wordnet_pos(pos_tag)))
    return " ".join(lemmas)

In [None]:
#Final Pre-Processing function

def finalpreprocess(string):
    """Run the full cleaning pipeline: normalise, drop stop words, lemmatize."""
    normalized = preprocess(string)
    without_stopwords = stopword(normalized)
    return lemmatizer(without_stopwords)
# Clean every tweet and keep the result in a new column.
df['clean_tweet_text'] = df['tweet_text'].apply(finalpreprocess)
df.head()

## Vectorization

In [None]:
#Train-test split
# Features are the cleaned tweet text; the target is the emotion label.
X = df["clean_tweet_text"]
y = df["is_there_an_emotion_directed_at_a_brand_or_product"]

# 70/30 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    random_state=42,
)

In [None]:
#TFIDF Vector

tfidf_vectorizer = TfidfVectorizer(use_idf=True)
# Fit the vectorizer on the training text only ...
X_train_vec = tfidf_vectorizer.fit_transform(X_train) 
# ... and reuse that fitted vectorizer to transform the test text.
X_test_vec = tfidf_vectorizer.transform(X_test)

## ML Models

#### Evaluation Class

In [None]:
#Model Evaluation Class to streamline process -- a couple of parts taken from lecture 41/42 + Tristan's Project 3 Code

class ModelEval:
    '''Hold an estimator, report its train/hold-out performance and cross-validate it.

    The data passed in is split into a training part and a hold-out part.
    A fresh clone of the estimator is fitted on the training part and
    classification reports are printed for both parts. All evaluation uses
    the data given to the constructor, never module-level variables.

    Attributes:
      model: The estimator exactly as passed in (it is not refitted).
      name: Display name used in printed summaries.
      X_train_vec, X_test_vec, y_train, y_test: The internal split of X and y.
      cv_results, cv_mean, cv_std: Filled in by `cross_validate`; None before.
    '''

    def __init__(self, model, model_name, X, y):
        '''
        Split the data, fit a clone of the model and print both reports.

        Args:
          model:
            A scikit-learn classifier. Its hyper-parameters are what is
            evaluated; any existing fit on this object is left untouched.
          model_name:
            Name used when printing results.
          X:
            Feature matrix to split into training and hold-out parts.
          y:
            Labels aligned with X.
        '''
        # Imported here so the class does not rely on a top-of-file import.
        from sklearn.base import clone

        self.model = model
        self.name = model_name
        # random_state is fixed so every ModelEval sees the same partition.
        self.X_train_vec, self.X_test_vec, self.y_train, self.y_test = \
            train_test_split(X, y, random_state=42)

        # Attributes for cross validation
        self.cv_results = None
        self.cv_mean = None
        self.cv_std = None

        # Fit a clone on the training part only: the estimator passed in may
        # already have seen all of X, which would inflate the hold-out report.
        fitted = clone(model).fit(self.X_train_vec, self.y_train)

        print('Training Report')
        print(classification_report(self.y_train, fitted.predict(self.X_train_vec)))

        print('Testing Report')
        print(classification_report(self.y_test, fitted.predict(self.X_test_vec)))

    def cross_validate(self, kfolds=10):
        '''
        Cross-validate the model on the training part and print a summary.

        The per-fold scores, their mean and their standard deviation are
        stored on `cv_results`, `cv_mean` and `cv_std`.

        Args:
          kfolds:
            Optional; Number of folds for CV (default is 10)
        '''
        self.cv_results = cross_val_score(
            self.model, self.X_train_vec, self.y_train, cv=kfolds
        )
        self.cv_mean = np.mean(self.cv_results)
        self.cv_std = np.std(self.cv_results)

        cv_summary = (
        f'''CV Results for `{self.name}` model:
            {self.cv_mean:.5f} ± {self.cv_std:.5f} accuracy
        ''')
        print(cv_summary)
    

#### Dummy

In [None]:
#Starting with a Dummy model to create baseline

# Fit the baseline on the same TF-IDF matrix that every evaluation predicts
# on, instead of on the raw text Series.
dummy_model = DummyClassifier(strategy='most_frequent').fit(X_train_vec, y_train)

In [None]:
#Evaluation

dummy_eval = ModelEval(
    model = dummy_model,
    model_name = 'Dummy',
    X = X_train_vec,
    y = y_train
)

In [None]:
dummy_eval.cross_validate()

#### Logistic Regression

In [None]:
#Next is a LogReg model (basic) as natural first solution to this type of problem

#Model
        
logreg_tf = LogisticRegression(solver = 'liblinear', C=10, penalty = 'l2')
logreg_tf.fit(X_train_vec, y_train) 

In [None]:
#Evaluation

logreg_eval = ModelEval(
    model = logreg_tf,
    model_name = 'LogReg',
    X = X_train_vec,
    y = y_train
)

In [None]:
logreg_eval.cross_validate()

#### Naive Bayes

In [None]:
#Attempt at a Naive Bayes Classifier not giving fruitful results off the bat so was scrapped

#Model

naive_bay = MultinomialNB()

naive_bay.fit(X_train_vec, y_train)

In [None]:
#Evaluation

NB_eval = ModelEval(
    model = naive_bay,
    model_name = 'NaiveBayes',
    X = X_train_vec,
    y = y_train
)

In [None]:
NB_eval.cross_validate()

#### Random Forests

In [None]:
#Attempt at a Random Forests Classifier not giving fruitful results off the bat, plus taking too long to find useful parameters,  so was scrapped

#Model
    
rf = RandomForestClassifier(
    n_estimators=1000,
    min_samples_split=25,
    min_samples_leaf=4,
    max_features=4200,
    max_depth=20,
    n_jobs=-1,
    # Seeded like the train/test split so the forest's scores are reproducible.
    random_state=42,
)
rf.fit(X_train_vec, y_train)

In [None]:
#param=[{'n_estimators' : [1, 10, 100, 1000],
    #'min_samples_split':[1,10,50],
    #'min_samples_leaf':[1, 10, 50], 
    #'max_features' : [1, 10, 100, 1000, 10000],
       #'max_depth' : [1, 10, 100]}]

In [None]:
#gs_rf = GridSearchCV(RandomForestClassifier(), param_grid=param, cv=5, verbose=1, n_jobs = -1)

In [None]:
#gs_rf.fit(X_train_vec, y_train)

In [None]:
#gs_rf.best_estimator_

In [None]:
#Evaluation

rf_eval = ModelEval(
    model = rf,
    model_name = 'Random Forests',
    X = X_train_vec,
    y = y_train
)

In [None]:
rf_eval.cross_validate()

## Improving Logistic Regression Model

#### LogReg Changed Weights

In [None]:
#Going with tuning a Logistic Regression model for final model

#Model
        
logreg_tf_cweights = LogisticRegression(
    solver='liblinear',
    # 'Negative emotion' carries the largest weight -- presumably to offset
    # class imbalance; confirm against the label distribution.
    class_weight={
        'Negative emotion': .75,
        'No emotion toward brand or product': .075,
        'Positive emotion': .175,
    },
    C=10,
    penalty='l2',
)
# Fit on the training split only. A second fit on the test split would throw
# away this fit and train the model on the data meant for evaluation.
logreg_tf_cweights.fit(X_train_vec, y_train)

In [None]:
#Evaluation

logreg_cweights_eval = ModelEval(
    model = logreg_tf_cweights,
    model_name = 'LogReg Change Weights',
    X = X_train_vec,
    y = y_train
)

In [None]:
logreg_cweights_eval.cross_validate()

#### LogReg CW + GridSearch

In [None]:
#Decided a GridSearch was necessary after trying to tune model by hand, best parameters chosen resulted in final model

#Model
        
logreg_gs = LogisticRegression(
    solver='liblinear',
    max_iter=10,
    tol=0.001,
    class_weight={
        'Negative emotion': .75,
        'No emotion toward brand or product': .075,
        'Positive emotion': .175,
    },
    C=1,
    penalty='l2',
)
# Fit on the training split only. A second fit on the test split would throw
# away this fit and train the model on the data meant for evaluation.
logreg_gs.fit(X_train_vec, y_train)

In [None]:
para=[{'solver' : ['liblinear','lbfgs'],
    'max_iter':[1,10,100],
    'C':[0.0001, 0.001, 0.01, 0.1, 1], 
    'tol' : [0.0001, 0.001, 0.01, 0.1]}]

In [None]:
gs = GridSearchCV(logreg_tf_cweights, param_grid=para, cv=5, verbose=1)

In [None]:
gs.fit(X_train_vec, y_train)

In [None]:
gs.best_estimator_

In [None]:
#Evaluation

logreg_gs_eval = ModelEval(
    model = logreg_gs,
    model_name = 'LogReg Change To GridSearch Params',
    X = X_train_vec,
    y = y_train
)

In [None]:
logreg_gs_eval.cross_validate()