# Project 4 Notebook: NLP Classification

In [261]:
import pandas as pd
import numpy as np
#for text pre-processing
import re, string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
#for model-building
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score
# bag of words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\happy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\happy\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\happy\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Data Cleaning

In [262]:
df = pd.read_csv('./data/tweets.csv', encoding='unicode_escape')

In [263]:
df['emotion_in_tweet_is_directed_at'].fillna('Unknown', inplace=True)

In [264]:
df = df[df['tweet_text'].notna()]

In [265]:
df = df.drop(df[df['is_there_an_emotion_directed_at_a_brand_or_product'] == "I can't tell"].index)

## Pre-processing

In [266]:
#Functions base pulled and edited from another text classification project

#Convert to lowercase, strip and remove punctuation
def preprocess(text):
    text = str(text).lower() 
    text=text.strip()  
    text=re.compile('<.*?>').sub('', text) 
    text = re.compile('[%s]' % re.escape(string.punctuation)).sub(' ', text)  
    text = re.sub('\s+', ' ', text)  
    text = re.sub(r'\[[0-9]*\]',' ',text) 
    text=re.sub(r'[^\w\s]', '', str(text).lower().strip())
    text = re.sub(r'\d',' ',text) 
    text = re.sub(r'\s+',' ',text) 
    return text

 
#Removes Stopwords
def stopword(string):
    a= [i for i in string.split() if i not in stopwords.words('english')]
    return ' '.join(a)


#Lemmatization
wl = WordNetLemmatizer()
 
#Map NTLK position tags
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
#Tokenize the sentence
def lemmatizer(string):
    word_pos_tags = nltk.pos_tag(word_tokenize(string)) # Get position tags
    a=[wl.lemmatize(tag[0], get_wordnet_pos(tag[1])) for idx, tag in enumerate(word_pos_tags)] # Map the position tag and lemmatize the word/token
    return " ".join(a)

In [267]:
#Final Pre-Processing function

def finalpreprocess(string):
    return lemmatizer(stopword(preprocess(string)))
df['clean_tweet_text'] = df['tweet_text'].apply(lambda x: finalpreprocess(x))
df.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,clean_tweet_text
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,wesley g iphone hr tweet rise austin dead need...
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,jessedee know fludapp awesome ipad iphone app ...
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,swonderlin wait ipad also sale sxsw
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,sxsw hope year festival crashy year iphone app...
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,sxtxstate great stuff fri sxsw marissa mayer g...


## Vectorization

In [268]:
#Train-test split
X = df["clean_tweet_text"]
y = df["is_there_an_emotion_directed_at_a_brand_or_product"]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state = 42)

In [269]:
#TFIDF Vector

tfidf_vectorizer = TfidfVectorizer(use_idf=True)
X_train_vec = tfidf_vectorizer.fit_transform(X_train) 
X_test_vec = tfidf_vectorizer.transform(X_test)

## ML Models

#### Evaluation Class

In [270]:
class ModelEval():
    '''Structure to save the model and more easily see its crossvalidation'''
    
    def __init__(self, model, model_name, X, y):
        self.model = model
        self.name = model_name
        self.X_train_vec, self.X_test_vec, self.y_train, self.y_test = \
        train_test_split(X, y, random_state=42)
        
        y_predict_train = model.predict(X_train_vec)
        y_prob_train = model.predict_proba(X_train_vec)[:,1]
        
        y_predict_test = model.predict(X_test_vec)
        y_prob_test = model.predict_proba(X_test_vec)[:,1]
        
        # Attributes for cross validation
        self.cv_results = None
        self.cv_mean = None
        self.cv_std = None
        
        
        print('Training Report')
        
        print(classification_report(y_train,y_predict_train))
        
        print('Testing Report')
        
        print(classification_report(y_test,y_predict_test))
        
    def cross_validate(self, kfolds=10):
        '''
        Perform cross-validation and return results.
        
        Args: 
          X:
            Optional; Training data to perform CV on. Otherwise use X from object
          y:
            Optional; Training data to perform CV on. Otherwise use y from object
          kfolds:
            Optional; Number of folds for CV (default is 10)  
        '''
        
        cv_X = self.X_train_vec
        cv_y = self.y_train
        self.cv_results = cross_val_score(self.model, cv_X, cv_y, cv=kfolds)
        self.cv_mean = np.mean(self.cv_results)
        self.cv_std = np.std(self.cv_results)
        
        cv_summary = (
        f'''CV Results for `{self.name}` model:
            {self.cv_mean:.5f} ± {self.cv_std:.5f} accuracy
        ''')
        print(cv_summary)
    

#### Dummy

In [271]:
dummy_model = DummyClassifier(strategy='most_frequent').fit(X_train, y_train)

In [279]:
dummy_eval = ModelEval(
    model = dummy_model,
    model_name = 'Dummy',
    X = X_train_vec,
    y = y_train
)

Training Report
                                    precision    recall  f1-score   support

                  Negative emotion       0.00      0.00      0.00       381
No emotion toward brand or product       0.60      1.00      0.75      3776
                  Positive emotion       0.00      0.00      0.00      2098

                          accuracy                           0.60      6255
                         macro avg       0.20      0.33      0.25      6255
                      weighted avg       0.36      0.60      0.45      6255

Testing Report
                                    precision    recall  f1-score   support

                  Negative emotion       0.00      0.00      0.00       189
No emotion toward brand or product       0.60      1.00      0.75      1612
                  Positive emotion       0.00      0.00      0.00       880

                          accuracy                           0.60      2681
                         macro avg       0.20      0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [280]:
dummy_eval.cross_validate()

CV Results for `Dummy` model:
            0.60478 ± 0.00097 accuracy
        


#### Logistic Regression

In [281]:
#Model
        
logreg_tf = LogisticRegression(solver = 'liblinear', C=10, penalty = 'l2')
logreg_tf.fit(X_train_vec, y_train) 

In [282]:
logreg_eval = ModelEval(
    model = logreg_tf,
    model_name = 'LogReg',
    X = X_train_vec,
    y = y_train
)

Training Report
                                    precision    recall  f1-score   support

                  Negative emotion       0.99      0.83      0.90       381
No emotion toward brand or product       0.90      0.96      0.93      3776
                  Positive emotion       0.92      0.84      0.88      2098

                          accuracy                           0.91      6255
                         macro avg       0.94      0.88      0.90      6255
                      weighted avg       0.91      0.91      0.91      6255

Testing Report
                                    precision    recall  f1-score   support

                  Negative emotion       0.62      0.24      0.35       189
No emotion toward brand or product       0.71      0.81      0.76      1612
                  Positive emotion       0.58      0.50      0.54       880

                          accuracy                           0.67      2681
                         macro avg       0.64      0

In [283]:
logreg_eval.cross_validate()

CV Results for `LogReg` model:
            0.68365 ± 0.01985 accuracy
        


#### Naive Bayes

In [284]:
#Model

naive_bay = MultinomialNB()

naive_bay.fit(X_train_vec, y_train)

In [285]:
NB_eval = ModelEval(
    model = naive_bay,
    model_name = 'NaiveBayes',
    X = X_train_vec,
    y = y_train
)

Training Report
                                    precision    recall  f1-score   support

                  Negative emotion       1.00      0.03      0.06       381
No emotion toward brand or product       0.72      0.99      0.83      3776
                  Positive emotion       0.91      0.46      0.61      2098

                          accuracy                           0.75      6255
                         macro avg       0.88      0.49      0.50      6255
                      weighted avg       0.80      0.75      0.71      6255

Testing Report
                                    precision    recall  f1-score   support

                  Negative emotion       0.50      0.01      0.01       189
No emotion toward brand or product       0.65      0.95      0.77      1612
                  Positive emotion       0.69      0.25      0.36       880

                          accuracy                           0.65      2681
                         macro avg       0.61      0

In [286]:
NB_eval.cross_validate()

CV Results for `NaiveBayes` model:
            0.64592 ± 0.00731 accuracy
        


#### Random Forests

In [287]:
#Model
    
rf = RandomForestClassifier(n_estimators = 1000, min_samples_split = 25, min_samples_leaf = 4, max_features = 4200, max_depth = 20, n_jobs = -1)
rf.fit(X_train_vec, y_train)

In [288]:
rf_eval = ModelEval(
    model = rf,
    model_name = 'Random Forests',
    X = X_train_vec,
    y = y_train
)

Training Report
                                    precision    recall  f1-score   support

                  Negative emotion       0.95      0.09      0.17       381
No emotion toward brand or product       0.75      0.97      0.84      3776
                  Positive emotion       0.87      0.55      0.67      2098

                          accuracy                           0.77      6255
                         macro avg       0.85      0.54      0.56      6255
                      weighted avg       0.80      0.77      0.75      6255

Testing Report
                                    precision    recall  f1-score   support

                  Negative emotion       0.56      0.05      0.09       189
No emotion toward brand or product       0.66      0.89      0.75      1612
                  Positive emotion       0.57      0.32      0.41       880

                          accuracy                           0.64      2681
                         macro avg       0.60      0

In [289]:
rf_eval.cross_validate()

CV Results for `Random Forests` model:
            0.65956 ± 0.01918 accuracy
        


## Improving Logistic Regression Model

#### LogReg Changed Weights

In [296]:
#Model
        
logreg_tf_cweights = LogisticRegression(solver = 'liblinear', class_weight={'Negative emotion':.75, 'No emotion toward brand or product':.075, 'Positive emotion':.175}, C=10, penalty = 'l2')
logreg_tf_cweights.fit(X_train_vec, y_train) 
logreg_tf_cweights.fit(X_test_vec, y_test) 

In [297]:
logreg_cweights_eval = ModelEval(
    model = logreg_tf_cweights,
    model_name = 'LogReg Change Weights',
    X = X_train_vec,
    y = y_train
)

Training Report
                                    precision    recall  f1-score   support

                  Negative emotion       0.34      0.38      0.36       381
No emotion toward brand or product       0.73      0.74      0.74      3776
                  Positive emotion       0.56      0.53      0.54      2098

                          accuracy                           0.65      6255
                         macro avg       0.54      0.55      0.55      6255
                      weighted avg       0.65      0.65      0.65      6255

Testing Report
                                    precision    recall  f1-score   support

                  Negative emotion       0.79      0.97      0.87       189
No emotion toward brand or product       0.91      0.88      0.89      1612
                  Positive emotion       0.82      0.82      0.82       880

                          accuracy                           0.87      2681
                         macro avg       0.84      0

In [298]:
logreg_cweights_eval.cross_validate()

CV Results for `LogReg Change Weights` model:
            0.66745 ± 0.01797 accuracy
        


#### LogReg CW + More Iter

In [301]:
para=[{'max_iter':[1,10,100,100],
     'class_weight': {'Negative emotion':.75, 'No emotion toward brand or product':.075, 'Positive emotion':.175},
     'C':[1, 5, 10]}]

In [303]:
gs = GridSearchCV(LogisticRegression, param_grid=para, cv=5, scoring='r2')

NameError: name 'GridSearchCV' is not defined

In [None]:
#Model
        
logreg_cw_more_iter = LogisticRegression(solver = 'liblinear', class_weight={'Negative emotion':.75, 'No emotion toward brand or product':.075, 'Positive emotion':.175}, C=10, penalty = 'l2')
logreg_cw_more_iter.fit(X_train_vec, y_train) 
logreg_cw_more_iter.fit(X_test_vec, y_test) 

In [None]:
logreg_cw_more_iter_eval = ModelEval(
    model = logreg_cw_more_iter,
    model_name = 'LogReg Change Weights + More Iterations',
    X = X_train_vec,
    y = y_train
)

In [None]:
logreg_cw_more_iter_eval.cross_validate()