# Project 4 Notebook: NLP Classification

In [1]:
import pandas as pd
import numpy as np
#for text pre-processing
import re, string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
#for model-building
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score
# bag of words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\happy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\happy\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\happy\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Data Cleaning

In [2]:
# Load the raw tweet dataset from the project's data directory.
# NOTE(review): the 'unicode_escape' codec also interprets backslash sequences
# found in the text (e.g. a literal "\n" inside a tweet). If the file is simply
# Latin-1/cp1252 encoded, encoding='latin-1' may be the safer choice — confirm
# against the source file.
df = pd.read_csv('./data/tweets.csv', encoding='unicode_escape')

In [3]:
# Label tweets that have no recorded target brand/product as 'Unknown'.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected column: that chained in-place form is
# deprecated in pandas 2.x and does not update the parent frame under
# Copy-on-Write.
df['emotion_in_tweet_is_directed_at'] = df['emotion_in_tweet_is_directed_at'].fillna('Unknown')

In [4]:
# Keep only the rows that actually have tweet text; rows whose tweet is missing are discarded.
df = df.dropna(subset=['tweet_text'])

In [5]:
# Remove the tweets whose sentiment label is the ambiguous "I can't tell" category
# by dropping their index labels; every other row is kept.
df = df.drop(index=df.index[df['is_there_an_emotion_directed_at_a_brand_or_product'].eq("I can't tell")])

## Pre-processing

In [6]:
#class base pulled and edited from another text classification project

# Patterns used by preprocess(), compiled once when the cell runs. They are
# written as raw strings so that sequences such as \s are passed to the regex
# engine verbatim instead of being treated as (invalid) string escapes.
_HTML_TAG_RE = re.compile(r'<.*?>')
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))
_NON_WORD_RE = re.compile(r'[^\w\s]')
_DIGIT_RE = re.compile(r'\d')
_WHITESPACE_RE = re.compile(r'\s+')


def preprocess(text):
    """Normalise a raw text into lowercase, space-separated word characters.

    Steps, in order:
      1. convert the input to ``str``, lowercase it and strip outer whitespace;
      2. delete anything enclosed in angle brackets (``<...>``);
      3. replace every ``string.punctuation`` character with a space;
      4. delete any remaining character that is neither a word character nor
         whitespace (e.g. punctuation outside ``string.punctuation``);
      5. replace every digit with a space;
      6. collapse whitespace runs to a single space and strip the ends.

    Bracketed numeric references such as ``[12]`` need no rule of their own:
    the brackets are removed as punctuation and the digits in step 5.

    Parameters
    ----------
    text : object
        The text to clean. Non-string values are converted with ``str()``.

    Returns
    -------
    str
        The cleaned text, without leading or trailing whitespace.
    """
    text = str(text).lower().strip()
    text = _HTML_TAG_RE.sub('', text)
    text = _PUNCTUATION_RE.sub(' ', text)
    text = _NON_WORD_RE.sub('', text)
    text = _DIGIT_RE.sub(' ', text)
    # Digit and punctuation replacement can leave runs of spaces and a
    # leading/trailing space, so collapse and strip last.
    return _WHITESPACE_RE.sub(' ', text).strip()

 
# STOPWORD REMOVAL
# Holds the English stop-word set once it has been loaded, so the word list is
# fetched a single time instead of once per token.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them on first use."""
    if 'english' not in _STOPWORD_CACHE:
        try:
            words = stopwords.words('english')
        except LookupError:
            # The 'stopwords' corpus is not installed locally: download it the
            # same way the other NLTK resources are fetched, then retry.
            nltk.download('stopwords')
            words = stopwords.words('english')
        _STOPWORD_CACHE['english'] = frozenset(words)
    return _STOPWORD_CACHE['english']


def stopword(string):
    """Remove English stop words from a whitespace-separated text.

    Parameters
    ----------
    string : str
        Text to filter; it is split on whitespace. (The parameter name shadows
        the ``string`` module inside this function only.)

    Returns
    -------
    str
        The remaining tokens, in their original order, joined by single spaces.
    """
    stop_words = _english_stopwords()
    return ' '.join(token for token in string.split() if token not in stop_words)
#LEMMATIZATION
# Create one WordNetLemmatizer instance at module level; lemmatizer() uses this
# shared object for every token instead of constructing a new one per call.
wl = WordNetLemmatizer()
 
# Helper that translates NLTK part-of-speech tags into WordNet POS constants
def get_wordnet_pos(tag):
    """Return the WordNet POS constant selected by the start of ``tag``.

    Tags starting with 'J', 'V', 'N' and 'R' map to adjective, verb, noun and
    adverb respectively; any other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    # No known prefix matched: treat the token as a noun.
    return wordnet.NOUN
# Lemmatize each token of a sentence according to its part-of-speech tag
def lemmatizer(string):
    """Tokenize ``string``, POS-tag the tokens and return the lemmas joined by spaces."""
    tagged_tokens = nltk.pos_tag(word_tokenize(string))  # (token, tag) pairs
    lemmas = []
    for token, pos_tag in tagged_tokens:
        # Convert the tagger's tag to a WordNet POS before lemmatizing the token.
        lemmas.append(wl.lemmatize(token, get_wordnet_pos(pos_tag)))
    return " ".join(lemmas)

In [7]:
def finalpreprocess(string):
    """Run the full cleaning pipeline on one text: normalise, drop stop words, lemmatize."""
    normalised = preprocess(string)
    without_stopwords = stopword(normalised)
    return lemmatizer(without_stopwords)
# Clean every tweet and store the result in a new column next to the raw text,
# then preview the first rows.
df['clean_tweet_text'] = df['tweet_text'].apply(finalpreprocess)
df.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,clean_tweet_text
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,wesley g iphone hr tweet rise austin dead need...
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,jessedee know fludapp awesome ipad iphone app ...
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,swonderlin wait ipad also sale sxsw
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,sxsw hope year festival crashy year iphone app...
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,sxtxstate great stuff fri sxsw marissa mayer g...


## Vectorization

In [17]:
#Train-test split

# 70/30 split of the cleaned tweets and their sentiment labels, with a fixed
# seed so the same partition is produced on every run.
# NOTE(review): no `stratify` argument is passed, so class proportions may
# differ between the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    df["clean_tweet_text"],
    df["is_there_an_emotion_directed_at_a_brand_or_product"],
    train_size=0.7,
    test_size=0.3,
    random_state=42,
)

In [18]:
#TFIDF Vector

# The vectorizer is fitted on the training tweets only; the test tweets are then
# transformed with that already-fitted vectorizer, so the test set does not
# influence the vocabulary or IDF weights.
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
X_train_vec = tfidf_vectorizer.fit_transform(X_train) 
X_test_vec = tfidf_vectorizer.transform(X_test)

## ML Models

#### Logistic Regression

In [19]:
#Model

# L2-penalised logistic regression using the liblinear solver, trained on the
# TF-IDF training features. In scikit-learn, C is the inverse of the
# regularisation strength.
logreg_tf = LogisticRegression(solver = 'liblinear', C=10, penalty = 'l2')
logreg_tf.fit(X_train_vec, y_train) 

In [20]:
#Predictions

# Predicted class label for every test tweet.
y_predict = logreg_tf.predict(X_test_vec)
# Keeps only column 1 of predict_proba, i.e. the probability of the second
# entry in logreg_tf.classes_.
# NOTE(review): the classification report printed for this model lists three
# classes, so this is one class's probability rather than a binary
# positive-class score — confirm it is what is wanted before using y_prob in a metric.
y_prob = logreg_tf.predict_proba(X_test_vec)[:,1]

In [21]:
# Per-class precision, recall and F1 for the current y_predict against the test labels.
print(classification_report(y_test,y_predict))

                                    precision    recall  f1-score   support

                  Negative emotion       0.62      0.24      0.35       189
No emotion toward brand or product       0.71      0.81      0.76      1612
                  Positive emotion       0.58      0.50      0.54       880

                          accuracy                           0.67      2681
                         macro avg       0.64      0.52      0.55      2681
                      weighted avg       0.66      0.67      0.66      2681



#### Naive Bayes

In [22]:
#Model

# Multinomial Naive Bayes with default settings, trained on the same TF-IDF
# training features as the logistic regression model.
naive_bay = MultinomialNB()
naive_bay.fit(X_train_vec, y_train)

In [23]:
#Predictions

# Predicted class label for every test tweet. This reuses the name y_predict,
# overwriting the logistic regression predictions.
y_predict = naive_bay.predict(X_test_vec)
# Keeps only column 1 of predict_proba, i.e. the probability of the second
# entry in naive_bay.classes_; also overwrites the earlier y_prob.
# NOTE(review): the classification report printed for this model lists three
# classes, so this is one class's probability rather than a binary
# positive-class score — confirm it is what is wanted before using y_prob in a metric.
y_prob = naive_bay.predict_proba(X_test_vec)[:,1]

In [24]:
# Per-class precision, recall and F1 for the current y_predict against the test labels.
print(classification_report(y_test,y_predict))

                                    precision    recall  f1-score   support

                  Negative emotion       0.50      0.01      0.01       189
No emotion toward brand or product       0.65      0.95      0.77      1612
                  Positive emotion       0.69      0.25      0.36       880

                          accuracy                           0.65      2681
                         macro avg       0.61      0.40      0.38      2681
                      weighted avg       0.65      0.65      0.58      2681

