# YouTube Comments Sentiment Analysis 
Spring 2018


## 1 Set Up

### 1.1 Import Basic Modules

In [1]:
# Basics
import pandas as pd; import os
import csv; import numpy as np
import re; import warnings
warnings.filterwarnings('ignore')

### 1.2 Read In Data

In [2]:
os.chdir('/Users/andiedonovan/myProjects/Youtube_Python_Project/AndiesBranch/') # change directory

In [3]:
okgo = pd.read_csv('data/OKGOcomments.csv', delimiter=";", skiprows=2, encoding='latin-1', engine='python') # read in the data
blogs = pd.read_csv('data/Kagel_social_media_blogs.csv', delimiter="@@@", skiprows=2, encoding='latin-1', engine='python') # read in the data
tweets = pd.read_csv('data/full-corpus.csv', delimiter=",", skiprows=2, encoding='latin-1', engine='python') # read in the data
trump = pd.read_csv('data/trump.csv', delimiter="@@@", skiprows=2, encoding='utf-8', error_bad_lines=False, engine='python') 
df = pd.read_csv('data/data.csv', delimiter="@@@", skiprows=2, encoding='utf-8', engine='python') 

### 1.3 Clean Data Columns

In [4]:
tweets = tweets.drop(['Topic', 'TweetId', "TweetDate"], axis = 1).dropna()
tweets.columns = ["label", "comment"]
tweets.label = tweets.label.replace({'positive': '1.0', 'negative':'-1.0', 'neutral': '0.0', 'irrelevant': '0.0'}, regex=True)
tweets['label'] = pd.to_numeric(tweets['label'], errors='coerce')

In [5]:
blogs.columns = ["label", "comment"]
blogs['label'] = pd.to_numeric(blogs['label'], errors='coerce')
okgo.columns = [
  'label','comment','a','b']
okgo = okgo.drop(['a', 'b'], axis = 1).dropna() # drop columns 3 and 4 and missing values

In [6]:
data = pd.concat([okgo, blogs, tweets], ignore_index=False)

In [7]:
df.columns = ["comment", "label"]
trump.columns = ["label", "comment"]

In [8]:
data.sample(3)

Unnamed: 0,label,comment
3211,0.0,OmniTouch from #Microsoft makes every surface ...
1815,0.0,RT @dalmaer: WebGL infinite bookcase UI http:/...
1270,1.0,Mission Impossible 3 was excellent


### 1.4 Remove Non-Alphabetic Characters (including numbers)

In [9]:
df["comment"]= df["comment"].astype(str) 
trump["comment"]= trump["comment"].astype(str) 

In [10]:
def cleanerFn(b):
    for row in range(len(b)):
        line = b.loc[row, "comment"]
        b.loc[row,"comment"] = re.sub("[^a-zA-Z]", " ", line)
        
def cleanerFn2(b):
    for row in range(len(b)):
        line = b.iloc[row, 1]
        b.iloc[row,1] = re.sub("[^a-zA-Z]", " ", line)

In [11]:
cleanerFn(df)
cleanerFn2(data)
cleanerFn2(trump)

## 2 Natural Language Processing

### 2.1 Import Packages

In [12]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import *
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
sw = stopwords.words('english')
nltk.download('stopwords')
ps = PorterStemmer()
lemmatizer = nltk.stem.WordNetLemmatizer()
tfidf = TfidfVectorizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/andiedonovan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### 2.2 Tokenize Words

In [14]:
df['com_token']=df['comment'].str.lower().str.split()

### 2.3 Remove Stop Words, Lemmatization, Stemming

In [15]:
import nltk
nltk.download('averaged_perceptron_tagger')
from nltk.tag import pos_tag_sents

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/andiedonovan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [16]:
def nlpFunction(DF):
    DF['com_token'] = DF['comment'].str.lower().str.split()
    DF['com_remv'] = DF['com_token'].apply(lambda x: [y for y in x if y not in sw])
    DF["com_lemma"] = DF['com_remv'].apply(lambda x : [lemmatizer.lemmatize(y) for y in x]) # lemmatization
    DF['com_stem'] = DF['com_lemma'].apply(lambda x : [ps.stem(y) for y in x]) # stemming
    DF["com_full"] = DF["com_stem"].apply(' '.join)
    DF["com_tagged"] = DF['com_token'].apply(lambda x : [nltk.pos_tag(y) for y in x]) #word tagging
    DF["com_stem_str"] = DF["com_stem"].apply(', '.join)
    return DF

In [17]:
df = nlpFunction(df)
df.head(5)

Unnamed: 0,comment,label,com_token,com_remv,com_lemma,com_stem,com_full,com_tagged,com_stem_str
0,How can he do this Japanese are trying to be ...,,"[how, can, he, do, this, japanese, are, trying...","[japanese, trying, respectful, lo, gan, logan,...","[japanese, trying, respectful, lo, gan, logan,...","[japanes, tri, respect, lo, gan, logan, care, ...",japanes tri respect lo gan logan care wtf,"[[(h, NN), (o, NN), (w, NN)], [(c, VB), (a, DT...","japanes, tri, respect, lo, gan, logan, care, wtf"
1,Prick,,[prick],[prick],[prick],[prick],prick,"[[(p, NN), (r, NN), (i, NN), (c, VBP), (k, NN)]]",prick
2,I think all the weeds are crying,,"[i, think, all, the, weeds, are, crying]","[think, weeds, crying]","[think, weed, cry]","[think, weed, cri]",think weed cri,"[[(i, NN)], [(t, NN), (h, NN), (i, NN), (n, VB...","think, weed, cri"
3,Lmao Americans in the comment section are acti...,,"[lmao, americans, in, the, comment, section, a...","[lmao, americans, comment, section, acting, li...","[lmao, american, comment, section, acting, lik...","[lmao, american, comment, section, act, like, ...",lmao american comment section act like nuke ja...,"[[(l, NN), (m, VBZ), (a, DT), (o, NN)], [(a, D...","lmao, american, comment, section, act, like, n..."
4,How many people want to kill him now,,"[how, many, people, want, to, kill, him, now]","[many, people, want, kill]","[many, people, want, kill]","[mani, peopl, want, kill]",mani peopl want kill,"[[(h, NN), (o, NN), (w, NN)], [(m, VB), (a, DT...","mani, peopl, want, kill"


In [18]:
df = nlpFunction(df)
data = nlpFunction(data)
trump = nlpFunction(trump)

In [19]:
df.head(5)

Unnamed: 0,comment,label,com_token,com_remv,com_lemma,com_stem,com_full,com_tagged,com_stem_str
0,How can he do this Japanese are trying to be ...,,"[how, can, he, do, this, japanese, are, trying...","[japanese, trying, respectful, lo, gan, logan,...","[japanese, trying, respectful, lo, gan, logan,...","[japanes, tri, respect, lo, gan, logan, care, ...",japanes tri respect lo gan logan care wtf,"[[(h, NN), (o, NN), (w, NN)], [(c, VB), (a, DT...","japanes, tri, respect, lo, gan, logan, care, wtf"
1,Prick,,[prick],[prick],[prick],[prick],prick,"[[(p, NN), (r, NN), (i, NN), (c, VBP), (k, NN)]]",prick
2,I think all the weeds are crying,,"[i, think, all, the, weeds, are, crying]","[think, weeds, crying]","[think, weed, cry]","[think, weed, cri]",think weed cri,"[[(i, NN)], [(t, NN), (h, NN), (i, NN), (n, VB...","think, weed, cri"
3,Lmao Americans in the comment section are acti...,,"[lmao, americans, in, the, comment, section, a...","[lmao, americans, comment, section, acting, li...","[lmao, american, comment, section, acting, lik...","[lmao, american, comment, section, act, like, ...",lmao american comment section act like nuke ja...,"[[(l, NN), (m, VBZ), (a, DT), (o, NN)], [(a, D...","lmao, american, comment, section, act, like, n..."
4,How many people want to kill him now,,"[how, many, people, want, to, kill, him, now]","[many, people, want, kill]","[many, people, want, kill]","[mani, peopl, want, kill]",mani peopl want kill,"[[(h, NN), (o, NN), (w, NN)], [(m, VB), (a, DT...","mani, peopl, want, kill"


In [20]:
import itertools
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
 
def bigram_word_feats(DF, score_fn=BigramAssocMeasures.chi_sq, n=200):
    bigram_finder = BigramCollocationFinder.from_words(DF[""])
    bigrams = bigram_finder.nbest(score_fn, n)
    return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])
 
evaluate_classifier(bigram_word_feats)


NameError: name 'evaluate_classifier' is not defined

## 3 Data Transformations

### 3.1 Split into Training and Test Data

In [None]:
import sklearn # machine learning
from sklearn.model_selection import train_test_split # splitting up data

In [None]:
X_train = data["com_stem_str"]
X_test = trump["com_stem_str"]
Y_train = data["label"]
Y_test = trump["label"]
X_user = df["com_stem_str"]

### 3.2 Check for missing values

In [None]:
print('lengths training variables: ', len(X_train),",", len(Y_train))
print('lengths testing variables: ', len(X_test),",", len(Y_test), '\n')

print('Are there any missing values?', 
      '\n * Training:', pd.isnull(X_train).values.any(), ',', pd.isnull(Y_train).values.any(), 
      '\n * Testing: ', pd.isnull(X_test).values.any(), ",", pd.isnull(Y_test).values.any())


### 3.3 Transform Data to Counts 

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer

In [None]:
tfidf = TfidfVectorizer()

xtrain = tfidf.fit_transform(X_train) # transform and fit training data
xtest = tfidf.transform(X_test) # transform trump test data from fitted transformer
xuser = tfidf.transform(X_user) # transform user selected comments to predict on

data_trans= tfidf.transform(data["com_stem_str"]) # same as X_train...transform entire dataset for cross validation
df_trans = tfidf.transform(df["com_stem_str"]) # same as X_user

## 4 Machine Learning Models

In [None]:
from sklearn.naive_bayes import MultinomialNB 
from sklearn.linear_model import LogisticRegression
from sklearn import svm # support vector machine
from sklearn import metrics # for accuracy/ precision
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import SGDClassifier # Stochastic Gradient Descent

### 4.1 Multinomial Naive Bayes Model

**Fitting the Model:**

In [None]:
mnb = MultinomialNB()
mnb.fit(xtrain, Y_train) # fit the model on the training data word counts and training data lables

**Model Predictions:** 

In [None]:
mnb_predict = mnb.predict(xtest) # make our y predictions (labels) on the comment test data
mnb_acc = metrics.accuracy_score(Y_test, mnb_predict)
print('We obtained ', round(mnb_acc, 6), '% accuracy for the model')

**Classification Report**

In [None]:
print(metrics.classification_report(Y_test, mnb_predict))

**Confusion Matrix**

In [None]:
metrics.confusion_matrix(Y_test, mnb_predict)

**Cross Validation of Accuracy:**

In [None]:
scores = cross_val_score(mnb, xtest, Y_test, cv=5) # 5 fold cross validation
print("Confidence Interval for Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

### 4.2 Logistic Regression

**Fitting the Model:**

In [None]:
lr = LogisticRegression(solver='sag', max_iter=100, random_state=42, multi_class="multinomial") # set multinomial setting for multiclass data

**Model Predictions:**

In [None]:
lr.fit(xtrain, Y_train)

In [None]:
lr_predict = lr.predict(xtest)
lr_acc = metrics.accuracy_score(Y_test, lr_predict)
print('We obtained ', round(lr_acc, 6), '% accuracy for the logistic regression model')

**Classification Report:**

In [None]:
print(metrics.classification_report(Y_test, lr_predict))

**Confusion Matrix:**

In [None]:
metrics.confusion_matrix(Y_test, lr_predict)

**Cross Validation:**

In [None]:
scores = cross_val_score(lr, xtest, Y_test, cv=5) # 5 fold cross validation
print("Confidence Interval for Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

### 4.3 Linear Support Vector Machine

**Fitting the Model & Predictions:**

In [None]:
svm = svm.SVC()
svm.fit(xtrain, Y_train)
svm_predict = svm.predict(xtest)
svm_acc = metrics.accuracy_score(Y_test, svm_predict)
print('We obtained ', round(svm_acc, 6), '% accuracy for the SVM model')

**Classification Report:**

In [None]:
print(metrics.classification_report(Y_test, mnb_predict))

**Confusion Matrix:**

In [None]:
metrics.confusion_matrix(Y_test, lr_predict)

**Cross Validation:**

In [None]:
scores = cross_val_score(svm, xtest, Y_test, cv=5) # 5 fold cross validation
print("Confidence Interval for Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

### 4.4 K-Nearest Neighbor

**Fitting Model & Predictions:**

In [None]:
from sklearn.neighbors import KNeighborsClassifier # k-NN ensemble method

knn = KNeighborsClassifier()
knn.fit(xtrain, Y_train)

knn_predict = knn.predict(xtest)
knn_acc = metrics.accuracy_score(Y_test, knn_predict)
print('We obtained ', round(knn_acc, 6), '% accuracy for the KNN Bagging model')

**Cross Validation:**

In [None]:
scores = cross_val_score(knn, xtest, Y_test, cv=5) # 5 fold cross validation
print("Confidence Interval for Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

### 4.5 Random Forest

**Fitting Model & Predictions:**

In [None]:
from sklearn.ensemble import RandomForestClassifier # random forest ensemble method

ranfor = RandomForestClassifier(n_estimators=10, random_state=10)
ranfor = ranfor.fit(xtrain, Y_train)

rf_predict = ranfor.predict(xtest)
rf_acc = metrics.accuracy_score(Y_test, rf_predict)
print('We obtained ', round(rf_acc, 6), '% accuracy for the Random Forest model')

**Cross Validation:**

In [None]:
scores = cross_val_score(ranfor, xtest, Y_test, cv=5) # 5 fold cross validation
print("Confidence Interval for Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [None]:
import warnings
warnings.filterwarnings('ignore')

### 4.6 Extreme Gradient Boosting

In [None]:
from xgboost import XGBClassifier

xgb = XGBClassifier()
xgb.fit(xtrain, Y_train)
xgb_pred = xgb.predict(xtest)
xgb_acc = metrics.accuracy_score(Y_test, xgb_pred)
print('We obtained ', round(xgb_acc, 6), '% accuracy for the XGB Bagging model')

In [None]:
scores = cross_val_score(xgb, xtest, Y_test, cv=5) # 5 fold cross validation
print("Confidence Interval for Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

## 5 Data Visualizations

### 5.1 Table of Model Results

In [None]:
myTable = pd.DataFrame(columns=['Naive Bayes','Support Vect Machine','Logistic Regression', 'K-NN', 'Random Forest'],
                   index=["Accuracy"])
myTable['Naive Bayes']=mnb_acc; myTable['Support Vect Machine']=svm_acc; myTable['Logistic Regression']=lr_acc
myTable['K-NN']= knn_acc; myTable['Random Forest']= rf_acc
myTable