In [6]:
## import necessary modules
import pandas as pd
import os
import csv
import numpy as np
import re
import warnings
warnings.filterwarnings('ignore')

from sklearn.naive_bayes import MultinomialNB 
from sklearn.linear_model import LogisticRegression
from sklearn import svm # support vector machine
from sklearn import metrics # for accuracy/ precision
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import SGDClassifier # Stochastic Gradient Descent
from sklearn.neighbors import KNeighborsClassifier # k-NN ensemble method
from sklearn.ensemble import RandomForestClassifier 

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem import PorterStemmer
import sklearn # machine learning
from sklearn.model_selection import train_test_split # splitting up data
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# set path
# The project root can be overridden with the YOUTUBE_PROJECT_DIR environment
# variable; the original checkout location stays the default so existing
# setups keep working.
path = os.environ.get(
    'YOUTUBE_PROJECT_DIR',
    '/Users/andiedonovan/myProjects/Youtube_Python_Project/AndiesBranch/',
)
os.chdir(path) # change directory so the relative 'data/...' paths below resolve

# load in data
# NOTE(review): every read uses skiprows=2, after which the next line is taken
# as the header row. The trump.head() output in this notebook shows a comment
# text as a column name, so a data row may be consumed as the header - confirm
# against the raw files.

# training data
okgo = pd.read_csv('data/OKGOcomments.csv', delimiter=";", skiprows=2, encoding='latin-1', engine='python') # read in the data
blogs = pd.read_csv('data/Kagel_social_media_blogs.csv', delimiter="@@@", skiprows=2, encoding='latin-1', engine='python') # read in the data
tweets = pd.read_csv('data/full-corpus.csv', delimiter=",", skiprows=2, encoding='latin-1', engine='python') # read in the data
# test data:
# Malformed lines are skipped rather than aborting the load. `error_bad_lines`
# was replaced by `on_bad_lines` in pandas 1.3 and removed in 2.0, so try the
# current spelling first and fall back for older installations.
try:
    trump = pd.read_csv('data/trump.csv', delimiter="@@@", skiprows=2, encoding='utf-8', on_bad_lines='warn', engine='python')
except TypeError:
    # pandas < 1.3 rejects the unknown keyword with a TypeError
    trump = pd.read_csv('data/trump.csv', delimiter="@@@", skiprows=2, encoding='utf-8', error_bad_lines=False, engine='python')
# comments to be scored by the trained models (used later as X_user)
df = pd.read_csv('data/data.csv', delimiter="@@@", skiprows=2, encoding='utf-8', engine='python')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/andiedonovan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/andiedonovan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
trump.head(3)

Unnamed: 0,1,Hispanic for Trump!
0,-1,Taking down this video for copyright infringem...
1,-1,This speech literaly ripped off the Dark Knigh...
2,-1,Poop him


In [8]:
# clean dataframes
# tweets: keep only the sentiment and text columns, then turn the textual
# sentiment into a numeric label (positive=1, negative=-1, neutral/irrelevant=0)
tweets = tweets.drop(['Topic', 'TweetId', "TweetDate"], axis = 1).dropna()
tweets.columns = ["label", "comment"]
tweets.label = tweets.label.replace({'positive': '1.0', 'negative':'-1.0', 'neutral': '0.0', 'irrelevant': '0.0'}, regex=True)
tweets['label'] = pd.to_numeric(tweets['label'], errors='coerce')
# blogs: two columns; labels that are not numeric become NaN here
blogs.columns = ["label", "comment"]
blogs['label'] = pd.to_numeric(blogs['label'], errors='coerce')
# okgo: four parsed columns, of which only the first two are kept
okgo.columns = [
  'label','comment','a','b']
okgo = okgo.drop(['a', 'b'], axis = 1).dropna() # drop columns 3 and 4 and missing values
# Stack the three training sources. ignore_index=True gives the result one
# unique 0..n-1 index instead of repeating each source's own row labels, and
# rows whose label was coerced to NaN (or whose comment is missing) are dropped
# so they cannot reach the text cleaning or the classifiers.
data = (
    pd.concat([okgo, blogs, tweets], ignore_index=True)
    .dropna(subset=["label", "comment"])
    .reset_index(drop=True)
)
df.columns = ["comment", "label"]
trump.columns = ["label", "comment"]

In [9]:
# clean up textual data (remove symbols)
# re.sub (used by the cleaner functions defined below) only accepts strings,
# so coerce every comment to str before cleaning.
df["comment"]= df["comment"].astype(str) 
trump["comment"]= trump["comment"].astype(str) 

def cleanerFn(b):
    """Replace every non-letter character in ``b["comment"]`` with a space, in place.

    Parameters
    ----------
    b : pandas.DataFrame
        Frame with a ``"comment"`` column holding strings. The column is
        overwritten on the frame that was passed in.

    Returns
    -------
    None

    Notes
    -----
    The column is processed as a whole instead of looking rows up through
    ``b.loc[0] .. b.loc[len(b)-1]``, so frames whose index is not a default
    0..n-1 range (filtered, re-ordered or concatenated frames) are cleaned
    correctly rather than raising ``KeyError``.
    """
    b["comment"] = b["comment"].map(lambda line: re.sub("[^a-zA-Z]", " ", line))
        
def cleanerFn2(b):
    """Blank out non-letter characters in the second column of ``b``, in place.

    Rows and the column are addressed purely by position (column number 1),
    so the frame's index labels and column names play no role. Each value is
    replaced by a copy in which every character outside a-z / A-Z is a space.
    Nothing is returned.
    """
    non_letters = re.compile("[^a-zA-Z]")
    for position, text in enumerate(b.iloc[:, 1].tolist()):
        b.iloc[position, 1] = non_letters.sub(" ", text)

In [10]:
df_copy = df.copy()
df.shape

(467, 2)

In [11]:
trump["label"].value_counts()

 0    87
 1    56
-1    56
Name: label, dtype: int64

In [137]:
data["label"].value_counts()

 0.0    8020
 1.0    5188
-1.0     984
Name: label, dtype: int64

In [138]:
cleanerFn(df)
cleanerFn2(data)
cleanerFn2(trump)

In [139]:
sw = stopwords.words('english')
ps = PorterStemmer()
lemmatizer = nltk.stem.WordNetLemmatizer()
tfidf = TfidfVectorizer()

In [140]:
def nlpFunction(a):
    """Add the intermediate NLP columns derived from ``a['comment']``.

    The frame is modified in place and also returned. Columns written:

    - ``com_token``    lower-cased comment split on whitespace
    - ``com_remv``     tokens that are not in the stop-word list ``sw``
    - ``com_lemma``    ``lemmatizer.lemmatize`` applied to each remaining token
    - ``com_stem``     ``ps.stem`` applied to each lemma
    - ``com_stem_str`` the stems joined into one string with ``', '``
    """
    def drop_stopwords(tokens):
        return [token for token in tokens if token not in sw]

    def lemmatize_all(tokens):
        return [lemmatizer.lemmatize(token) for token in tokens]  # lemmatization

    def stem_all(tokens):
        return [ps.stem(token) for token in tokens]  # stemming

    lowered = a['comment'].str.lower()
    a['com_token'] = lowered.str.split()
    a['com_remv'] = a['com_token'].apply(drop_stopwords)
    a["com_lemma"] = a['com_remv'].apply(lemmatize_all)
    a['com_stem'] = a['com_lemma'].apply(stem_all)
    a["com_stem_str"] = a["com_stem"].apply(lambda stems: ', '.join(stems))
    return a

In [141]:
df = nlpFunction(df)
data = nlpFunction(data)
trump = nlpFunction(trump)

In [142]:
X_train = data["com_stem_str"]
X_test = trump["com_stem_str"]
Y_train = data["label"]
Y_test = trump["label"]

X_user = df["com_stem_str"]

In [143]:
X_train.head(2)

0    everyon, know, brand, paper, one, know, welfar...
1                                   paper, cut, balanc
Name: com_stem_str, dtype: object

In [144]:
# Build the TF-IDF features. The vectorizer is fitted on the training comments
# only; every other matrix is produced with that same fitted vectorizer.
tfidf = TfidfVectorizer()
xtrain = tfidf.fit_transform(X_train) # transform and fit training data
xtest = tfidf.transform(X_test) # transform test data from fitted transformer
xuser = tfidf.transform(X_user)
# data["com_stem_str"] is the same column X_train was taken from
data_trans= tfidf.transform(data["com_stem_str"]) # transform entire dataset for cross validation
# df["com_stem_str"] is the same column X_user was taken from
df_trans = tfidf.transform(df["com_stem_str"])

In [145]:
xuser.shape

(467, 14712)

In [146]:
# running models
from sklearn.svm import SVC

rs = 10  # random_state shared by the estimators that accept one
lr = LogisticRegression(solver='sag', max_iter=100, random_state=rs, multi_class="multinomial")
mnb = MultinomialNB()
# Build the classifier from the imported SVC class. The previous
# `svm = svm.SVC()` only worked while `svm` still named the sklearn module;
# after the first run `svm` is a classifier, so re-running the cell raised
# AttributeError.
svm = SVC()
rf = RandomForestClassifier(n_estimators=10, random_state=rs)
knn = KNeighborsClassifier()

In [147]:
# Short identifiers of the classifiers, plus the derived names used for the
# result columns ("label_<id>") and the prediction-dict keys ("<id>_predict").
models = ['lr', 'mnb', 'svm', 'rf', 'knn']
labels = ['label_' + str(short_name) for short_name in models]
predictions = [str(short_name) + "_predict" for short_name in models]
d = dict()  # filled later: prediction key -> predicted labels
# Estimator objects, listed in the same order as the identifiers in `models`.
initModels = [lr, mnb, svm, rf, knn]

In [148]:
# Fit every classifier on the training matrix and store its predictions for
# the user comments under the matching "<id>_predict" key.
# The key and estimator lists are paired directly instead of indexing with a
# hard-coded range(0, 5), so adding or removing a model needs no change here.
for prediction_key, estimator in zip(predictions, initModels):
    estimator.fit(xtrain, Y_train)
    d[prediction_key] = estimator.predict(xuser)

In [149]:
len(d[predictions[1]])

467

In [150]:
'''
from sklearn.metrics import accuracy_score
knn.fit(xtrain, Y_train)
pred = knn.predict(xtest)
accuracy_score(Y_test,pred)
'''

'\nfrom sklearn.metrics import accuracy_score\nknn.fit(xtrain, Y_train)\npred = knn.predict(xtest)\naccuracy_score(Y_test,pred)\n'

In [151]:
# Collect each model's predicted labels into one table: one "label_<id>"
# column per classifier, filled from the prediction dict `d`.
# The 'comment' column is only declared here; it is filled in a later cell.
Table = pd.DataFrame(columns=['comment', 'label_lr', 'label_mnb', 'label_svm', 'label_rf', 'label_knn'])
for i in range(0, len(models)):
    Table[labels[i]] = d[predictions[i]]

#Table['comment'] = df_copy['comment']

In [152]:
Table["comment"] = df["comment"]

In [153]:
Table.head()

Unnamed: 0,comment,label_lr,label_mnb,label_svm,label_rf,label_knn
0,Roses are Red,0.0,0.0,0.0,0.0,0.0
1,Violets are Blue,0.0,0.0,0.0,0.0,0.0
2,I was so happy,0.0,0.0,0.0,0.0,0.0
3,And you shouldve been too,0.0,0.0,0.0,0.0,0.0
4,When a pro fifa player plays against the lowes...,0.0,0.0,0.0,0.0,0.0


In [154]:
# Create table of predicted sentiment ratios
Ratios = pd.DataFrame(columns=['label_lr', 'label_mnb', 'label_svm', 'label_rf', 'label_knn'], 
    index=range(0,3))

def RatioFinder(model, table=None):
    """Return the percentage of positive, neutral and negative predictions.

    Parameters
    ----------
    model : str
        Name of the prediction column to summarise, e.g. ``'label_lr'``.
    table : pandas.DataFrame, optional
        Frame holding that column. Defaults to the notebook-level ``Table``,
        which keeps existing one-argument calls working.

    Returns
    -------
    list of float
        ``[pos_ratio, neu_ratio, neg_ratio]`` in percent. Only rows whose value
        is exactly 1.0, 0.0 or -1.0 are counted, and the percentages are taken
        relative to that count. ``[0.0, 0.0, 0.0]`` is returned when no such
        row exists, instead of dividing by zero.
    """
    if table is None:
        table = Table
    column = table[model]

    pos_len = int((column == 1.0).sum())
    neg_len = int((column == -1.0).sum())
    neu_len = int((column == 0.0).sum())

    total = pos_len + neg_len + neu_len
    if total == 0:
        return [0.0, 0.0, 0.0]

    def as_percent(count):
        # The share is rounded to two decimals as before; the outer round
        # removes float artefacts of the *100 step (e.g. 7.000000000000001).
        return round(round(count / float(total), 2) * 100, 2)

    return [as_percent(pos_len), as_percent(neu_len), as_percent(neg_len)]

# Fill Ratios one model (column) at a time. RatioFinder returns the list
# [positive, neutral, negative], which becomes rows 0, 1 and 2.
# It is evaluated once per model rather than once per table cell.
for j, label in enumerate(labels):
    ratios_for_model = RatioFinder(label)
    for i, ratio in enumerate(ratios_for_model):
        Ratios.iloc[i, j] = ratio


In [155]:
Ratios

Unnamed: 0,label_lr,label_mnb,label_svm,label_rf,label_knn
0,14,25,0,14,4
1,85,75,100,84,94
2,1,0,0,3,2


In [156]:
all_models = pd.DataFrame(columns=['average'], index=range(0,3))
all_models["average"]= Ratios.mean(axis=1)
all_models

Unnamed: 0,average
0,11.4
1,87.6
2,1.2


In [157]:
Table.head(5)

Unnamed: 0,comment,label_lr,label_mnb,label_svm,label_rf,label_knn
0,Roses are Red,0.0,0.0,0.0,0.0,0.0
1,Violets are Blue,0.0,0.0,0.0,0.0,0.0
2,I was so happy,0.0,0.0,0.0,0.0,0.0
3,And you shouldve been too,0.0,0.0,0.0,0.0,0.0
4,When a pro fifa player plays against the lowes...,0.0,0.0,0.0,0.0,0.0


In [158]:
# set the prediction to the mode of the row
# DataFrame.mode(axis=1) returns a DataFrame with one column per tied mode,
# so a row where two labels tie (e.g. 2-2-1 across the five models) makes it
# two columns wide, and that cannot be assigned to a single column.
# Take column 0, the first mode reported for each row, to always get exactly
# one prediction per comment.
Table["Prediction"] = Table[['label_lr','label_mnb','label_svm','label_rf','label_knn']].mode(axis=1)[0]

In [159]:
Table.head(5)

Unnamed: 0,comment,label_lr,label_mnb,label_svm,label_rf,label_knn,Prediction
0,Roses are Red,0.0,0.0,0.0,0.0,0.0,0.0
1,Violets are Blue,0.0,0.0,0.0,0.0,0.0,0.0
2,I was so happy,0.0,0.0,0.0,0.0,0.0,0.0
3,And you shouldve been too,0.0,0.0,0.0,0.0,0.0,0.0
4,When a pro fifa player plays against the lowes...,0.0,0.0,0.0,0.0,0.0,0.0


In [160]:
Table.head(3)

Unnamed: 0,comment,label_lr,label_mnb,label_svm,label_rf,label_knn,Prediction
0,Roses are Red,0.0,0.0,0.0,0.0,0.0,0.0
1,Violets are Blue,0.0,0.0,0.0,0.0,0.0,0.0
2,I was so happy,0.0,0.0,0.0,0.0,0.0,0.0


In [161]:
# Collapse each token list in com_remv into one space-separated string.
# Values that are already strings are left untouched, so the cell can be
# re-run safely: joining a str would otherwise split it into single characters.
df["com_remv"] = df["com_remv"].apply(
    lambda tokens: tokens if isinstance(tokens, str) else ', '.join(tokens).replace(",", "")
).astype(str)

In [162]:
df.label = Table["Prediction"]

In [163]:
df.head()

Unnamed: 0,comment,label,com_token,com_remv,com_lemma,com_stem,com_stem_str
0,Roses are Red,0.0,"[roses, are, red]",roses red,"[rose, red]","[rose, red]","rose, red"
1,Violets are Blue,0.0,"[violets, are, blue]",violets blue,"[violet, blue]","[violet, blue]","violet, blue"
2,I was so happy,0.0,"[i, was, so, happy]",happy,[happy],[happi],happi
3,And you shouldve been too,0.0,"[and, you, shouldve, been, too]",shouldve,[shouldve],[shouldv],shouldv
4,When a pro fifa player plays against the lowes...,0.0,"[when, a, pro, fifa, player, plays, against, t...",pro fifa player plays lowest difficulty bots,"[pro, fifa, player, play, lowest, difficulty, ...","[pro, fifa, player, play, lowest, difficulti, ...","pro, fifa, player, play, lowest, difficulti, bot"


In [180]:
# Collapse each token list in com_remv into one space-separated string.
# com_remv may already have been joined by an earlier cell; strings are kept
# as they are, because joining a str would split it into single characters.
df["com_remv"] = df["com_remv"].apply(
    lambda tokens: tokens if isinstance(tokens, str) else ', '.join(tokens).replace(",", "")
).astype(str)

# Split the (label, text) pairs by predicted sentiment.
df_words = df[["label","com_remv"]]
positive = df_words[df_words["label"]==1.0]
neutral = df_words[df_words["label"]==0.0]
negative = df_words[df_words["label"]==-1.0]

In [181]:

# most frequent words in each label
# `positive`, `negative` and `neutral` are the two-column frames built in the
# preceding cell. Iterating a DataFrame yields its column names, so the text
# column has to be selected explicitly; otherwise the "words" counted are
# just 'label' and 'com_remv'.
most_freq_pos = pd.Series(' '.join(positive["com_remv"]).lower().split()).value_counts()[:10]
most_freq_neg = pd.Series(' '.join(negative["com_remv"]).lower().split()).value_counts()[:10]
most_freq_neu = pd.Series(' '.join(neutral["com_remv"]).lower().split()).value_counts()[:10]

In [183]:
df_words["label"].value_counts()

 0.0    435
 1.0     31
-1.0      1
Name: label, dtype: int64

In [178]:
# Rows of df for each predicted label, and the stop-word-free text of each.
pos1 = df.loc[df["label"] == 1.0]
neg1 = df.loc[df["label"] == -1.0]
neu1 = df.loc[df["label"] == 0.0]

positive = pos1["com_remv"]
negative = neg1["com_remv"]
neutral = neu1["com_remv"]

# Ten most frequent words per label: pool the texts, lower-case, split on
# whitespace and count.
pos_words = ' '.join(positive).lower().split()
neg_words = ' '.join(negative).lower().split()
neu_words = ' '.join(neutral).lower().split()
most_freq_pos = pd.Series(pos_words).value_counts()[:10]
most_freq_neg = pd.Series(neg_words).value_counts()[:10]
most_freq_neu = pd.Series(neu_words).value_counts()[:10]

KeyError: 'label'

In [165]:
#df[["comment", "label"]].where(df[['label']] == 1.0)

In [173]:
pos1[["comment", "label"]].head(2)

Unnamed: 0,comment,label
12,One of the best football games played ever ...,1.0
18,I like germany team,1.0


In [167]:
import dash_html_components as html
def generate_table(dataframe, max_rows=10):
    """Build an ``html.Table`` from the first rows of ``dataframe``.

    The table starts with one header row (an ``html.Th`` per column name),
    followed by one ``html.Tr`` per data row with an ``html.Td`` per column.
    At most ``max_rows`` data rows are emitted, fewer if the frame is shorter.
    """
    column_names = dataframe.columns
    # Header
    header_row = html.Tr([html.Th(name) for name in column_names])
    # Body
    row_count = min(len(dataframe), max_rows)
    body_rows = []
    for position in range(row_count):
        cells = [html.Td(dataframe.iloc[position][name]) for name in column_names]
        body_rows.append(html.Tr(cells))
    return html.Table([header_row] + body_rows)

def table_update(selected_dropdown_value):
    """Return the first five training rows carrying the selected sentiment.

    ``selected_dropdown_value`` is looked up in a fixed mapping
    ("Positive" -> 1.0, "Neutral" -> 0.0, "Negative" -> -1.0) and the
    label/comment columns of the notebook-level ``data`` frame are filtered on
    the resulting label. A value missing from the mapping is looked up as
    ``None``, which matches no row, so the result is empty.
    """
    label_and_comment = data[["label","comment"]]
    label_for_choice = {"Positive": 1.0, "Neutral": 0.0, "Negative": -1.0}
    wanted_label = label_for_choice.get(selected_dropdown_value)
    matches = label_and_comment[label_and_comment["label"] == wanted_label]
    return matches.head(5)

In [168]:
table_update("Positive")

Unnamed: 0,label,comment
2,1.0,OH SHIT WHEN I SAW THIS ON MY FRONT PAGE ...
3,1.0,Blowing my mind yet again
5,1.0,Made me smile Great work
7,1.0,The mad methodical geniuses do it again
8,1.0,Am I the only person who actually loves their ...


In [169]:
simple_df = data[["label","comment"]]
# The assignment had no right-hand side, which is a SyntaxError that stops
# the whole cell from running. "Positive" is one of the keys of `selected`.
# NOTE(review): chosen because the filtered_df.head(3) output in this
# notebook shows rows with label 1.0 - confirm the intended selection.
selected_dropdown_value = "Positive"
selected = {"Positive": 1.0, "Neutral": 0.0, "Negative": -1.0}
filtered_df = simple_df[simple_df["label"]==selected.get(selected_dropdown_value)]

In [190]:
def generate_table(dataframe, max_rows=10):
    """Render up to ``max_rows`` leading rows of ``dataframe`` as an ``html.Table``.

    The returned table's children are a header ``html.Tr`` (one ``html.Th``
    per column) followed by one ``html.Tr`` of ``html.Td`` cells per row.
    """
    columns = dataframe.columns
    shown = min(len(dataframe), max_rows)

    def render_row(i):
        return html.Tr([html.Td(dataframe.iloc[i][col]) for col in columns])

    header = [html.Tr([html.Th(col) for col in columns])]
    body = [render_row(i) for i in range(shown)]
    return html.Table(header + body)
selected_dropdown_value = "All Comments"
def table_update(value):
    """Return an HTML table of training comments for the chosen sentiment.

    ``"All Comments"`` renders the label/comment columns of the notebook-level
    ``data`` frame unfiltered. Any other ``value`` is mapped through
    "Positive" -> 1.0, "Neutral" -> 0.0, "Negative" -> -1.0 and only rows with
    that label are rendered. Rendering is delegated to ``generate_table``.
    """
    simple_df = data[["label","comment"]]
    if value == "All Comments":
        return generate_table(simple_df)
    selected = {"Positive": 1.0, "Neutral": 0.0, "Negative": -1.0}
    wanted_label = selected.get(value)
    return generate_table(simple_df[simple_df["label"] == wanted_label])

In [191]:
table_update(selected_dropdown_value)

Table([Tr([Th('label'), Th('comment')]), Tr([Td(-1.0), Td('Everyone knows brand s papers from  But  No one  knows about welfare of employees     It s so bad bad bad   bad     ')]), Tr([Td(0.0), Td(' Your paper cut balance is                 ')]), Tr([Td(1.0), Td('OH SHIT WHEN I SAW THIS ON MY FRONT PAGE       I LOVE THIS SONG')]), Tr([Td(1.0), Td('Blowing my mind yet again')]), Tr([Td(0.0), Td('Should have gone with Dunder Mifflin')]), Tr([Td(1.0), Td('Made me smile  Great work')]), Tr([Td(1.0), Td('The mad methodical geniuses do it again ')]), Tr([Td(1.0), Td('Am I the only person who actually loves their music and not just the video ')]), Tr([Td(1.0), Td('The ad was the best   the store owner saved alot on payroll    hire her and it s like hiring six people   ')]), Tr([Td(-1.0), Td('Waste of ink and paper     ')])])

In [174]:
filtered_df.head(3)

Unnamed: 0,label,comment
2,1.0,OH SHIT WHEN I SAW THIS ON MY FRONT PAGE ...
3,1.0,Blowing my mind yet again
5,1.0,Made me smile Great work


In [171]:
# `sel` was not defined anywhere in the notebook, so this cell only worked
# with leftover kernel state. Define it here.
# NOTE(review): assumed to be the same sentiment mapping used by the other
# cells; the recorded output 1.0 is consistent with that - confirm.
sel = {"Positive": 1.0, "Neutral": 0.0, "Negative": -1.0}
sel.get("Positive")

1.0

In [172]:
simple_df = data[["label","comment"]]
simple_df.head(3)

Unnamed: 0,label,comment
0,-1.0,Everyone knows brand s papers from But No on...
1,0.0,Your paper cut balance is
2,1.0,OH SHIT WHEN I SAW THIS ON MY FRONT PAGE ...


In [175]:
most_freq_pos.index

Index(['germany', 'brazil', 'love', 'best', 'amazing', 'team', 'great',
       'football', 'day', 'never'],
      dtype='object')

In [176]:
most_freq_pos.values

array([8, 6, 5, 4, 3, 3, 3, 3, 3, 2])

In [177]:
most_freq_neg.index

Index(['still', 'hurts'], dtype='object')

In [196]:
# Display names of the classifiers, keyed by their prediction column label.
mydict = dict(
    label_lr='Logistic Regression',
    label_mnb='Multinomial Naive Bayes',
    label_svm='Support Vector Machine',
    label_rf='Random Forest',
    label_knn='K-Nearest Neighbor',
)

In [198]:
print(mydict.get('label_lr'))

Logistic Regression
