# Bayesian Final Project

## Sudeepti Surapaneni, Sania Rasheed, Arti Patel

In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
import math
import pymc3 as pm
import arviz as az
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import nltk
import re



In [2]:
#os.chdir('C:/Users/Arti Patel/Desktop/bayesian_ml/final')
analyser = SentimentIntensityAnalyzer()

In [3]:
df = pd.read_csv('train-balanced-sarcasm.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
df.sample(5)

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
704656,0,Those don't remove the fumes from film and pap...,kwirky88,Darkroom,2.0,2.0,0.0,2015-04,4/1/15 18:04,You could always try a few small standing filt...
368240,0,Insulation is not limited to timber constructi...,_I_Have_Opinions_,polandball,14.0,14.0,0.0,2016-04,4/15/16 15:03,"Our glorious Norwegian homes are made of wood,..."
7270,1,"Yes, please use the search bar to see if your ...",sombrez,meirl,8.0,-1.0,-1.0,2016-12,12/14/16 0:48,This has been posted before?
326996,0,I think it may also be that this picture has b...,fagnerd,pokemon,1.0,1.0,0.0,2016-08,8/24/16 1:45,Because memes have more value than good art. I...
630659,0,"I didn't say you are Hitler, but yes, what you...",andyetanotherkiwi,relationships,17.0,17.0,0.0,2015-07,7/21/15 19:22,Let's be reasonable and limit the hyperbole


In [5]:
df.shape

(1010827, 10)

### Initial Exploratory Analysis

In [6]:
#how many unique subreddits in original dataset
df['subreddit'].unique().shape

(14879,)

In [7]:
#cleaning label column:
#checking to see format of 1's and 0's: not all are numeric format, dropping 1 row that has no label value
proper_labels = ['0','1',0,1]
df = df.query('label in @proper_labels')
df['label'].unique()

array([0, 1, '1', '0'], dtype=object)

In [8]:
#new shape
df.shape

(1010826, 10)

In [9]:
#generate length of comment feature
df['comment'] = df['comment'].astype('str')
word_count = df['comment'].str.split()
df['comment_count'] = word_count.apply(len)

In [10]:
#generate length of parent column feature
df['parent_comment'] = df['parent_comment'].astype('str')
word_count_parent = df['parent_comment'].str.split()
df['parent_comment_count'] = word_count_parent.apply(len)

In [11]:
df.head(5)

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment,comment_count,parent_comment_count
0,0,NC and NH.,Trumpbart,politics,2.0,-1.0,-1.0,2016-10,10/16/16 23:55,"Yeah, I get that argument. At this point, I'd ...",3,17
1,0,You do know west teams play against west teams...,Shbshb906,nba,-4.0,-1.0,-1.0,2016-11,11/1/16 0:24,The blazers and Mavericks (The wests 5 and 6 s...,14,27
2,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,3.0,3.0,0.0,2016-09,9/22/16 21:45,They're favored to win.,19,4
3,0,"This meme isn't funny none of the ""new york ni...",icebrotha,BlackPeopleTwitter,-8.0,-1.0,-1.0,2016-10,10/18/16 21:03,deadass don't kill my buzz,12,5
4,0,I could use one of those tools.,cush2push,MaddenUltimateTeam,6.0,-1.0,-1.0,2016-12,12/30/16 17:00,Yep can confirm I saw the tool they use for th...,7,18


In [12]:
#top subreddit pages (by no. of comments)
count_subreddit = df.groupby('subreddit').size().sort_values(ascending=False)
count_subreddit.head(15)

subreddit
AskReddit          65677
politics           39496
worldnews          26377
leagueoflegends    21037
pcmasterrace       18988
funny              17939
news               16891
pics               16154
todayilearned      14161
nfl                14150
nba                14146
GlobalOffensive    13740
AdviceAnimals      13483
videos             12320
gaming             11906
dtype: int64

In [13]:
#make new df with just the top ten subreddits (we leave out the "pics" subreddit)
subreddit_subset = ['AskReddit','politics','worldnews','leagueoflegends','pcmasterrace','funny','news','todayilearned','nfl','nba']
dfsub = df.query('subreddit in @subreddit_subset')

In [14]:
#shape of new df with subset of subreddits
dfsub.shape

(248862, 12)

In [15]:
dfsub[dfsub['label']==1].shape

(121819, 12)

In [16]:
dfsub[dfsub['label']=='1'].shape

(8284, 12)

In [17]:
dfsub[dfsub['label']==0].shape

(111223, 12)

In [18]:
dfsub[dfsub['label']=='0'].shape

(7536, 12)

In [19]:
#new subset df is 47% non sarcastic vs 53% sarcastic - almost balanced

In [20]:
#group by subreddit to see which subreddits are most sarcastic
sarcasm_subreddit = df[['subreddit','label']].groupby('subreddit').sum()
sarcasm_subreddit['label'] = sarcasm_subreddit['label'].astype('float64')

In [21]:
sarcasm_subreddit = sarcasm_subreddit.sort_values(by='label', ascending=False)
sarcasm_subreddit.head(15)

Unnamed: 0_level_0,label
subreddit,Unnamed: 1_level_1
pokemongo,1149.0
EnoughTrumpSpam,821.0
reddit.com,768.0
ClashRoyale,683.0
battlefield_one,261.0
uncensorednews,245.0
Infinitewarfare,173.0
enoughsandersspam,163.0
deadbydaylight,157.0
TickTockManitowoc,147.0


In [22]:
#download package that can tokenize words and create bigrams and trigrams
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/su/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [23]:
#test to tokenize words
w = nltk.word_tokenize(df['comment'][0])
w
b = list(nltk.bigrams(w))
b

[('NC', 'and'), ('and', 'NH'), ('NH', '.')]

In [24]:
my_bigrams = [list(nltk.bigrams(i)) for i in word_count]

In [25]:
df['bigrams'] = my_bigrams

In [26]:
my_trigrams = [list(nltk.trigrams(i)) for i in word_count]

In [27]:
df['trigrams'] = my_trigrams

In [28]:
df.head()

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment,comment_count,parent_comment_count,bigrams,trigrams
0,0,NC and NH.,Trumpbart,politics,2.0,-1.0,-1.0,2016-10,10/16/16 23:55,"Yeah, I get that argument. At this point, I'd ...",3,17,"[(NC, and), (and, NH.)]","[(NC, and, NH.)]"
1,0,You do know west teams play against west teams...,Shbshb906,nba,-4.0,-1.0,-1.0,2016-11,11/1/16 0:24,The blazers and Mavericks (The wests 5 and 6 s...,14,27,"[(You, do), (do, know), (know, west), (west, t...","[(You, do, know), (do, know, west), (know, wes..."
2,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,3.0,3.0,0.0,2016-09,9/22/16 21:45,They're favored to win.,19,4,"[(They, were), (were, underdogs), (underdogs, ...","[(They, were, underdogs), (were, underdogs, ea..."
3,0,"This meme isn't funny none of the ""new york ni...",icebrotha,BlackPeopleTwitter,-8.0,-1.0,-1.0,2016-10,10/18/16 21:03,deadass don't kill my buzz,12,5,"[(This, meme), (meme, isn't), (isn't, funny), ...","[(This, meme, isn't), (meme, isn't, funny), (i..."
4,0,I could use one of those tools.,cush2push,MaddenUltimateTeam,6.0,-1.0,-1.0,2016-12,12/30/16 17:00,Yep can confirm I saw the tool they use for th...,7,18,"[(I, could), (could, use), (use, one), (one, o...","[(I, could, use), (could, use, one), (use, one..."


In [29]:
df['bigram_sent'] = df.apply(lambda x: [], axis=1)

In [30]:
#gets polarity score for each bigram in comment column, takes a few min to run
for index, row in df.iterrows():
    mylist = row['bigrams']
    for j in mylist:
        bi_one = ''.join(j)
        #t = mylist.index(j)
        df['bigram_sent'][index].append(analyser.polarity_scores(bi_one))

In [31]:
df['bigram_sent'].sample(25)

913324    [{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compoun...
487578    [{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compoun...
880235    [{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compoun...
725174    [{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compoun...
719541    [{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compoun...
944355    [{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compoun...
640808    [{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compoun...
674965    [{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compoun...
373073    [{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compoun...
149452    [{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compoun...
581652    [{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compoun...
621462    [{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compoun...
534248    [{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compoun...
930725    [{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compoun...
479511                                                   []
62914     [{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compoun...
487052    [{'neg': 0.0, 'neu': 1.0, 'pos

In [32]:
#polarity scores doesn't seem to work on bigrams - we think this may be because there are a lot of "filler/function" words
#still remaining in text that prevent vader from picking up on positive/negative sentiment
#because we did not find a useful feature out of this, we did not continue finding sentiment in trigrams, the following 3
#cells are markdown because they are not used for the purposes of this project/model

df['trigram_sent'] = df.apply(lambda x: [], axis=1)

for index, row in df.iterrows():
    mylist = row['trigrams']
    for j in mylist:
        tri_one = ''.join(j)
        #t = mylist.index(j)
        df['trigram_sent'][index].append(analyser.polarity_scores(tri_one))

df['trigram_sent'].sample(25)

In [33]:
# use vader tool to get polarity scores on comments as a whole
p = [analyser.polarity_scores(i) for i in dfsub['comment']]
dfsub['polarity'] = p

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [34]:
#break out the individual neg, pos, neutral and compound scores and make columns for each
l = list(dfsub['polarity'])
dfsub['neg'] = [i.get('neg') for i in l]
dfsub['neu'] = [i.get('neu') for i in l]
dfsub['pos'] = [i.get('pos') for i in l]
dfsub['compound'] = [i.get('compound') for i in l]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_

In [37]:
#display
dfsub.head()

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment,comment_count,parent_comment_count,polarity,neg,neu,pos,compound,pcp
0,0,NC and NH.,Trumpbart,politics,2.0,-1.0,-1.0,2016-10,10/16/16 23:55,"Yeah, I get that argument. At this point, I'd ...",3,17,"{'neg': 0.0, 'neu': 0.337, 'pos': 0.663, 'comp...",0.0,0.337,0.663,0.6037,"{'neg': 0.126, 'neu': 0.657, 'pos': 0.217, 'co..."
1,0,You do know west teams play against west teams...,Shbshb906,nba,-4.0,-1.0,-1.0,2016-11,11/1/16 0:24,The blazers and Mavericks (The wests 5 and 6 s...,14,27,"{'neg': 0.0, 'neu': 0.844, 'pos': 0.156, 'comp...",0.0,0.844,0.156,0.34,"{'neg': 0.095, 'neu': 0.905, 'pos': 0.0, 'comp..."
2,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,3.0,3.0,0.0,2016-09,9/22/16 21:45,They're favored to win.,19,4,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,1.0,0.0,0.0,"{'neg': 0.0, 'neu': 0.233, 'pos': 0.767, 'comp..."
5,0,"I don't pay attention to her, but as long as s...",only7inches,AskReddit,0.0,0.0,0.0,2016-09,9/2/16 10:35,do you find ariana grande sexy ?,24,7,"{'neg': 0.0, 'neu': 0.868, 'pos': 0.132, 'comp...",0.0,0.868,0.132,0.2259,"{'neg': 0.0, 'neu': 0.595, 'pos': 0.405, 'comp..."
6,0,Trick or treating in general is just weird...,only7inches,AskReddit,1.0,-1.0,-1.0,2016-10,10/23/16 21:43,What's your weird or unsettling Trick or Treat...,8,9,"{'neg': 0.146, 'neu': 0.854, 'pos': 0.0, 'comp...",0.146,0.854,0.0,-0.0516,"{'neg': 0.25, 'neu': 0.517, 'pos': 0.233, 'com..."


In [36]:
#same process for parent comment - vader tool implemented to find parent comment polarity scores:
pcp = [analyser.polarity_scores(i) for i in dfsub['parent_comment']]
dfsub['pcp'] = pcp

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [39]:
dfsub.head()

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment,comment_count,parent_comment_count,polarity,neg,neu,pos,compound,pcp
0,0,NC and NH.,Trumpbart,politics,2.0,-1.0,-1.0,2016-10,10/16/16 23:55,"Yeah, I get that argument. At this point, I'd ...",3,17,"{'neg': 0.0, 'neu': 0.337, 'pos': 0.663, 'comp...",0.0,0.337,0.663,0.6037,"{'neg': 0.126, 'neu': 0.657, 'pos': 0.217, 'co..."
1,0,You do know west teams play against west teams...,Shbshb906,nba,-4.0,-1.0,-1.0,2016-11,11/1/16 0:24,The blazers and Mavericks (The wests 5 and 6 s...,14,27,"{'neg': 0.0, 'neu': 0.844, 'pos': 0.156, 'comp...",0.0,0.844,0.156,0.34,"{'neg': 0.095, 'neu': 0.905, 'pos': 0.0, 'comp..."
2,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,3.0,3.0,0.0,2016-09,9/22/16 21:45,They're favored to win.,19,4,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,1.0,0.0,0.0,"{'neg': 0.0, 'neu': 0.233, 'pos': 0.767, 'comp..."
5,0,"I don't pay attention to her, but as long as s...",only7inches,AskReddit,0.0,0.0,0.0,2016-09,9/2/16 10:35,do you find ariana grande sexy ?,24,7,"{'neg': 0.0, 'neu': 0.868, 'pos': 0.132, 'comp...",0.0,0.868,0.132,0.2259,"{'neg': 0.0, 'neu': 0.595, 'pos': 0.405, 'comp..."
6,0,Trick or treating in general is just weird...,only7inches,AskReddit,1.0,-1.0,-1.0,2016-10,10/23/16 21:43,What's your weird or unsettling Trick or Treat...,8,9,"{'neg': 0.146, 'neu': 0.854, 'pos': 0.0, 'comp...",0.146,0.854,0.0,-0.0516,"{'neg': 0.25, 'neu': 0.517, 'pos': 0.233, 'com..."


In [38]:
p = list(dfsub['pcp'])

In [39]:
dfsub['pc_neg'] = [j.get('neg') for j in p]
dfsub['pc_neu'] = [j.get('neu') for j in p]
dfsub['pc_pos'] = [j.get('pos') for j in p]
dfsub['pc_compund'] = [j.get('compound') for j in p]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .l

In [40]:
dfsub.head()

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment,...,polarity,neg,neu,pos,compound,pcp,pc_neg,pc_neu,pc_pos,pc_compund
0,0,NC and NH.,Trumpbart,politics,2.0,-1.0,-1.0,2016-10,10/16/16 23:55,"Yeah, I get that argument. At this point, I'd ...",...,"{'neg': 0.0, 'neu': 0.337, 'pos': 0.663, 'comp...",0.0,0.337,0.663,0.6037,"{'neg': 0.126, 'neu': 0.657, 'pos': 0.217, 'co...",0.126,0.657,0.217,0.2023
1,0,You do know west teams play against west teams...,Shbshb906,nba,-4.0,-1.0,-1.0,2016-11,11/1/16 0:24,The blazers and Mavericks (The wests 5 and 6 s...,...,"{'neg': 0.0, 'neu': 0.844, 'pos': 0.156, 'comp...",0.0,0.844,0.156,0.34,"{'neg': 0.095, 'neu': 0.905, 'pos': 0.0, 'comp...",0.095,0.905,0.0,-0.3412
2,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,3.0,3.0,0.0,2016-09,9/22/16 21:45,They're favored to win.,...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,1.0,0.0,0.0,"{'neg': 0.0, 'neu': 0.233, 'pos': 0.767, 'comp...",0.0,0.233,0.767,0.765
5,0,"I don't pay attention to her, but as long as s...",only7inches,AskReddit,0.0,0.0,0.0,2016-09,9/2/16 10:35,do you find ariana grande sexy ?,...,"{'neg': 0.0, 'neu': 0.868, 'pos': 0.132, 'comp...",0.0,0.868,0.132,0.2259,"{'neg': 0.0, 'neu': 0.595, 'pos': 0.405, 'comp...",0.0,0.595,0.405,0.5267
6,0,Trick or treating in general is just weird...,only7inches,AskReddit,1.0,-1.0,-1.0,2016-10,10/23/16 21:43,What's your weird or unsettling Trick or Treat...,...,"{'neg': 0.146, 'neu': 0.854, 'pos': 0.0, 'comp...",0.146,0.854,0.0,-0.0516,"{'neg': 0.25, 'neu': 0.517, 'pos': 0.233, 'com...",0.25,0.517,0.233,0.2023


In [41]:
#create column that indicates any elongation in words
def has_long(sentence):
   elong = re.compile("([a-zA-Z])\\1{2,}")
   return bool(elong.search(sentence))

#df.apply(has_long(df.comment.values), axis=0)
dfsub['elong'] = [has_long(i) for i in dfsub['comment']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [42]:
#count of comments containing elongated words
dfsub[dfsub['elong']==True].shape

(2765, 23)

In [43]:
#write csv to new file
dfsub.to_csv('dfsub.csv', header=True, index=False)