## This notebook shows the Exploratory Data Analysis process and cleaning process of both reddit CSV's

In [1]:
#import necessary libraries
import numpy as np
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [1]:
#import data
confessions = pd.read_csv('./reddit_datasets/confession_data.csv', index_col = 'Unnamed: 0')
ventings = pd.read_csv('./reddit_datasets/venting_data.csv', index_col = 'Unnamed: 0')

NameError: name 'pd' is not defined

In [2]:
#check df
confessions.head()

NameError: name 'confessions' is not defined

In [3]:
ventings.head()

NameError: name 'ventings' is not defined

## Create new column to be used a our target feature

In [5]:
#confession posts will be set to equal 1
confessions['sub_reddit'] = 1

In [6]:
#venting posts will be set to equal 2
ventings['sub_reddit'] = 0

In [7]:
#check shape
confessions.shape


(1024, 100)

In [8]:
#drop duplicate scraped posts based on after id
confessions.drop_duplicates(subset = 'id', keep = 'first', inplace = True)

In [9]:
#check if rows were dropped
confessions.shape

(974, 100)

In [10]:
#check shape of ventings df
ventings.shape

(1072, 100)

In [11]:
##drop duplicate scraped posts based on after id
ventings.drop_duplicates(subset = 'id', keep = 'first', inplace = True)

In [12]:
#check to see if rows were dropped
ventings.shape

(997, 100)

In [13]:
#get only necessary text columns from data frame
ventings = ventings[['selftext', 'title', 'timestamp', 'sub_reddit']]

In [14]:
#get only necessary text columns from data frame
confessions = confessions[['selftext', 'title', 'timestamp', 'sub_reddit']]

In [15]:
#combine both data frames to later be train/test split
df = pd.concat([confessions, ventings])
df.shape

(1971, 4)

In [16]:
#most values will share a same index when you concatenate the dataframes
#we need to reset indexes so there will not be two values to one index on most of the values
df.reset_index(inplace=True,drop=True)

In [17]:
df.head()

Unnamed: 0,selftext,title,timestamp,sub_reddit
0,"We were at the park, kids were taking turns go...",My toddler chest-kicked another kid into the a...,2019-07-08 05:13:30,1
1,I’m not a good looking guy and I don’t say it ...,The most female interaction I’ve ever had in m...,2019-07-08 08:23:10,1
2,I know a gal who has two kids and is pregnant ...,I think people with genetic conditions that th...,2019-07-08 20:15:16,1
3,This is pretty hard for me to post. Ive never ...,I almost killed my step-mom..,2019-07-07 16:50:52,1
4,"After livelier-than-average sex, my wife said ...",My Wife Asked a Loaded Question and I Lied to Her,2019-07-08 21:46:59,1


## Check for null values

In [18]:
print(df['selftext'].isnull().sum())

136


In [19]:
print(df['title'].isnull().sum())

0


In [1]:
#null values in the body will be due to pictures of no values so do not drop
#change null to a neutral word 
df.loc[df["selftext"].isnull(), 'selftext'] = 'picture'

NameError: name 'df' is not defined

In [21]:
#check if any nulls
df['selftext'].isnull().sum()

0

## Feature Engineering

In [22]:
#combine title and selftext to make another feature
df['t_s'] = df['title'] + df['selftext']  

In [23]:
df.columns

Index(['selftext', 'title', 'timestamp', 'sub_reddit', 't_s'], dtype='object')

In [24]:
#instatiate Vader
analyzer = SentimentIntensityAnalyzer()

In [25]:
#test vader to see if it works
analyzer.polarity_scores(df['t_s'][3])['compound']

-0.9978

## Create a loop to see if Vader runs correctly on each post

In [29]:
#create a loop to give us back the sentiment analysis score 
#for each self text post
scores = []
for post in range(len(df['selftext'])):
    print(post)
    print(analyzer.polarity_scores(df['selftext'][post]))

# scores_df = pd.DataFrame(list_of_scores)

0
{'neg': 0.045, 'neu': 0.83, 'pos': 0.125, 'compound': 0.9861}
1
{'neg': 0.044, 'neu': 0.782, 'pos': 0.173, 'compound': 0.9939}
2
{'neg': 0.197, 'neu': 0.727, 'pos': 0.076, 'compound': -0.9905}
3
{'neg': 0.143, 'neu': 0.741, 'pos': 0.116, 'compound': -0.9976}
4
{'neg': 0.15, 'neu': 0.717, 'pos': 0.132, 'compound': -0.847}
5
{'neg': 0.076, 'neu': 0.793, 'pos': 0.131, 'compound': 0.9665}
6
{'neg': 0.3, 'neu': 0.656, 'pos': 0.044, 'compound': -0.9874}
7
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
8
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
9
{'neg': 0.068, 'neu': 0.742, 'pos': 0.191, 'compound': 0.768}
10
{'neg': 0.049, 'neu': 0.797, 'pos': 0.154, 'compound': 0.8349}
11
{'neg': 0.029, 'neu': 0.877, 'pos': 0.094, 'compound': 0.9273}
12
{'neg': 0.042, 'neu': 0.838, 'pos': 0.119, 'compound': 0.9382}
13
{'neg': 0.086, 'neu': 0.787, 'pos': 0.127, 'compound': 0.7755}
14
{'neg': 0.129, 'neu': 0.717, 'pos': 0.154, 'compound': 0.3446}
15
{'neg': 0.171, 'neu': 0.689, 'pos': 0

{'neg': 0.096, 'neu': 0.8, 'pos': 0.104, 'compound': 0.1686}
133
{'neg': 0.095, 'neu': 0.843, 'pos': 0.062, 'compound': -0.7168}
134
{'neg': 0.158, 'neu': 0.808, 'pos': 0.034, 'compound': -0.9875}
135
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
136
{'neg': 0.171, 'neu': 0.71, 'pos': 0.119, 'compound': -0.8701}
137
{'neg': 0.097, 'neu': 0.787, 'pos': 0.116, 'compound': 0.3714}
138
{'neg': 0.32, 'neu': 0.634, 'pos': 0.046, 'compound': -0.9796}
139
{'neg': 0.0, 'neu': 0.679, 'pos': 0.321, 'compound': 0.8176}
140
{'neg': 0.169, 'neu': 0.831, 'pos': 0.0, 'compound': -0.961}
141
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
142
{'neg': 0.142, 'neu': 0.773, 'pos': 0.085, 'compound': -0.9997}
143
{'neg': 0.114, 'neu': 0.839, 'pos': 0.047, 'compound': -0.8534}
144
{'neg': 0.227, 'neu': 0.638, 'pos': 0.135, 'compound': -0.8818}
145
{'neg': 0.081, 'neu': 0.858, 'pos': 0.061, 'compound': -0.8588}
146
{'neg': 0.158, 'neu': 0.698, 'pos': 0.144, 'compound': -0.2033}
147
{'neg': 0.53

{'neg': 0.13, 'neu': 0.79, 'pos': 0.08, 'compound': -0.9973}
289
{'neg': 0.0, 'neu': 0.855, 'pos': 0.145, 'compound': 0.9054}
290
{'neg': 0.047, 'neu': 0.872, 'pos': 0.081, 'compound': 0.8554}
291
{'neg': 0.09, 'neu': 0.857, 'pos': 0.053, 'compound': -0.5289}
292
{'neg': 0.28, 'neu': 0.675, 'pos': 0.045, 'compound': -0.9729}
293
{'neg': 0.307, 'neu': 0.607, 'pos': 0.085, 'compound': -0.6621}
294
{'neg': 0.232, 'neu': 0.706, 'pos': 0.062, 'compound': -0.8176}
295
{'neg': 0.184, 'neu': 0.704, 'pos': 0.112, 'compound': -0.394}
296
{'neg': 0.0, 'neu': 0.696, 'pos': 0.304, 'compound': 0.7184}
297
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
298
{'neg': 0.124, 'neu': 0.736, 'pos': 0.14, 'compound': 0.4767}
299
{'neg': 0.067, 'neu': 0.779, 'pos': 0.154, 'compound': 0.9852}
300
{'neg': 0.048, 'neu': 0.916, 'pos': 0.036, 'compound': -0.0018}
301
{'neg': 0.169, 'neu': 0.708, 'pos': 0.123, 'compound': -0.7374}
302
{'neg': 0.196, 'neu': 0.773, 'pos': 0.031, 'compound': -0.7783}
303
{'neg'

{'neg': 0.147, 'neu': 0.709, 'pos': 0.144, 'compound': 0.4981}
423
{'neg': 0.149, 'neu': 0.71, 'pos': 0.141, 'compound': -0.808}
424
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
425
{'neg': 0.196, 'neu': 0.642, 'pos': 0.162, 'compound': -0.6816}
426
{'neg': 0.054, 'neu': 0.735, 'pos': 0.211, 'compound': 0.9915}
427
{'neg': 0.0, 'neu': 0.739, 'pos': 0.261, 'compound': 0.6486}
428
{'neg': 0.151, 'neu': 0.849, 'pos': 0.0, 'compound': -0.4939}
429
{'neg': 0.283, 'neu': 0.628, 'pos': 0.089, 'compound': -0.6774}
430
{'neg': 0.276, 'neu': 0.476, 'pos': 0.248, 'compound': -0.1531}
431
{'neg': 0.055, 'neu': 0.721, 'pos': 0.224, 'compound': 0.9548}
432
{'neg': 0.026, 'neu': 0.84, 'pos': 0.134, 'compound': 0.891}
433
{'neg': 0.059, 'neu': 0.838, 'pos': 0.103, 'compound': 0.659}
434
{'neg': 0.181, 'neu': 0.626, 'pos': 0.193, 'compound': 0.046}
435
{'neg': 0.117, 'neu': 0.818, 'pos': 0.065, 'compound': -0.8627}
436
{'neg': 0.242, 'neu': 0.691, 'pos': 0.067, 'compound': -0.8225}
437
{'neg':

{'neg': 0.056, 'neu': 0.846, 'pos': 0.098, 'compound': 0.8591}
553
{'neg': 0.281, 'neu': 0.608, 'pos': 0.11, 'compound': -0.6705}
554
{'neg': 0.119, 'neu': 0.819, 'pos': 0.062, 'compound': -0.7172}
555
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
556
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
557
{'neg': 0.014, 'neu': 0.947, 'pos': 0.038, 'compound': 0.5719}
558
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
559
{'neg': 0.152, 'neu': 0.725, 'pos': 0.124, 'compound': -0.9679}
560
{'neg': 0.051, 'neu': 0.699, 'pos': 0.25, 'compound': 0.998}
561
{'neg': 0.108, 'neu': 0.892, 'pos': 0.0, 'compound': -0.6369}
562
{'neg': 0.126, 'neu': 0.836, 'pos': 0.038, 'compound': -0.653}
563
{'neg': 0.081, 'neu': 0.679, 'pos': 0.239, 'compound': 0.9884}
564
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
565
{'neg': 0.142, 'neu': 0.722, 'pos': 0.136, 'compound': -0.4172}
566
{'neg': 0.193, 'neu': 0.694, 'pos': 0.113, 'compound': -0.9591}
567
{'neg': 0.078, 'neu': 0.886, '

{'neg': 0.109, 'neu': 0.77, 'pos': 0.121, 'compound': 0.2855}
730
{'neg': 0.156, 'neu': 0.844, 'pos': 0.0, 'compound': -0.9739}
731
{'neg': 0.132, 'neu': 0.768, 'pos': 0.1, 'compound': -0.9906}
732
{'neg': 0.013, 'neu': 0.955, 'pos': 0.032, 'compound': 0.723}
733
{'neg': 0.106, 'neu': 0.872, 'pos': 0.022, 'compound': -0.6633}
734
{'neg': 0.129, 'neu': 0.82, 'pos': 0.052, 'compound': -0.9949}
735
{'neg': 0.148, 'neu': 0.756, 'pos': 0.097, 'compound': -0.9607}
736
{'neg': 0.105, 'neu': 0.767, 'pos': 0.128, 'compound': 0.7598}
737
{'neg': 0.067, 'neu': 0.877, 'pos': 0.056, 'compound': -0.0258}
738
{'neg': 0.251, 'neu': 0.613, 'pos': 0.137, 'compound': -0.3446}
739
{'neg': 0.243, 'neu': 0.715, 'pos': 0.042, 'compound': -0.8791}
740
{'neg': 0.167, 'neu': 0.724, 'pos': 0.109, 'compound': -0.8228}
741
{'neg': 0.275, 'neu': 0.725, 'pos': 0.0, 'compound': -0.5848}
742
{'neg': 0.267, 'neu': 0.643, 'pos': 0.09, 'compound': -0.9935}
743
{'neg': 0.086, 'neu': 0.806, 'pos': 0.107, 'compound': 0.9334

{'neg': 0.074, 'neu': 0.744, 'pos': 0.182, 'compound': 0.9893}
858
{'neg': 0.191, 'neu': 0.707, 'pos': 0.101, 'compound': -0.9594}
859
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
860
{'neg': 0.088, 'neu': 0.912, 'pos': 0.0, 'compound': -0.5719}
861
{'neg': 0.177, 'neu': 0.823, 'pos': 0.0, 'compound': -0.6801}
862
{'neg': 0.053, 'neu': 0.916, 'pos': 0.03, 'compound': -0.1608}
863
{'neg': 0.051, 'neu': 0.93, 'pos': 0.019, 'compound': -0.534}
864
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
865
{'neg': 0.269, 'neu': 0.662, 'pos': 0.069, 'compound': -0.9823}
866
{'neg': 0.09, 'neu': 0.748, 'pos': 0.161, 'compound': 0.7343}
867
{'neg': 0.178, 'neu': 0.73, 'pos': 0.092, 'compound': -0.7622}
868
{'neg': 0.011, 'neu': 0.979, 'pos': 0.009, 'compound': -0.0745}
869
{'neg': 0.067, 'neu': 0.609, 'pos': 0.324, 'compound': 0.9968}
870
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
871
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
872
{'neg': 0.079, 'neu': 0.873, 'po

{'neg': 0.125, 'neu': 0.765, 'pos': 0.11, 'compound': -0.7513}
1008
{'neg': 0.179, 'neu': 0.632, 'pos': 0.19, 'compound': 0.0795}
1009
{'neg': 0.182, 'neu': 0.704, 'pos': 0.115, 'compound': -0.9909}
1010
{'neg': 0.158, 'neu': 0.675, 'pos': 0.167, 'compound': -0.6945}
1011
{'neg': 0.076, 'neu': 0.838, 'pos': 0.086, 'compound': 0.2159}
1012
{'neg': 0.078, 'neu': 0.71, 'pos': 0.212, 'compound': 0.9876}
1013
{'neg': 0.112, 'neu': 0.67, 'pos': 0.219, 'compound': 0.796}
1014
{'neg': 0.139, 'neu': 0.773, 'pos': 0.088, 'compound': -0.8904}
1015
{'neg': 0.22, 'neu': 0.767, 'pos': 0.013, 'compound': -0.9545}
1016
{'neg': 0.198, 'neu': 0.696, 'pos': 0.106, 'compound': -0.9347}
1017
{'neg': 0.064, 'neu': 0.868, 'pos': 0.068, 'compound': -0.3714}
1018
{'neg': 0.226, 'neu': 0.632, 'pos': 0.142, 'compound': -0.9822}
1019
{'neg': 0.316, 'neu': 0.57, 'pos': 0.114, 'compound': -0.815}
1020
{'neg': 0.162, 'neu': 0.694, 'pos': 0.144, 'compound': 0.3071}
1021
{'neg': 0.114, 'neu': 0.758, 'pos': 0.128, 'com

{'neg': 0.209, 'neu': 0.61, 'pos': 0.18, 'compound': -0.9751}
1140
{'neg': 0.221, 'neu': 0.669, 'pos': 0.11, 'compound': -0.9953}
1141
{'neg': 0.23, 'neu': 0.642, 'pos': 0.128, 'compound': -0.9939}
1142
{'neg': 0.131, 'neu': 0.785, 'pos': 0.083, 'compound': -0.9181}
1143
{'neg': 0.118, 'neu': 0.775, 'pos': 0.107, 'compound': -0.3253}
1144
{'neg': 0.073, 'neu': 0.663, 'pos': 0.264, 'compound': 0.9926}
1145
{'neg': 0.164, 'neu': 0.762, 'pos': 0.075, 'compound': -0.9943}
1146
{'neg': 0.219, 'neu': 0.59, 'pos': 0.191, 'compound': -0.9144}
1147
{'neg': 0.133, 'neu': 0.722, 'pos': 0.146, 'compound': 0.4263}
1148
{'neg': 0.264, 'neu': 0.736, 'pos': 0.0, 'compound': -0.9843}
1149
{'neg': 0.206, 'neu': 0.743, 'pos': 0.052, 'compound': -0.9936}
1150
{'neg': 0.19, 'neu': 0.754, 'pos': 0.056, 'compound': -0.9916}
1151
{'neg': 0.054, 'neu': 0.776, 'pos': 0.17, 'compound': 0.984}
1152
{'neg': 0.016, 'neu': 0.809, 'pos': 0.175, 'compound': 0.9806}
1153
{'neg': 0.068, 'neu': 0.86, 'pos': 0.072, 'compo

{'neg': 0.131, 'neu': 0.736, 'pos': 0.132, 'compound': 0.6585}
1284
{'neg': 0.304, 'neu': 0.616, 'pos': 0.081, 'compound': -0.9717}
1285
{'neg': 0.0, 'neu': 0.7, 'pos': 0.3, 'compound': 0.9754}
1286
{'neg': 0.066, 'neu': 0.662, 'pos': 0.272, 'compound': 0.978}
1287
{'neg': 0.358, 'neu': 0.56, 'pos': 0.083, 'compound': -0.9993}
1288
{'neg': 0.023, 'neu': 0.977, 'pos': 0.0, 'compound': -0.0516}
1289
{'neg': 0.186, 'neu': 0.695, 'pos': 0.119, 'compound': -0.9978}
1290
{'neg': 0.221, 'neu': 0.679, 'pos': 0.1, 'compound': -0.7496}
1291
{'neg': 0.16, 'neu': 0.639, 'pos': 0.201, 'compound': 0.9468}
1292
{'neg': 0.231, 'neu': 0.625, 'pos': 0.144, 'compound': -0.9734}
1293
{'neg': 0.159, 'neu': 0.782, 'pos': 0.059, 'compound': -0.9913}
1294
{'neg': 0.0, 'neu': 0.965, 'pos': 0.035, 'compound': 0.5719}
1295
{'neg': 0.15, 'neu': 0.702, 'pos': 0.148, 'compound': 0.7194}
1296
{'neg': 0.153, 'neu': 0.706, 'pos': 0.142, 'compound': -0.1512}
1297
{'neg': 0.0, 'neu': 0.706, 'pos': 0.294, 'compound': 0.3

{'neg': 0.136, 'neu': 0.743, 'pos': 0.122, 'compound': -0.9619}
1427
{'neg': 0.211, 'neu': 0.614, 'pos': 0.174, 'compound': -0.581}
1428
{'neg': 0.106, 'neu': 0.825, 'pos': 0.07, 'compound': -0.7377}
1429
{'neg': 0.052, 'neu': 0.769, 'pos': 0.179, 'compound': 0.9893}
1430
{'neg': 0.158, 'neu': 0.71, 'pos': 0.132, 'compound': -0.9672}
1431
{'neg': 0.032, 'neu': 0.799, 'pos': 0.169, 'compound': 0.9887}
1432
{'neg': 0.07, 'neu': 0.815, 'pos': 0.115, 'compound': 0.836}
1433
{'neg': 0.164, 'neu': 0.762, 'pos': 0.074, 'compound': -0.872}
1434
{'neg': 0.165, 'neu': 0.818, 'pos': 0.017, 'compound': -0.9228}
1435
{'neg': 0.081, 'neu': 0.787, 'pos': 0.132, 'compound': 0.574}
1436
{'neg': 0.071, 'neu': 0.857, 'pos': 0.072, 'compound': -0.2278}
1437
{'neg': 0.128, 'neu': 0.734, 'pos': 0.138, 'compound': 0.7432}
1438
{'neg': 0.139, 'neu': 0.802, 'pos': 0.059, 'compound': -0.8549}
1439
{'neg': 0.127, 'neu': 0.783, 'pos': 0.09, 'compound': -0.6943}
1440
{'neg': 0.171, 'neu': 0.737, 'pos': 0.092, 'com

{'neg': 0.088, 'neu': 0.77, 'pos': 0.141, 'compound': 0.9997}
1552
{'neg': 0.054, 'neu': 0.835, 'pos': 0.111, 'compound': 0.9211}
1553
{'neg': 0.079, 'neu': 0.786, 'pos': 0.135, 'compound': 0.9569}
1554
{'neg': 0.093, 'neu': 0.774, 'pos': 0.132, 'compound': 0.9534}
1555
{'neg': 0.048, 'neu': 0.662, 'pos': 0.289, 'compound': 0.9992}
1556
{'neg': 0.213, 'neu': 0.718, 'pos': 0.069, 'compound': -0.9996}
1557
{'neg': 0.124, 'neu': 0.743, 'pos': 0.134, 'compound': 0.864}
1558
{'neg': 0.292, 'neu': 0.629, 'pos': 0.079, 'compound': -0.9878}
1559
{'neg': 0.127, 'neu': 0.822, 'pos': 0.05, 'compound': -0.8664}
1560
{'neg': 0.075, 'neu': 0.746, 'pos': 0.179, 'compound': 0.9907}
1561
{'neg': 0.261, 'neu': 0.739, 'pos': 0.0, 'compound': -0.8302}
1562
{'neg': 0.092, 'neu': 0.747, 'pos': 0.161, 'compound': 0.4818}
1563
{'neg': 0.059, 'neu': 0.816, 'pos': 0.125, 'compound': 0.5897}
1564
{'neg': 0.161, 'neu': 0.736, 'pos': 0.102, 'compound': -0.9957}
1565
{'neg': 0.139, 'neu': 0.762, 'pos': 0.099, 'comp

{'neg': 0.163, 'neu': 0.725, 'pos': 0.112, 'compound': -0.9906}
1700
{'neg': 0.036, 'neu': 0.809, 'pos': 0.155, 'compound': 0.9566}
1701
{'neg': 0.131, 'neu': 0.759, 'pos': 0.109, 'compound': -0.6993}
1702
{'neg': 0.144, 'neu': 0.694, 'pos': 0.162, 'compound': 0.1886}
1703
{'neg': 0.156, 'neu': 0.706, 'pos': 0.138, 'compound': -0.5307}
1704
{'neg': 0.279, 'neu': 0.687, 'pos': 0.034, 'compound': -0.8503}
1705
{'neg': 0.179, 'neu': 0.781, 'pos': 0.04, 'compound': -0.9625}
1706
{'neg': 0.189, 'neu': 0.729, 'pos': 0.081, 'compound': -0.9522}
1707
{'neg': 0.228, 'neu': 0.701, 'pos': 0.071, 'compound': -0.9704}
1708
{'neg': 0.094, 'neu': 0.654, 'pos': 0.252, 'compound': 0.8193}
1709
{'neg': 0.129, 'neu': 0.696, 'pos': 0.175, 'compound': 0.8272}
1710
{'neg': 0.136, 'neu': 0.584, 'pos': 0.28, 'compound': 0.8889}
1711
{'neg': 0.109, 'neu': 0.782, 'pos': 0.108, 'compound': -0.8367}
1712
{'neg': 0.091, 'neu': 0.883, 'pos': 0.026, 'compound': -0.9502}
1713
{'neg': 0.088, 'neu': 0.861, 'pos': 0.051

{'neg': 0.114, 'neu': 0.827, 'pos': 0.059, 'compound': -0.9988}
1836
{'neg': 0.13, 'neu': 0.693, 'pos': 0.177, 'compound': 0.7229}
1837
{'neg': 0.124, 'neu': 0.697, 'pos': 0.179, 'compound': 0.9314}
1838
{'neg': 0.228, 'neu': 0.734, 'pos': 0.038, 'compound': -0.9704}
1839
{'neg': 0.107, 'neu': 0.821, 'pos': 0.072, 'compound': -0.566}
1840
{'neg': 0.2, 'neu': 0.615, 'pos': 0.185, 'compound': -0.593}
1841
{'neg': 0.108, 'neu': 0.608, 'pos': 0.283, 'compound': 0.9985}
1842
{'neg': 0.015, 'neu': 0.802, 'pos': 0.183, 'compound': 0.9645}
1843
{'neg': 0.0, 'neu': 0.905, 'pos': 0.095, 'compound': 0.5526}
1844
{'neg': 0.125, 'neu': 0.756, 'pos': 0.119, 'compound': -0.3818}
1845
{'neg': 0.104, 'neu': 0.812, 'pos': 0.085, 'compound': -0.9973}
1846
{'neg': 0.168, 'neu': 0.66, 'pos': 0.172, 'compound': -0.4203}
1847
{'neg': 0.1, 'neu': 0.9, 'pos': 0.0, 'compound': -0.8263}
1848
{'neg': 0.0, 'neu': 0.94, 'pos': 0.06, 'compound': 0.1531}
1849
{'neg': 0.075, 'neu': 0.629, 'pos': 0.295, 'compound': 0.9

{'neg': 0.131, 'neu': 0.739, 'pos': 0.13, 'compound': 0.3568}
1967
{'neg': 0.087, 'neu': 0.751, 'pos': 0.162, 'compound': -0.0498}
1968
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
1969
{'neg': 0.257, 'neu': 0.688, 'pos': 0.055, 'compound': -0.9417}
1970
{'neg': 0.194, 'neu': 0.71, 'pos': 0.096, 'compound': -0.9976}


In [30]:
#create a loop to give us back the ONLY compound sentiment analysis score 
#for each self text post
scores = []
for post in range(len(df['selftext'])):
    print(post)
    print(analyzer.polarity_scores(df['selftext'][post])['compound'])



0
0.9861
1
0.9939
2
-0.9905
3
-0.9976
4
-0.847
5
0.9665
6
-0.9874
7
0.0
8
0.0
9
0.768
10
0.8349
11
0.9273
12
0.9382
13
0.7755
14
0.3446
15
-0.3907
16
-0.9491
17
0.9162
18
0.5478
19
0.9244
20
0.859
21
0.9953
22
0.0
23
-0.7762
24
0.0
25
-0.9757
26
0.0772
27
0.9802
28
-0.8
29
-0.9989
30
0.0
31
0.0
32
-0.296
33
0.6935
34
-0.6187
35
0.0
36
-0.7212
37
0.3048
38
-0.743
39
0.4404
40
-0.5267
41
-0.9995
42
0.0
43
-0.7922
44
0.9476
45
0.0
46
0.0
47
0.0
48
0.0
49
-0.836
50
-0.6378
51
0.0
52
-0.9882
53
0.9898
54
-0.989
55
-0.3289
56
0.9792
57
0.9709
58
0.0
59
-0.6901
60
-0.2263
61
0.2617
62
-0.3313
63
-0.9693
64
-0.4588
65
0.4945
66
-0.8185
67
-0.9548
68
-0.5681
69
0.0
70
0.8897
71
-0.9853
72
0.9584
73
0.929
74
0.4767
75
-0.8957
76
-0.2617
77
0.0
78
0.9485
79
0.6848
80
-0.5719
81
-0.9655
82
-0.947
83
0.8481
84
-0.3197
85
0.7883
86
-0.9136
87
-0.9981
88
-0.5525
89
0.0258
90
-0.977
91
-0.9918
92
-0.99
93
0.8555
94
-0.3716
95
0.0
96
0.9164
97
-0.942
98
0.7641
99
0.8879
100
0.9994
101
0.9417
102
-0.984

0.787
800
0.0
801
0.0
802
0.8963
803
-0.9832
804
0.0
805
0.903
806
-0.9931
807
0.7425
808
0.3134
809
-0.9963
810
-0.9971
811
0.3612
812
-0.9763
813
0.8957
814
-0.8922
815
0.1779
816
-0.9191
817
0.942
818
-0.8387
819
-0.8807
820
-0.9631
821
-0.1132
822
-0.9333
823
0.765
824
0.9635
825
-0.8443
826
-0.5709
827
-0.9861
828
-0.9967
829
0.8243
830
0.0
831
0.3182
832
0.9483
833
0.6542
834
0.7124
835
0.7189
836
-0.9987
837
0.8511
838
-0.9265
839
0.9969
840
0.0
841
0.0
842
0.5073
843
-0.8885
844
0.9636
845
0.0
846
0.5683
847
0.9867
848
0.9573
849
0.7648
850
0.0
851
0.6908
852
-0.9983
853
-0.6597
854
-0.9968
855
0.3818
856
0.5546
857
0.9893
858
-0.9594
859
0.0
860
-0.5719
861
-0.6801
862
-0.1608
863
-0.534
864
0.0
865
-0.9823
866
0.7343
867
-0.7622
868
-0.0745
869
0.9968
870
0.0
871
0.0
872
-0.5799
873
0.9274
874
0.147
875
-0.5439
876
-0.9987
877
-0.8552
878
-0.2706
879
0.2787
880
0.9597
881
0.0
882
-0.7201
883
-0.9823
884
-0.6369
885
-0.6113
886
0.2186
887
-0.3346
888
-0.9724
889
0.0
890
-0.876

-0.9272
1494
-0.9531
1495
0.9015
1496
0.6881
1497
-0.8954
1498
0.8942
1499
0.2263
1500
0.9892
1501
-0.978
1502
0.9638
1503
0.9957
1504
-0.767
1505
-0.9606
1506
-0.9611
1507
-0.6933
1508
-0.9145
1509
0.6808
1510
-0.0029
1511
0.2484
1512
0.9217
1513
0.6502
1514
0.2662
1515
-0.99
1516
0.796
1517
-0.7587
1518
-0.9876
1519
0.1204
1520
-0.5661
1521
-0.6195
1522
0.8376
1523
-0.9876
1524
0.6774
1525
-0.3944
1526
0.8201
1527
-0.9216
1528
0.9627
1529
-0.8689
1530
-0.7413
1531
-0.9906
1532
0.9991
1533
0.8772
1534
0.7184
1535
0.4631
1536
0.8765
1537
-0.9755
1538
-0.9043
1539
0.9277
1540
-0.9761
1541
0.9554
1542
0.6187
1543
-0.9928
1544
-0.9591
1545
0.0
1546
0.8927
1547
-0.9671
1548
0.9558
1549
-0.8954
1550
0.9829
1551
0.9997
1552
0.9211
1553
0.9569
1554
0.9534
1555
0.9992
1556
-0.9996
1557
0.864
1558
-0.9878
1559
-0.8664
1560
0.9907
1561
-0.8302
1562
0.4818
1563
0.5897
1564
-0.9957
1565
-0.9937
1566
-0.9887
1567
-0.6361
1568
0.9577
1569
0.4939
1570
-0.9736
1571
0.9801
1572
-0.8649
1573
-0.9155
157

## Create a list of sentiment analysis compound scores for each necessary column

In [32]:
#create a loop to create a new list of compound sentiment analysis score 
#for each self text post
sent_self_scores = []
for post in range(len(df['selftext'])):
    score = analyzer.polarity_scores(df['selftext'][post])['compound']
    sent_self_scores.append(score)
len(sent_self_scores)

1971

In [33]:
#create a loop to create a new list of compound sentiment analysis score 
#for each title post
sent_title_scores = []
for post in range(len(df['title'])):
    score = analyzer.polarity_scores(df['title'][post])['compound']
    sent_title_scores.append(score)

len(sent_title_scores)

1971

In [34]:
#create a loop to create a new list of compound sentiment analysis score 
#for each title + self text post
sent_ts_scores = []
for post in range(len(df['t_s'])):
    score = analyzer.polarity_scores(df['t_s'][post])['compound']
    sent_ts_scores.append(score)

len(sent_ts_scores)

1971

## Append them to the data fram to be used later

In [35]:
df['sent_selftext'] = sent_self_scores
df['sent_selftext'].head()

0    0.9861
1    0.9939
2   -0.9905
3   -0.9976
4   -0.8470
Name: sent_selftext, dtype: float64

In [36]:
df['sent_title'] = sent_title_scores
df['sent_title'].head()

0    0.3875
1    0.0000
2    0.0000
3   -0.6378
4   -0.3818
Name: sent_title, dtype: float64

In [37]:
df['sent_ts'] = sent_ts_scores
df['sent_ts'].head()

0    0.9861
1    0.9939
2   -0.9905
3   -0.9978
4   -0.8741
Name: sent_ts, dtype: float64

In [38]:
df.head()

Unnamed: 0,selftext,title,timestamp,sub_reddit,t_s,sent_selftext,sent_title,sent_ts
0,"We were at the park, kids were taking turns go...",My toddler chest-kicked another kid into the a...,2019-07-08 05:13:30,1,My toddler chest-kicked another kid into the a...,0.9861,0.3875,0.9861
1,I’m not a good looking guy and I don’t say it ...,The most female interaction I’ve ever had in m...,2019-07-08 08:23:10,1,The most female interaction I’ve ever had in m...,0.9939,0.0,0.9939
2,I know a gal who has two kids and is pregnant ...,I think people with genetic conditions that th...,2019-07-08 20:15:16,1,I think people with genetic conditions that th...,-0.9905,0.0,-0.9905
3,This is pretty hard for me to post. Ive never ...,I almost killed my step-mom..,2019-07-07 16:50:52,1,I almost killed my step-mom..This is pretty ha...,-0.9976,-0.6378,-0.9978
4,"After livelier-than-average sex, my wife said ...",My Wife Asked a Loaded Question and I Lied to Her,2019-07-08 21:46:59,1,My Wife Asked a Loaded Question and I Lied to ...,-0.847,-0.3818,-0.8741


## Fix the timestamp objects to only show the hour

In [39]:
df['timestamp'].head()

0    2019-07-08 05:13:30
1    2019-07-08 08:23:10
2    2019-07-08 20:15:16
3    2019-07-07 16:50:52
4    2019-07-08 21:46:59
Name: timestamp, dtype: object

In [52]:
#defined a function to only index the hour
def dt_to_hour(x):
    return x[11:13]

In [58]:
#map the function and create a new column called hour
df['hour'] = df['timestamp'].map(lambda x: x[11:13])

In [59]:
#test
df['hour'][1970]

'15'

## Export CSV for modeling and predicting

In [61]:
df.to_csv('./reddit_datasets/combined_data.csv')

In [None]:
features = [col for col in reddit.columns if col != 'target']

X = reddit[features]
y = reddit['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=12)

In [None]:
```reddit['target'] = reddit['num_comments'].apply(lambda x: 1 if x > 158 else 0)```

In [None]:
cvec = CountVectorizer()

X_train_vec = cvec.fit_transform(X_train['title']) 
X_test_vec = cvec.transform(X_test['title'])

In [None]:
X_train_df = pd.DataFrame(X_train_vec.toarray(), columns = cvec.get_feature_names())
X_test_df = pd.DataFrame(X_test_vec.toarray(), columns = cvec.get_feature_names())

In [None]:
list_of_scores = []

for post in reddit['title']:
    scores = sia.polarity_scores(post)
    list_of_scores.append(scores)
    
scores_df = pd.DataFrame(list_of_scores)