In [1]:
import collections
import sqlite3

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import scipy
import scipy.stats
from nltk.corpus import stopwords
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer

%matplotlib inline

In [2]:
# Location of the SQLite file queried throughout this notebook.
DATABASE_PATH = './database.sqlite'

# Single shared connection; every pd.read_sql call below goes through it.
sql_conn = sqlite3.connect(DATABASE_PATH)

## What type of posts are the most engaging?

In [None]:
def sentiment(x):
    """Flag a text as positive (1) or negative (0) from its TextBlob sentiment.

    Args:
        x: Text to analyse; it is handed to ``TextBlob`` unchanged.

    Returns:
        1 when the first element of ``TextBlob(x).sentiment`` is greater than
        or equal to 0.0 (so a score of exactly 0.0 counts as positive),
        otherwise 0.
    """
    polarity_score = TextBlob(x).sentiment[0]
    # int(True) == 1 (positive), int(False) == 0 (negative)
    return int(polarity_score >= 0.0)


def subjectivity(x):
    """Flag a text as subjective (1) or objective (0) from its TextBlob sentiment.

    Args:
        x: Text to analyse; it is handed to ``TextBlob`` unchanged.

    Returns:
        1 when the second element of ``TextBlob(x).sentiment`` is at least
        0.5, otherwise 0.
    """
    subjectivity_score = TextBlob(x).sentiment[1]
    return 1 if subjectivity_score >= 0.5 else 0  # 1 = Subjective, 0 = Objective

First, we select the 500 highest-scored and the 500 lowest-scored comments whose bodies have a meaningful length (more than 30 and fewer than 10,000 characters).

In [None]:
# Both samples come from the same query; only the sort direction on `score`
# differs, so the SQL is written once and the direction is filled in per sample.
_SCORE_RANKED_SQL = """SELECT created_utc,author,score,subreddit, body,parent_id FROM May2015
                 WHERE LENGTH(body) > 30 AND LENGTH(body) < 10000 ORDER BY score{direction} LIMIT 500"""

# 500 highest-scored comments (score descending).
df_up = pd.read_sql(_SCORE_RANKED_SQL.format(direction=" DESC"), sql_conn)
# 500 lowest-scored comments (no direction given, i.e. ascending order).
df_down = pd.read_sql(_SCORE_RANKED_SQL.format(direction=""), sql_conn)

In [None]:
# Attach the two binary text flags to the up-voted sample.
# NOTE: the "Objectivity" column holds the result of subjectivity(), so a value
# of 1 means the comment was classified as subjective.
for column_name, classifier in (("sentiment", sentiment), ("Objectivity", subjectivity)):
    df_up[column_name] = df_up["body"].apply(classifier)

In [None]:
# Attach the two binary text flags to the down-voted sample.
# NOTE: the "Objectivity" column holds the result of subjectivity(), so a value
# of 1 means the comment was classified as subjective.
for column_name, classifier in (("sentiment", sentiment), ("Objectivity", subjectivity)):
    df_down[column_name] = df_down["body"].apply(classifier)

In [None]:
# Stack the up-voted sample on top of the down-voted one. Row labels are kept
# as they are in each input frame (no re-indexing is requested).
df_tot = pd.concat((df_up, df_down))

In [None]:
# Share of positive comments in each sample, as a percentage. The mean of the
# 0/1 sentiment flag divides by the actual number of rows in each frame; the
# previous hard-coded divisor of 1000 did not match the LIMIT 500 used when the
# samples were loaded, which halved the reported percentages.
pct_positive_up = df_up.sentiment.mean() * 100
pct_positive_down = df_down.sentiment.mean() * 100
print("%.1f percent of the up-scored and %.1f percent of the down-voted posts are positive ."
      % (pct_positive_up, pct_positive_down))

In [None]:
# Two-sample t-test on the 0/1 sentiment flags of the up-voted and down-voted
# samples. The conclusion printed below is derived from the p-value rather than
# being hard-coded, so the text cannot contradict the number next to it.
SIGNIFICANCE_LEVEL = 0.05  # threshold used to word the conclusion
sentiment_p_value = scipy.stats.ttest_ind(df_up.sentiment, df_down.sentiment)[1]
sentiment_verdict = "a" if sentiment_p_value < SIGNIFICANCE_LEVEL else "NOT a"
print("The t-test's p-value of %s shows that there is %s statistically significant difference "
      "between the up-voted and down-voted comments in terms of positivity (alpha = %s)"
      % (sentiment_p_value, sentiment_verdict, SIGNIFICANCE_LEVEL))

In [None]:
# Share of subjective comments in each sample, as a percentage. The
# "Objectivity" column stores the subjectivity() flag (1 = subjective), so its
# mean is the subjective fraction. Using the mean divides by the actual number
# of rows; the previous hard-coded divisor of 1000 did not match the LIMIT 500
# used when the samples were loaded, which halved the reported percentages.
pct_subjective_up = df_up.Objectivity.mean() * 100
pct_subjective_down = df_down.Objectivity.mean() * 100
print("%.1f percent of the up-scored and %.1f percent of the down-voted posts are Subjective ."
      % (pct_subjective_up, pct_subjective_down))

In [None]:
# Two-sample t-test on the 0/1 subjectivity flags (stored in the "Objectivity"
# column) of the up-voted and down-voted samples. The conclusion printed below
# is derived from the p-value rather than being hard-coded.
SIGNIFICANCE_LEVEL = 0.05  # threshold used to word the conclusion
subjectivity_p_value = scipy.stats.ttest_ind(df_up.Objectivity, df_down.Objectivity)[1]
subjectivity_verdict = "a" if subjectivity_p_value < SIGNIFICANCE_LEVEL else "NOT a"
print("The t-test's p-value of %s shows that there is %s statistically significant difference "
      "between the up-voted and down-voted comments in terms of subjectivity (alpha = %s)"
      % (subjectivity_p_value, subjectivity_verdict, SIGNIFICANCE_LEVEL))

# What type of users have the most upvotes?

In [None]:
# 5000 highest-scored comments whose author is not the "[deleted]" placeholder.
# The placeholder is written as a single-quoted SQL string literal: in SQLite,
# double quotes denote identifiers and are only treated as strings through a
# legacy fallback that can be disabled in the SQLite build.
df = pd.read_sql("""SELECT author,score,body FROM May2015
                  WHERE author <> '[deleted]' ORDER BY score DESC LIMIT 5000""", sql_conn)

In [None]:
# Attach the two binary text flags to the top-scored comments.
# NOTE: the "Objectivity" column holds the result of subjectivity(), so a value
# of 1 means the comment was classified as subjective.
for column_name, classifier in (("sentiment", sentiment), ("Objectivity", subjectivity)):
    df[column_name] = df["body"].apply(classifier)

In [None]:
# Number of positive-flagged comments per author: the 0/1 sentiment flag is
# summed within each author, then the author index is turned back into a
# regular column so the result has the columns `author` and `sentiment`.
# (A filter keeping only authors with sentiment > 7 was left disabled here.)
positive_per_author = df.groupby('author')['sentiment'].sum()
df_1 = positive_per_author.reset_index()

In [None]:
# Horizontal bar chart of the 15 authors with the highest positive-comment count.
top_positive_authors = (
    df_1.sort_values(by='sentiment', ascending=False)
        .head(15)
        .set_index('author')
)
top_positive_authors.plot.barh()

In [None]:
# How many of the loaded top-scored comments each author wrote.
# NOTE: the name `cnt` is rebound to a word counter in a later cell.
cnt = collections.Counter(df["author"])

In [None]:
# Names of the 20 authors with the most comments in `df` (counts are dropped).
top_20 = [author for author, _n_comments in cnt.most_common(20)]

In [None]:
# Keep only the comments written by one of the 20 most frequent authors.
is_top20_author = df["author"].isin(top_20)
df_top20 = df[is_top20_author]

In [None]:
# Histogram of comment scores for the top-20 authors, using 10 bins.
# `bins` is passed by keyword: on pandas versions where `plot.hist` takes `by`
# as its first parameter, a positional 10 would be bound to `by` instead.
df_top20.score.plot.hist(bins=10, color='orange')

In [None]:
# Join every comment body of the top-20 authors into one string, each body
# preceded by a space. str.join builds the result in one pass instead of
# re-copying the growing string on every iteration.
word_soup = "".join(" " + body for body in df_top20.body)

stop_words = set(stopwords.words('english'))

# word_tokenize already returns a list, so no extra list() copy is needed.
words = nltk.word_tokenize(word_soup)

# Keep tokens longer than three characters that are not stop words. The NLTK
# English stop-word list is lower-case, so the membership test is done on the
# lower-cased token; otherwise capitalised stop words such as "This" or
# "There" would pass the filter. Tokens are counted in their original casing.
stopwordsfree_words = [token for token in words
                       if len(token) > 3 and token.lower() not in stop_words]

# NOTE: this rebinds `cnt`, which an earlier cell used for per-author counts.
cnt = collections.Counter(stopwordsfree_words)
cnt.most_common(20)

In [None]:
# Peek at ten rows picked by SQLite's random() ordering; the rows differ on
# every run.
random_peek_sql = """SELECT created_utc,author,score,subreddit, body,parent_id FROM May2015
                 ORDER BY random() LIMIT 10"""
pd.read_sql(random_peek_sql, sql_conn)

In [4]:
# NOTE(review): Connection.execute returns a cursor, so after this cell df_rnd
# is not a DataFrame; the following cell rebinds df_rnd to a pd.read_sql result.
random_rows_sql = (
    "SELECT created_utc,author,score,subreddit, body,parent_id "
    "FROM May2015 ORDER BY random() LIMIT 1000"
)
df_rnd = sql_conn.execute(random_rows_sql)

In [8]:
# Load 1000 rows picked by SQLite's random() ordering into a DataFrame; the
# selection differs on every run.
random_sample_sql = (
    "SELECT created_utc,author,score,subreddit, body,parent_id FROM May2015 \n"
    "ORDER BY random() LIMIT 1000"
)
df_rnd = pd.read_sql(random_sample_sql, sql_conn)

In [9]:
# Show only the first ten rows rather than the whole 1000-row sample.
df_rnd.head(10)

Unnamed: 0,created_utc,author,score,subreddit,body,parent_id
0,1431992681,tenth_,2,Philippines,\n“Metaphors are dangerous. Love begins with a...,t3_36f1zg
1,1431438574,Willakarra,1,TagProTesting,Allllright! You can edit your main post and pu...,t1_cr6j743
2,1431998023,Rand0mtask,5,starcitizen,Yeah. I just think it's important to allow peo...,t1_crdkflb
3,1431782778,imjustlikeme,1,AskReddit,Georgia in my ass XP,t3_365n7r
4,1430735509,El_Golem215,2,GlobalOffensive,\&gt;skillful \n\&gt;pre-shooting driveby sty...,t1_cqxjdj6
5,1430776649,kalu0805,2,Teachers,Are you in a location where it is common for s...,t3_34vavh
6,1432251611,daaanish,1,AskReddit,"That's what I got, too. Went back to check if ...",t1_crgvybx
7,1432646843,[deleted],-2,sweden,[deleted],t1_crl9ddh
8,1430699826,[deleted],1,trees,[deleted],t1_cqxccu0
9,1431804912,Pi_Maker,2,Random_Acts_Of_Amazon,"I mean, we have it for the coding class to lea...",t1_crba0bs
