# Database preparation

#### Libraries

In [1]:
import sqlite3
import praw
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
from functools import reduce

## 1. Preparing data about posts

#### Preparing a list of post id

In [2]:
conn = sqlite3.connect('Data/database.db')
list_of_posts = pd.read_sql('''SELECT 
                                distinct post_id
                        FROM Reddit
                        WHERE month IN ("2016_07","2016_08","2016_09","2016_10")
                        GROUP BY post_id
                        ORDER BY  max(score) desc
                        LIMIT 300''', conn)
list_of_posts=list_of_posts.values.tolist()
len(list_of_posts)

300

#### Connect to reddit

In [3]:
reddit = praw.Reddit(client_id='pG_dvbbdt151Rw', \
                     client_secret='KNp35NWOZ7FvjyF5haiPUAKSd8o', \
                     user_agent='O_Auth_app', \
                     username='_Alex_shadow_', \
                     password='2424678Rise')

#### Making a dataframe for the posts

In [4]:
topics_dict = {"author": [],
               "body":[],
               "controversiality":[],
               "created_utc":[],
               "distinguished":[],
               "gilded": [],
               "id": [],
               "post_id": [],
               "parent_id": [],
               "score": [],
               "subreddit": [],
               "num_comments": []               
              }

for elements in list_of_posts:
    submission  = reddit.submission(id=str(elements).strip('[').strip(']').strip("'"))
    
    topics_dict["author"].append(submission.author)
    topics_dict["body"].append(submission.title+" "+submission.selftext)
    topics_dict["controversiality"].append(submission.over_18)
    topics_dict["created_utc"].append(submission.created_utc)
    topics_dict["distinguished"].append(submission.distinguished)
    topics_dict["gilded"].append(submission.gilded)
    topics_dict["id"].append(submission.id)
    topics_dict["post_id"].append(submission.name)
    topics_dict["parent_id"].append(submission.name)
    topics_dict["score"].append(submission.score)
    topics_dict["subreddit"].append(submission.subreddit)
    topics_dict["num_comments"].append(submission.num_comments)
    
df_post=pd.DataFrame.from_dict(topics_dict, orient='index').transpose()
df_post['type']="Post"
df_post["parent_id"]=df_post["parent_id"].str[3:]
df_post["post_id"]=df_post["post_id"].str[3:]
df_post.tail()

Unnamed: 0,author,body,controversiality,created_utc,distinguished,gilded,id,post_id,parent_id,score,subreddit,num_comments,type
295,HamsterSandwich,Tim Kaine just called out Donald Trump’s histo...,False,1471090000.0,,0,4xixxo,t3_4xixxo,t3_4xixxo,6416,politics,1798,Post
296,georgiapeanuts,Bernie Sanders Is More Popular Than Ever,False,1473790000.0,,0,52lpmv,t3_52lpmv,t3_52lpmv,15943,politics,2774,Post
297,Chronoallusion,Obama calls for paid sick leave,False,1475500000.0,,0,55np09,t3_55np09,t3_55np09,8859,politics,1647,Post
298,Niematego,Vet who lost leg in Afghanistan is raising mon...,False,1470480000.0,,2,4wfk4q,t3_4wfk4q,t3_4wfk4q,17839,politics,2357,Post
299,tomhanks23,Trump says his insulting remarks towards women...,False,1475750000.0,,0,564z9u,t3_564z9u,t3_564z9u,7186,politics,2000,Post


## 2. Preparing data about comments

In [43]:
df_comments = pd.read_sql('''SELECT
                                author,
                                body,
                                controversiality,
                                created_utc,
                                distinguished,
                                gilded,
                                id,
                                post_id,
                                parent_id,
                                score,
                                subreddit,
                                month

                             FROM Reddit
                             WHERE post_id in (SELECT 
                                                    distinct post_id
                                                FROM Reddit
                                                WHERE month IN ("2016_07","2016_08","2016_09","2016_10")
                                                GROUP BY post_id
                                                ORDER BY  max(score) desc
                                                LIMIT 300)                             
                             ''', conn)
df_comments['type']="Comments"
df_comments

Unnamed: 0,author,body,controversiality,created_utc,distinguished,gilded,id,post_id,parent_id,score,subreddit,month,type
0,ChanceTheDog,I admit I liked Bernie and I think Trump V Ber...,0,1475280012,,0,d88zqga,557e83,d88zead,1,politics,2016_10,Comments
1,gameoverman1983,&gt; If Lester Holt could outsmart Donald Trum...,0,1475280043,,0,d88zr8e,554a80,d88zjd2,1,politics,2016_10,Comments
2,Borigrad,Trump isn't a misogynist he's just as an assho...,0,1475280122,,0,d88zt46,557axf,d88zi7h,1,politics,2016_10,Comments
3,Lord_Kek,It's only because I can't read Spanish. Funny ...,0,1475280183,,0,d88zuip,557axf,d88p8hr,1,politics,2016_10,Comments
4,Cleon_The_Athenian,"Military Industrial Complex, the wars must go ...",1,1475280209,,0,d88zv64,557e83,d88yin6,0,politics,2016_10,Comments
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1300844,BadPumpkin87,So you must have been outraged that the RNC tr...,0,1470009571,,0,d5yxr9b,4vi9n7,d5yun25,3,politics,2016_07,Comments
1300845,saturninus,The Dems tried bring up Bush's war record and ...,0,1470009573,,0,d5yxrah,4vi9n7,d5yvwuj,46,politics,2016_07,Comments
1300846,throwawayhere321,I am willing to bet you are anti-feminist. Lov...,0,1470009575,,0,d5yxrcf,4vi9n7,d5ylz5z,3,politics,2016_07,Comments
1300847,Semperi95,"Nah, he just wants to demonize and discriminat...",1,1470009586,,0,d5yxrky,4vi9n7,d5yxb0c,12,politics,2016_07,Comments


## 3. Merging data

In [67]:
# merging
df_all = pd.concat([df_comments, df_post], join='outer', sort=False)
# cleaning all NaN
df_all = df_all.replace({pd.np.nan: None})
df_all = df_all.reset_index(drop=True)
df_all

Unnamed: 0,author,body,controversiality,created_utc,distinguished,gilded,id,post_id,parent_id,score,subreddit,month,type,num_comments
0,ChanceTheDog,I admit I liked Bernie and I think Trump V Ber...,0,1.475280e+09,,0,d88zqga,557e83,d88zead,1,politics,2016_10,Comments,
1,gameoverman1983,&gt; If Lester Holt could outsmart Donald Trum...,0,1.475280e+09,,0,d88zr8e,554a80,d88zjd2,1,politics,2016_10,Comments,
2,Borigrad,Trump isn't a misogynist he's just as an assho...,0,1.475280e+09,,0,d88zt46,557axf,d88zi7h,1,politics,2016_10,Comments,
3,Lord_Kek,It's only because I can't read Spanish. Funny ...,0,1.475280e+09,,0,d88zuip,557axf,d88p8hr,1,politics,2016_10,Comments,
4,Cleon_The_Athenian,"Military Industrial Complex, the wars must go ...",1,1.475280e+09,,0,d88zv64,557e83,d88yin6,0,politics,2016_10,Comments,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1301144,HamsterSandwich,Tim Kaine just called out Donald Trump’s histo...,False,1.471093e+09,,0,4xixxo,4xixxo,4xixxo,6416,politics,,Post,1798.0
1301145,georgiapeanuts,Bernie Sanders Is More Popular Than Ever,False,1.473788e+09,,0,52lpmv,52lpmv,52lpmv,15943,politics,,Post,2774.0
1301146,Chronoallusion,Obama calls for paid sick leave,False,1.475502e+09,,0,55np09,55np09,55np09,8859,politics,,Post,1647.0
1301147,Niematego,Vet who lost leg in Afghanistan is raising mon...,False,1.470483e+09,,2,4wfk4q,4wfk4q,4wfk4q,17839,politics,,Post,2357.0


In [68]:
df_all.to_csv('post.csv', index=False)

## 4. Creating additional variables

In [66]:
from datetime import datetime
df_list= pd.DataFrame({"level":[],
                         "id":[],
                         "Betweenness_Centrality":[],
                         "Closeness_Centrality":[],
                         "PageRank":[],
                         "num_comments":[],
                         "month":[]})



for elements in list_of_posts:
    
    #-----------------------  
    # current date and time
    time_0 = datetime.now()
    post_count=1
    #-----------------------
    
    # filter dataset    
    df_source=df_all.loc[df_all["post_id"].values == elements]
    # Undirected graph
    UG = nx.from_pandas_edgelist(df=df_source, source="id", target="parent_id", create_using=nx.MultiGraph()) 
    # 1. Calculate level depth
    depth_levels_from_root = nx.single_source_shortest_path_length(UG, source=elements)
    level_index = pd.DataFrame(depth_levels_from_root, index=[0]).transpose().rename(columns={0: "level"})
    level_index["id"]=level_index.index
    level_index = level_index.sort_values(by=['id'], ascending=False)
    
    #-----------------------  
    # current date and time
    time_1 = datetime.now()
    #-----------------------
        
    # 2. Calculate betweeness centrality
    betweenness_centrality = nx.betweenness_centrality(UG, k=None, normalized=False, weight=None, endpoints=True, seed=None)
    bc = pd.DataFrame(list(betweenness_centrality.items()), columns=['id', 'Betweenness_Centrality'])
    bc = bc.sort_values(by=['id'], ascending=False)
    
    #-----------------------  
    # current date and time
    time_2 = datetime.now()
    #-----------------------
    
    # 3. Calculate closeness centrality
    closeness_centrality = nx.closeness_centrality(UG, u=None, distance=None, wf_improved=True)
    cc = pd.DataFrame(list(closeness_centrality.items()), columns=['id', 'Closeness_Centrality'])
    cc = cc.sort_values(by=['id'], ascending=False)

    #-----------------------  
    # current date and time
    time_3 = datetime.now()
    #-----------------------
    
    # Directed graph
    DG = nx.from_pandas_edgelist(df=df_source, source="id", target="parent_id", create_using=nx.DiGraph())
    # 4. Capculate page rank
    pr = nx.pagerank(DG, alpha=0.9)
    pr = pd.DataFrame(list(pr.items()), columns=['id', 'PageRank'])
    pr = pr.sort_values(by=['id'], ascending=False)    
    
    #-----------------------  
    # current date and time
    time_4 = datetime.now()
    #-----------------------
    
    # 5. Merge together     
    df_merged = level_index.merge(bc,on='id').merge(cc,on='id').merge(pr,on='id')
    # 6. Assign num of comments
    df_merged["num_comments"]=df_source.loc[df_source['type'] == 'Post', "num_comments"].values[0]
    df_merged["month"]=df_source['month'].iloc[10]
    
    df_list.append(df_merged)
    
    #-----------------------
    del df_source
    del UG
    del depth_levels_from_root
    del level_index
    del betweenness_centrality
    del bc
    del closeness_centrality
    del cc
    del DG
    del pr
    del df_merged
    #-----------------------
    # current date and time
    time_5 = datetime.now()
    #Report info:
    print("Post number: ",post_count)
    print("Step 1: ", time_1-time_0)
    print("Step 2: ", time_2-time_1)
    print("Step 3: ", time_3-time_2)
    print("Step 4: ", time_4-time_3)
    print("Step 5,6 : ", time_5-time_4)
    post_count = post_count+1
    
df_all=df_all.drop(columns=['num_comments', 'month'])
df_final=pd.merge(df_all, df_list, on='id')  
df_final

Post number:  1
Step 1:  0:00:00.222401
Step 2:  0:00:19.523220
Step 3:  0:00:15.932308
Step 4:  0:00:00.156255
Step 5,6 :  0:00:00.015637
Post number:  1
Step 1:  0:00:00.234020
Step 2:  0:00:33.376792
Step 3:  0:00:29.100345
Step 4:  0:00:00.203120
Step 5,6 :  0:00:00.015640
Post number:  1
Step 1:  0:00:00.908223
Step 2:  0:04:30.013444
Step 3:  0:03:17.663931
Step 4:  0:00:00.430912
Step 5,6 :  0:00:00.031264


KeyboardInterrupt: 

In [5]:
import inspect
inspect.getsource(nx.pagerank)

NameError: name 'nx' is not defined

In [None]:
# Checking for nulls
#df_final[df_final['num_comments'].isnull()]

# Checking for duplicates
#df_final[df_final.duplicated(['id'])]

Saving final database

In [None]:
conn = sqlite3.connect('Data/Sample.db')
df_final.to_sql('Reddit', conn, if_exists='replace', index=False)

Check

In [None]:
conn = sqlite3.connect('Data/Sample.db')
df_test = pd.read_sql("SELECT * FROM Reddit", conn)
df_test