# Reddit submissions graph analysis
## Step #1: Data preparation

#### Convert dataset to the proper Graph structure 

[What’s in a name? Understanding the Interplay betweenTitles, Content, and Communities in Social Media](http://i.stanford.edu/~julian/pdfs/icwsm13.pdf)

In [1]:
import pandas as pd
import numpy as np
import matplotlib
import seaborn


import warnings
warnings.filterwarnings('ignore')

In [2]:
import igraph
print (igraph.__version__)

0.9.9


In [3]:
# !pip install igraph

### Load raw data

In [4]:
data_foled = 'data/'
data_file = 'redditSubmissions.csv'

In [5]:
df = pd.read_csv(data_foled+data_file, header=0, error_bad_lines=False)

b'Skipping line 67828: expected 13 fields, saw 16\nSkipping line 67829: expected 13 fields, saw 16\nSkipping line 67830: expected 13 fields, saw 16\nSkipping line 67831: expected 13 fields, saw 16\nSkipping line 67832: expected 13 fields, saw 14\n'


In [6]:
df.shape

(132303, 13)

In [7]:
df.head(10)

Unnamed: 0,#image_id,unixtime,rawtime,title,total_votes,reddit_id,number_of_upvotes,subreddit,number_of_downvotes,localtime,score,number_of_comments,username
0,0,1333172000.0,2012-03-31T12:40:39.590113-07:00,And here's a downvote.,63470.0,rmqjs,32657.0,funny,30813.0,1333198000.0,1844.0,622.0,Animates_Everything
1,0,1333178000.0,2012-03-31T14:16:01.093638-07:00,Expectation,35.0,rmun4,29.0,GifSound,6.0,1333203000.0,23.0,3.0,Gangsta_Raper
2,0,1333200000.0,2012-03-31T20:18:33.192906-07:00,Downvote,41.0,rna86,32.0,GifSound,9.0,1333225000.0,23.0,0.0,Gangsta_Raper
3,0,1333252000.0,2012-04-01T10:52:10-07:00,Every time I downvote something,10.0,ro7e4,6.0,GifSound,4.0,1333278000.0,2.0,0.0,Gangsta_Raper
4,0,1333273000.0,2012-04-01T16:35:54.393381-07:00,Downvote &quot;Dies Irae&quot;,65.0,rooof,57.0,GifSound,8.0,1333298000.0,49.0,0.0,Gangsta_Raper
5,0,1333761000.0,2012-04-07T08:11:00-07:00,"Demolished, every time you downvote someone",40.0,rxwjg,17.0,gifs,23.0,1333786000.0,-6.0,3.0,Hellothereawesome
6,0,1335504000.0,2012-04-27T12:17:14.103167-07:00,how i feel whenever i submit here,104.0,svpq7,67.0,fffffffuuuuuuuuuuuu,37.0,1335529000.0,30.0,12.0,
7,0,1339160000.0,2012-06-08T19:54:35.421944-07:00,getting that first downvote on a new post,13.0,usmxn,5.0,funny,8.0,1339185000.0,-3.0,0.0,
8,0,1339408000.0,2012-06-11T16:44:39.947798-07:00,How reddit seems to reacts whenever I share a ...,14.0,uwzrd,6.0,funny,8.0,1339433000.0,-2.0,0.0,
9,0,1339425000.0,2012-06-11T21:34:51.692933-07:00,Every LastAirBender post with a NSFW tag,20.0,uxf5q,9.0,pics,11.0,1339450000.0,-2.0,0.0,HadManySons


In [8]:
graph = df[["#image_id","subreddit"]]

In [9]:
graph.head(10)

Unnamed: 0,#image_id,subreddit
0,0,funny
1,0,GifSound
2,0,GifSound
3,0,GifSound
4,0,GifSound
5,0,gifs
6,0,fffffffuuuuuuuuuuuu
7,0,funny
8,0,funny
9,0,pics


In [10]:
graph["subreddit2"] = pd.Categorical(graph["subreddit"]).codes

In [11]:
graph.head()

Unnamed: 0,#image_id,subreddit,subreddit2
0,0,funny,539
1,0,GifSound,153
2,0,GifSound,153
3,0,GifSound,153
4,0,GifSound,153


In [12]:
graph.shape

(132303, 3)

### Aggregate data

In [13]:
subreddit_dict = graph.copy()

subreddit_dict['count'] = 1
subreddit_dict = subreddit_dict.groupby(['subreddit','subreddit2']).agg({'count':'sum'}).sort_values(by='count', ascending=False)

In [14]:
subreddit_dict.reset_index(inplace=True)
subreddit_dict = subreddit_dict.rename(index=str, columns={"subreddit": "name", "subreddit2": "id", "count":"count"})

### Save Nodes<->Subreddit dictionary

In [15]:
subreddit_dict.head(10)

Unnamed: 0,name,id,count
0,funny,539,55277
1,pics,694,24712
2,gifs,557,12538
3,WTF,379,12193
4,aww,407,5920
5,GifSound,153,5608
6,atheism,402,3366
7,gaming,545,3009
8,AdviceAnimals,13,999
9,reactiongifs,723,963


In [16]:
subreddit_dict.to_csv('./data/subreddit_dict.csv', index=None)

### Transform Table to Graph

In [17]:
graph = graph.rename(index=str, columns={"#image_id": "img", "subreddit": "subreddit_name", "subreddit2": "subreddit"})

In [18]:
graph.shape

(132303, 3)

In [19]:
graph.drop_duplicates(inplace=True)

In [20]:
graph.shape

(42568, 3)

In [21]:
tmp = graph.join(graph.set_index('img'),on='img',rsuffix='_neighbor')

In [22]:
tmp.head(10)

Unnamed: 0,img,subreddit_name,subreddit,subreddit_name_neighbor,subreddit_neighbor
0,0,funny,539,funny,539
0,0,funny,539,GifSound,153
0,0,funny,539,gifs,557
0,0,funny,539,fffffffuuuuuuuuuuuu,528
0,0,funny,539,pics,694
0,0,funny,539,atheism,402
0,0,funny,539,gaming,545
1,0,GifSound,153,funny,539
1,0,GifSound,153,GifSound,153
1,0,GifSound,153,gifs,557


In [23]:
tmp.shape

(141082, 5)

In [24]:
tmp = tmp[tmp['subreddit']!=tmp['subreddit_neighbor']]

In [25]:
tmp.shape

(98514, 5)

In [26]:
graph[graph['img']==0]

Unnamed: 0,img,subreddit_name,subreddit
0,0,funny,539
1,0,GifSound,153
5,0,gifs,557
6,0,fffffffuuuuuuuuuuuu,528
9,0,pics,694
13,0,atheism,402
16,0,gaming,545


In [27]:
tmp.sort_values(by='subreddit',inplace=True)

In [28]:
tmp["ukey1"] = tmp["subreddit"].astype(str)+'_'+tmp["subreddit_neighbor"].astype(str)
tmp["ukey2"] = tmp["subreddit_neighbor"].astype(str)+'_'+tmp["subreddit"].astype(str)

tmp = tmp[(tmp['subreddit']<tmp['subreddit_neighbor'])]

In [29]:
tmp.head(10)

Unnamed: 0,img,subreddit_name,subreddit,subreddit_name_neighbor,subreddit_neighbor,ukey1,ukey2
53033,20089,,-1,pics,694,-1_694,694_-1
13946,1233,2006Scape,0,gaming,545,0_545,545_0
13946,1233,2006Scape,0,pics,694,0_694,694_0
13946,1233,2006Scape,0,roosterteeth,734,0_734,734_0
13946,1233,2006Scape,0,fitnesscirclejerk,531,0_531,531_0
13946,1233,2006Scape,0,gifs,557,0_557,557_0
13946,1233,2006Scape,0,GifSound,153,0_153,153_0
13946,1233,2006Scape,0,reactiongifs,723,0_723,723_0
13946,1233,2006Scape,0,reddit.com,724,0_724,724_0
13946,1233,2006Scape,0,funny,539,0_539,539_0


In [30]:
tmp.shape

(49257, 7)

In [31]:
tmp.drop_duplicates(subset=['subreddit','subreddit_neighbor'],inplace=True)

In [32]:
result = tmp.drop(labels=['img','subreddit_name','subreddit_name_neighbor','ukey1','ukey2'], axis=True)

In [33]:
result.head(10)

Unnamed: 0,subreddit,subreddit_neighbor
53033,-1,694
13946,0,545
13946,0,694
13946,0,734
13946,0,531
13946,0,557
13946,0,153
13946,0,723
13946,0,724
13946,0,539


In [34]:
result.shape

(5321, 2)

In [35]:
result.to_csv('./data/network.csv', index=None)

## Graph structure validation

In [36]:
graph[graph["subreddit"]==718]

Unnamed: 0,img,subreddit_name,subreddit
67230,23320,ragefaces,718


In [37]:
graph[graph["img"]==23320]

Unnamed: 0,img,subreddit_name,subreddit
67230,23320,ragefaces,718
67231,23320,moosedongs,644
67232,23320,funny,539


In [38]:
result[(result["subreddit"]==718)|(result["subreddit_neighbor"]==718)]

Unnamed: 0,subreddit,subreddit_neighbor
67232,539,718
67231,644,718


In [39]:
result[(result["subreddit"]==644)|(result["subreddit_neighbor"]==644)]

Unnamed: 0,subreddit,subreddit_neighbor
67232,539,644
67231,644,718
