In [247]:
import numpy as np
import os
import pandas as pd
import seaborn as sns

# Load dataset

In [35]:
def csv_file_list(root):
    """Return a fully-qualified list of csv filenames under root directory"""

    fullpaths = []
    for dirpath, dirname, filename in os.walk(root):
        for f in filename:
            if ".csv" in f:
                fullpaths.append(os.path.join(dirpath, f))
    return fullpaths

In [56]:
files = csv_file_list('/Users/haivule/Documents/USF/fall2/MSAN621/final_project/Data/')
files[:5]

['/Users/haivule/Documents/USF/fall2/MSAN621/final_project/Data/entertainment_anime.csv',
 '/Users/haivule/Documents/USF/fall2/MSAN621/final_project/Data/entertainment_comicbooks.csv',
 '/Users/haivule/Documents/USF/fall2/MSAN621/final_project/Data/entertainment_harrypotter.csv',
 '/Users/haivule/Documents/USF/fall2/MSAN621/final_project/Data/entertainment_movies.csv',
 '/Users/haivule/Documents/USF/fall2/MSAN621/final_project/Data/entertainment_music.csv']

The .csvs are named <metareddit>_<subreddit>.csv. 

Headers are:
```python
text,id,subreddit,meta,time,author,ups,downs,authorlinkkarma,authorkarma,authorisgold
```

In [166]:
headers = 'text,id,subreddit,meta,time,author,ups,downs,authorlinkkarma,authorkarma,authorisgold'.split(',')
headers

['text',
 'id',
 'subreddit',
 'meta',
 'time',
 'author',
 'ups',
 'downs',
 'authorlinkkarma',
 'authorkarma',
 'authorisgold']

In [42]:
df = pd.read_csv(files[0])
df.head(5)

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0,,0,1,2,3,4.0,5,6.0,7.0,8,9,10.0
1,1,0.0,because she s the worst,d02u69l,anime,entertainment,1455683000.0,Redire77,7.0,0.0,352,14017,0.0
2,2,1.0,i am referring to this http iimgurcom5sryl...,466ijy,anime,entertainment,1455683000.0,shiba_arata,0.0,0.0,1,20,0.0
3,3,2.0,cheating but zoldycks must have a great time a...,d02g879,anime,entertainment,1455661000.0,ShaKing807,6.0,0.0,1308,62021,1.0
4,4,3.0,kurosaki ichigo http images5fanpopcomimag...,d02v88z,anime,entertainment,1455685000.0,Tf2idlingftw,2.0,0.0,4156,1021,0.0


In [46]:
print(f"Size of the dataframe: {df.shape}")
print(f"Number of columns: {len('text,id,subreddit,meta,time,author,ups,downs,authorlinkkarma,authorkarma,authorisgold'.split(','))}")

Size of the dataframe: (90184, 13)
Number of columns: 11


__Note when loading datasets:__
The first row is redundant. So as the first 2 columsn. Therefore, when loading the dataset, we should remove these row and columns.


In [118]:
df2 = pd.read_csv(files[5], index_col=0)
# df2 = df2.iloc[1:, -11:]
df2.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,he died for our spins,d02mzs9,starwars,entertainment,1455672000.0,roccoshapiro,1.0,0.0,7303,2180,0.0
1,i do nt think he necessarily knows that has an...,d02qghe,starwars,entertainment,1455677000.0,p3wp3wkachu,4.0,0.0,1,242,0.0
2,both times i saw it it seemed to me that mean...,d02skyc,starwars,entertainment,1455680000.0,Friendly_B,1.0,0.0,234,2865,0.0
3,,465u08,starwars,entertainment,1455673000.0,ChickenStrips45,4.0,0.0,4013,3512,0.0
4,the tie fighters flying over was a nice touch ...,d02kdfr,starwars,entertainment,1455668000.0,ltsmiles,1.0,0.0,53952,20152,0.0


In [84]:
df.iloc[1:, -11:].head(5)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11
1,because she s the worst,d02u69l,anime,entertainment,1455683000.0,Redire77,7.0,0.0,352,14017,0.0
2,i am referring to this http iimgurcom5sryl...,466ijy,anime,entertainment,1455683000.0,shiba_arata,0.0,0.0,1,20,0.0
3,cheating but zoldycks must have a great time a...,d02g879,anime,entertainment,1455661000.0,ShaKing807,6.0,0.0,1308,62021,1.0
4,kurosaki ichigo http images5fanpopcomimag...,d02v88z,anime,entertainment,1455685000.0,Tf2idlingftw,2.0,0.0,4156,1021,0.0
5,there are a shit ton of koutarous but the pre...,d02ggn7,anime,entertainment,1455662000.0,Angst-Incarnate,1.0,0.0,137,41859,0.0


In [114]:
df2.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
1,i do nt think he necessarily knows that has an...,d02qghe,starwars,entertainment,1455677000.0,p3wp3wkachu,4.0,0.0,1,242,0.0
2,both times i saw it it seemed to me that mean...,d02skyc,starwars,entertainment,1455680000.0,Friendly_B,1.0,0.0,234,2865,0.0
3,,465u08,starwars,entertainment,1455673000.0,ChickenStrips45,4.0,0.0,4013,3512,0.0
4,the tie fighters flying over was a nice touch ...,d02kdfr,starwars,entertainment,1455668000.0,ltsmiles,1.0,0.0,53952,20152,0.0
5,they sure did https youtubeqds6re3hgo,d01oq0x,starwars,entertainment,1455608000.0,cyborgcommando0,1.0,0.0,63181,66452,1.0


### Load and combine all files into one dataframe

In [238]:
def load_data(filelist):
    data_list = []
    for file in filelist:
        df = pd.read_csv(file, index_col=0, header=0)  # to move the redundant rows and columns
        df = df.iloc[:, -11:]
        df.columns = headers
        data_list.append(df)
    return pd.concat(data_list, ignore_index=True)  # ignore_index = True so that all the rows are re-indexed

In [239]:
reddit = load_data(files)

In [240]:
reddit.head(5)

Unnamed: 0,text,id,subreddit,meta,time,author,ups,downs,authorlinkkarma,authorkarma,authorisgold
0,0,1,2,3,4.0,5,6.0,7.0,8.0,9.0,10.0
1,because she s the worst,d02u69l,anime,entertainment,1455683000.0,Redire77,7.0,0.0,352.0,14017.0,0.0
2,i am referring to this http iimgurcom5sryl...,466ijy,anime,entertainment,1455683000.0,shiba_arata,0.0,0.0,1.0,20.0,0.0
3,cheating but zoldycks must have a great time a...,d02g879,anime,entertainment,1455661000.0,ShaKing807,6.0,0.0,1308.0,62021.0,1.0
4,kurosaki ichigo http images5fanpopcomimag...,d02v88z,anime,entertainment,1455685000.0,Tf2idlingftw,2.0,0.0,4156.0,1021.0,0.0


In [241]:
reddit.tail(5)

Unnamed: 0,text,id,subreddit,meta,time,author,ups,downs,authorlinkkarma,authorkarma,authorisgold
2663647,i literally started screaming when sam was eat...,d01qs43,thewalkingdead,television,1455616000.0,NorthWeapon,1.0,0.0,6.0,56.0,0.0
2663648,literally the only reason for ron s existence ...,d00stxn,thewalkingdead,television,1455556000.0,eightdaysdead,-16.0,0.0,1.0,6.0,0.0
2663649,never seen game of thrones eh the walking dea...,d00vzrc,thewalkingdead,television,1455561000.0,lumpy999,-37.0,0.0,1.0,7524.0,0.0
2663650,i thought that scene was boring and tame if th...,d00pn26,thewalkingdead,television,1455551000.0,icyhot39,-38.0,0.0,1.0,1532.0,0.0
2663651,deleted,d00g2sg,thewalkingdead,television,1455522000.0,[deleted],-10.0,0.0,0.0,0.0,


In [242]:
reddit.shape

(2663652, 11)

In [251]:
for comment in reddit.text.head(20):
    print(comment+'\n')

0

because she s the worst 

i am referring to  this   http  iimgurcom5srylmijpg  does it have any deeper meaning or does it signify anything  i just do nt get it why she d do that 

cheating but zoldycks must have a great time at thanksgiving 

 kurosaki ichigo    http  images5fanpopcomimagephotos29000000ichigowallpaperkurosakiichigo290694271024768jpg  and  kurosaki mea   http  staticzerochannetkurosakimeafull1689483jpg 

there are a shit ton of koutarous  but the presence of  one   https  smediacacheak0pinimgcomoriginals1219ed1219ed717fc2bfce372759bba2fe1cfegif  is enough to make it the most interesting party 

they went full free in the end 

by far the best episode of this show this was pure fun to watch from beginning to end 

gal stan might as well be team durarara with masaomi  mikado  and rochi

hi th3jester_  it seems like you might be looking for anime recommendations  the users of this subreddit came up with  an awesome recommendations flowchart   http  iimgurcomq9xjv4pjpg  

In [261]:
reddit2 = pd.read_csv('comments.csv', header=None)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,,4644c4,libertarian,news,1455651000.0,DrWinters,3.0,0.0,12728.0,1641.0,0.0
1,,464kvf,libertarian,news,1455657000.0,unknownman19,20.0,0.0,50364.0,14436.0,0.0
2,,466hv9,libertarian,news,1455683000.0,hp_chabanais,1.0,0.0,178.0,3.0,0.0
3,,462wog,libertarian,news,1455636000.0,ghostofpennwast,18.0,0.0,231424.0,80539.0,1.0
4,disclaimer : i think obama should nominate som...,d028c5d,politics,news,1455651000.0,degausse,3.0,0.0,1.0,1941.0,0.0


In [262]:
headers2 = ['text', 'id', 'subreddit', 'meta', 'time', 'author', 'ups', 'downs', 'authorlinkkarma', 'authorcommentkarma', 'authorisgold']
reddit2.columns = headers2

In [259]:
reddit2.shape

(2726000, 10)

In [263]:
reddit2.head(5)

Unnamed: 0,text,id,subreddit,meta,time,author,ups,downs,authorlinkkarma,authorcommentkarma,authorisgold
0,,4644c4,libertarian,news,1455651000.0,DrWinters,3.0,0.0,12728.0,1641.0,0.0
1,,464kvf,libertarian,news,1455657000.0,unknownman19,20.0,0.0,50364.0,14436.0,0.0
2,,466hv9,libertarian,news,1455683000.0,hp_chabanais,1.0,0.0,178.0,3.0,0.0
3,,462wog,libertarian,news,1455636000.0,ghostofpennwast,18.0,0.0,231424.0,80539.0,1.0
4,disclaimer : i think obama should nominate som...,d028c5d,politics,news,1455651000.0,degausse,3.0,0.0,1.0,1941.0,0.0


In [267]:
for text in reddit2.text.head(20):
    print(f"{text}\n")

nan

nan

nan

nan

disclaimer : i think obama should nominate someone , and the senate should hold confirmation hearings. but this article does not accurately describe most republicans ' statements on this issue . the vast majority of republican statements ( although , notably , not the cruz one cited in the article ) carefully phrase the issue as whether a nominee has been " nominated and confirmed " in the president 's last year in office . although kennedy was confirmed in 1988 , he was nominated by reagan at the end of 1987 , so he does n't fall under this definition .

nan

either way the process will be dragged out until after the elections. the gop have the votes in the senate to say no and would rather gamble in 2016. they at least had the decency to inform everyone of what exactly they 're doing this time around .

republicans have always battled with severe cases of selective amnesia .

politics were so different back then. people on both sides had to at least pretend to be 