## Tasks:

- Import comment/reply data
- Determine whether each comment contains "immigrant"/"immigrat" or not
- Determine whether each comment is a top-level or second-level comment, and store second-level comments with their corresponding top-level comments
- Show statistics
- Determine whether each comment was made by a Trump supporter, Bernie/Hillary supporter, or neither
- Show statistics

In [12]:
import json
import csv
import pandas as pd
import numpy as np

from IPython.display import display, HTML

## Preprocess comments

In [7]:
def remove_bad_comments(chunk):
    """Drop rows whose comment text is missing or is a known placeholder.

    Parameters
    ----------
    chunk : pandas.DataFrame
        Frame with a 'body' column holding the comment text.

    Returns
    -------
    pandas.DataFrame
        The rows of ``chunk`` (index and columns preserved) whose 'body' is
        not null, is not exactly '[deleted]' or '[removed]', and does not
        contain 'thank you for participating' (case-insensitive; presumably
        an automated moderation notice).
    """
    chunk_notna = chunk.loc[chunk['body'].notna()]

    # read_csv may infer a numeric type for a comment such as "2016"; casting
    # to str keeps the text checks below from failing on such values.
    body_text = chunk_notna['body'].astype(str)

    is_placeholder = body_text.isin(['[deleted]', '[removed]'])
    is_notice = body_text.str.lower().str.contains('thank you for participating', regex=False)

    return chunk_notna.loc[~(is_placeholder | is_notice)]

In [2]:
# Contains "immigrant" or other forms of the word
def contains_immigrant_narrow(txt):
    """Return True when ``txt`` contains "immigrant" or "immigrat", ignoring case.

    Both stems are checked because neither is a substring of the other:
    "immigrat" covers immigrate/immigration, "immigrant" covers immigrant(s).
    """
    lowered = txt.lower()
    return any(stem in lowered for stem in ("immigrant", "immigrat"))

In [8]:
chunk_size = 5 * (10 ** 5)
iter_num = 0
comments_filename = "../sample_data/politics_comments_2015_2016_2.csv"
out_filename = "stored_variables/contains_immigrant_comments.csv"

# Stream the raw comment file in chunks so it never has to fit in memory.
# For every surviving comment, store its ids plus a boolean keyword flag;
# the comment text itself is not written out.
for chunk in pd.read_csv(comments_filename, 
                         header=None, 
                         names = ['author', 'subreddit', 'body', 'score', 'created_dt', 'link_id', 'parent_id', 'id'],
                         usecols = ['author', 'body', 'link_id', 'parent_id', 'id'],
                         index_col = False,
                         chunksize = chunk_size
                         ):
    iter_num += 1
    chunk_valid = remove_bad_comments(chunk)

    # str() guards against bodies that read_csv parsed as numbers.
    contains_immigrant_col = [contains_immigrant_narrow(str(t)) for t in chunk_valid['body'].values]
    chunk_contains = chunk_valid.drop(columns=['body'])
    chunk_contains['contains_imm'] = contains_immigrant_col

    # Truncate on the first chunk and append afterwards, so re-running this
    # cell rebuilds the file instead of appending a second copy of every row.
    chunk_contains.to_csv(out_filename,
                          mode='w' if iter_num == 1 else 'a',
                          header=False,
                          index=False)

    # Progress marker: an upper bound on rows read so far (the last chunk may be short).
    print(iter_num * chunk_size)

500000
1000000
1500000
2000000
2500000
3000000
3500000
4000000
4500000
5000000
5500000
6000000
6500000
7000000
7500000
8000000
8500000
9000000
9500000
10000000
10500000
11000000
11500000
12000000
12500000
13000000
13500000
14000000
14500000
15000000
15500000
16000000
16500000
17000000
17500000
18000000
18500000
19000000
19500000
20000000
20500000
21000000
21500000
22000000
22500000
23000000
23500000


## Preprocess user groups 

(Trump supporters, Sanders supporters, etc.)

In [77]:
# Per-author table read from a header-less CSV; column names are supplied here.
# The num_* columns are presumably per-candidate comment counts (verify against
# the script that produced the file).
partisan_data = pd.read_csv(
    "../sample_data/politics_auth_partisan.csv",
    names=['author', 'num_trump', 'num_sanders', 'num_clinton'],
    header=None,
    index_col=False,
)

In [78]:
# Pull each column out as a plain array once.
df_authors = partisan_data['author'].values
df_trump, df_sanders, df_clinton = (
    partisan_data[col].values for col in ('num_trump', 'num_sanders', 'num_clinton')
)

# author -> count lookups. If an author appears on several rows, the last row wins.
num_sanders = dict(zip(df_authors, df_sanders))
num_trump = dict(zip(df_authors, df_trump))
num_clinton = dict(zip(df_authors, df_clinton))

In [89]:
comment_threshold = 5

# Row positions (into df_authors) of authors in each group. An author belongs
# to a candidate's group when that candidate's count reaches the threshold and
# the other two counts are zero; "none" means all three counts are zero.
sanders_user_indices = []
trump_user_indices = []
clinton_user_indices = []
none_user_indices = []

for author_idx, author_name in enumerate(df_authors):
    n_sanders = num_sanders[author_name]
    n_trump = num_trump[author_name]
    n_clinton = num_clinton[author_name]

    if n_sanders >= comment_threshold and n_trump == 0 and n_clinton == 0:
        sanders_user_indices.append(author_idx)
    if n_trump >= comment_threshold and n_sanders == 0 and n_clinton == 0:
        trump_user_indices.append(author_idx)
    if n_clinton >= comment_threshold and n_sanders == 0 and n_trump == 0:
        clinton_user_indices.append(author_idx)
    if n_sanders == 0 and n_trump == 0 and n_clinton == 0:
        none_user_indices.append(author_idx)

In [91]:
# Author-name sets for fast membership tests.
trump_users = {df_authors[i] for i in trump_user_indices}
sanders_users = {df_authors[i] for i in sanders_user_indices}

## Structure by comments and replies

In [21]:
# Start with nothing collected; the chunked read in the next cell assigns the
# frame of top-level comments to this name.
toplevel_comments = None

In [22]:
chunk_size = 5 * (10 ** 6)
iter_num = 0
preprocessed_filename = "stored_variables/contains_immigrant_comments.csv"

# Collect the per-chunk results in a list and concatenate once after the loop.
# This avoids recopying the accumulated frame on every iteration, and it makes
# the cell idempotent: re-running it rebuilds the frame instead of appending to
# whatever was left over from a previous run.
toplevel_chunks = []

for chunk in pd.read_csv(preprocessed_filename, 
                         header=None, 
                         names = ['author', 'link_id', 'parent_id', 'id', 'contains_imm'],
                         index_col = False,
                         chunksize = chunk_size
                         ):
    iter_num += 1
    
    # A comment is treated as top-level when its parent_id equals its link_id.
    chunk_toplevel = chunk.loc[chunk['link_id'] == chunk['parent_id']]
    toplevel_chunks.append(chunk_toplevel)
    
    # Progress marker: an upper bound on rows read so far (the last chunk may be short).
    print(iter_num * chunk_size)

toplevel_comments = pd.concat(toplevel_chunks)

5000000
10000000
15000000
20000000
25000000


In [26]:
# Show the top-level comments, then the number flagged as containing the keyword.
display(toplevel_comments)
print(np.sum(toplevel_comments['contains_imm'].values))

Unnamed: 0,author,link_id,parent_id,id,contains_imm
1,MrXhin,t3_2xf5nu,t3_2xf5nu,cp0cr4a,False
8,bilsonM,t3_2xhrex,t3_2xhrex,cp0crhg,False
12,grewapair,t3_2xgj64,t3_2xgj64,cp0crwn,False
17,IronTek,t3_2xh4a8,t3_2xh4a8,cp0csga,False
18,JumpingJazzJam,t3_2xi5pn,t3_2xi5pn,cp0csiu,False
...,...,...,...,...,...
21566894,JumpingJazzJam,t3_2xid7j,t3_2xid7j,cp0cn9v,False
21566902,MrXhin,t3_2xhctk,t3_2xhctk,cp0co2r,False
21566905,rudieboy,t3_2xh4a8,t3_2xh4a8,cp0cp17,False
21566909,simjanes2k,t3_2xhrex,t3_2xhrex,cp0cpe3,False


24845


In [38]:
# Ids of all top-level comments, kept as a set for fast membership tests.
toplevel_ids = set(toplevel_comments['id'].tolist())

In [39]:
# Start with nothing collected; the chunked read in the next cell assigns the
# frame of second-level comments to this name.
level2_comments = None

In [40]:
chunk_size = 5 * (10 ** 6)
iter_num = 0
preprocessed_filename = "stored_variables/contains_immigrant_comments.csv"

# Collect the per-chunk results in a list and concatenate once after the loop.
# This avoids recopying the accumulated frame on every iteration, and it makes
# the cell idempotent: re-running it rebuilds the frame instead of appending to
# whatever was left over from a previous run.
level2_chunks = []

for chunk in pd.read_csv(preprocessed_filename, 
                         header=None, 
                         names = ['author', 'link_id', 'parent_id', 'id', 'contains_imm'],
                         index_col = False,
                         chunksize = chunk_size
                         ):
    iter_num += 1
    
    # parent_id carries a 3-character type prefix (e.g. "t1_"), while 'id' is
    # bare, so strip the prefix before comparing. astype(str) turns a missing
    # parent_id into a string that matches no id instead of raising.
    parent_ids = chunk['parent_id'].astype(str).str[3:]
    is_level2 = parent_ids.isin(toplevel_ids)
    
    chunk_level2 = chunk.loc[is_level2]
    level2_chunks.append(chunk_level2)
    
    # Progress marker: an upper bound on rows read so far (the last chunk may be short).
    print(iter_num * chunk_size)

level2_comments = pd.concat(level2_chunks)

5000000
10000000
15000000
20000000
25000000


In [41]:
# Show the second-level comments, then the number flagged as containing the keyword.
display(level2_comments)
print(np.sum(level2_comments['contains_imm'].values))

Unnamed: 0,author,link_id,parent_id,id,contains_imm
2,sickofthisshit,t3_2xiaj4,t1_cp0c2sz,cp0cr54,False
19,gtfooh1011,t3_2xid7j,t1_cp0ci37,cp0csp8,False
30,exixx,t3_2xh4a8,t1_cp0alm1,cp0cumz,False
33,pamthecowfarmier,t3_2xhrex,t1_cp0b5ao,cp0cuvz,False
47,kikowatzy,t3_2xhn52,t1_cp07r0e,cp0cwhf,False
...,...,...,...,...,...
21566889,kikowatzy,t3_2xhrex,t1_cp09gmy,cp0cmil,False
21566896,Temp237,t3_2xh4a8,t1_cp08uy6,cp0cne8,False
21566904,RockFourFour,t3_2xfe5d,t1_coztar4,cp0cows,False
21566910,rsc2,t3_2xhrex,t1_cp0baoz,cp0cpog,False


22992


In [42]:
# Parent id without its 3-character prefix, so it can be matched against bare comment ids.
level2_comments['parent_id_mod'] = level2_comments['parent_id'].str[3:]

In [43]:
# Remove the raw threading columns from both frames.
toplevel_comments = toplevel_comments.drop(['link_id', 'parent_id'], axis=1)
level2_comments = level2_comments.drop(['link_id', 'parent_id'], axis=1)

In [45]:
# Render both frames, top-level first.
display(toplevel_comments, level2_comments)

Unnamed: 0,author,id,contains_imm
1,MrXhin,cp0cr4a,False
8,bilsonM,cp0crhg,False
12,grewapair,cp0crwn,False
17,IronTek,cp0csga,False
18,JumpingJazzJam,cp0csiu,False
...,...,...,...
21566894,JumpingJazzJam,cp0cn9v,False
21566902,MrXhin,cp0co2r,False
21566905,rudieboy,cp0cp17,False
21566909,simjanes2k,cp0cpe3,False


Unnamed: 0,author,id,contains_imm,parent_id_mod
2,sickofthisshit,cp0cr54,False,cp0c2sz
19,gtfooh1011,cp0csp8,False,cp0ci37
30,exixx,cp0cumz,False,cp0alm1
33,pamthecowfarmier,cp0cuvz,False,cp0b5ao
47,kikowatzy,cp0cwhf,False,cp07r0e
...,...,...,...,...
21566889,kikowatzy,cp0cmil,False,cp09gmy
21566896,Temp237,cp0cne8,False,cp08uy6
21566904,RockFourFour,cp0cows,False,coztar4
21566910,rsc2,cp0cpog,False,cp0baoz


In [47]:
# Give the top-level columns parent-specific names so they stay distinct from
# the reply columns once the two frames are merged.
toplevel_parent = toplevel_comments.rename(
    columns=dict(author='parent_author',
                 id='parents_id',
                 contains_imm='parent_contains_imm')
)

In [50]:
# Right join: one output row per second-level comment, with its parent's
# columns attached on the left.
matched_comments = pd.merge(
    toplevel_parent,
    level2_comments,
    how='right',
    left_on='parents_id',
    right_on='parent_id_mod',
)

In [51]:
# Inspect the merged parent/reply table.
display(matched_comments)

Unnamed: 0,parent_author,parents_id,parent_contains_imm,author,id,contains_imm,parent_id_mod
0,bilsonM,cp0crhg,False,al_kohalik,cp0qro2,False,cp0crhg
1,bilsonM,cp0crhg,False,kikowatzy,cp0tl1h,False,cp0crhg
2,cpt_caveman,cp0cx5v,False,RealRepub,cp0jrhh,False,cp0cx5v
3,MrXhin,cp0cxy2,False,HalNavel,cp0krt4,False,cp0cxy2
4,MrXhin,cp0cxy2,False,Drewstom,cp0m19t,False,cp0cxy2
...,...,...,...,...,...,...,...
3554752,kode7,cp0cdzx,False,[deleted],cp0cluq,False,cp0cdzx
3554753,ben1204,cp0cf9y,False,whatnowdog,cp0d0vz,False,cp0cf9y
3554754,[deleted],cp0ci37,False,gtfooh1011,cp0csp8,False,cp0ci37
3554755,dangerjo,cp0clva,False,currentlydrinking,cp0exhe,False,cp0clva


In [93]:
# Optional cache (disabled): write the merged frame to disk without header or index.
# matched_comments.to_csv("stored_variables/matched_immigrant_comments.csv", header=False, index=False)

In [95]:
# Optional cache (disabled): reload the merged frame from disk. The file has no
# header row, so the column names are supplied here.
# matched_comments = pd.read_csv("stored_variables/matched_immigrant_comments.csv", 
#                          header=None, 
#                          names = ['parent_author', 'parents_id', 'parent_contains_imm', 
#                                   'author', 'id', 'contains_imm', 'parent_id_mod'],
#                          index_col = False
#                          )

In [66]:
# Replies flagged for the keyword whose parent is not flagged.
np.logical_and(
    np.logical_not(matched_comments['parent_contains_imm'].to_numpy()),
    matched_comments['contains_imm'].to_numpy(),
).sum()

16297

In [67]:
# Replies flagged for the keyword whose parent is flagged too.
np.logical_and(
    matched_comments['parent_contains_imm'].to_numpy(),
    matched_comments['contains_imm'].to_numpy(),
).sum()

6695

In [68]:
# Replies not flagged for the keyword whose parent is flagged.
np.logical_and(
    matched_comments['parent_contains_imm'].to_numpy(),
    np.logical_not(matched_comments['contains_imm'].to_numpy()),
).sum()

28253

In [69]:
# Replies where neither the reply nor its parent is flagged.
np.logical_and(
    np.logical_not(matched_comments['parent_contains_imm'].to_numpy()),
    np.logical_not(matched_comments['contains_imm'].to_numpy()),
).sum()

3503512

| | Parent contains 'immigrant' | Parent does not |
|---|---|---|
|Child contains 'immigrant'| 6695  | 16297  |
|Child does not| 28253  | 3503512  |


## Determine whether there is a difference between Trump supporters and others

In [74]:
# Count rows whose parent author, then reply author, is a placeholder name.
# Printed order: parent/[deleted], parent/[removed], reply/[deleted], reply/[removed].
for author_col in ('parent_author', 'author'):
    for placeholder in ('[deleted]', '[removed]'):
        print(np.sum(matched_comments[author_col].values == placeholder))

67911
0
57124
0
