In [21]:
import pandas as pd
import numpy as np
from datetime import datetime
from dateutil.relativedelta import relativedelta
import re
import networkx as nx
from tqdm.notebook import tqdm
tqdm.pandas()
import os
import pickle

# Import the bot list retrieved online

In [2]:
bot_list = pd.read_csv('/Users/victor/Desktop/CS/UST_summer/bot_list.csv')
bot_list['Username'] = bot_list.Username.apply(lambda x: x[3:])

# Import the top spammers who are considered as bots

In [3]:
popular_author = pd.read_csv('/Users/victor/Desktop/CS/UST_summer/10k_sub_final.csv')

popular_bot = popular_author[popular_author.is_bot == 1]
popular_bot.head()

Unnamed: 0,author,is_bot,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6
0,AutoModerator,1,,,,,
25,VisualMod,1,,,,,
27,___alexa___,1,,,,,
32,dwdsquared2,1,,,,,


In [4]:
def advanced_clean(name):
    
    tokens = re.findall(r'[A-Z](?:[A-Z]*(?![a-z])|[a-z]*)', name)
    tokens = list(map(lambda x: x.lower(), tokens))
    
    tokens = list (map(lambda x: re.split('[^a-zA-Z]',x), tokens))
    tokens = [item for sublist in tokens for item in sublist]
    
    if 'not' in tokens and 'bot' in tokens:
        return False

    elif 'bot' in tokens:
        return True
    
    return False

def create_name_id_dict(dataframe):    #create id:author dataframe
    name_id = {}
    for i in range(dataframe.shape[0]):
        key = dataframe.id.iloc[i]
        value = dataframe.author.iloc[i]

        key_value_pair = {key:value}

        name_id.update(key_value_pair)
    return name_id

def clean_by_parent(dataframe, name_id):
    """
    A function that is used to clean any identified BOT from the parent_id in a dataframe
    Args:
        dataframe (pandas.dataframe): the dataframe to be exmained
        name_id (dictionary): a dictionary having the id as the key, its corresponding name as value

    Returns:
        dataframe: the dataframe that is cleaned by parent_id
    """
    drop_indices = []

    for index, row in dataframe.iterrows():
        uid = str(row['parent_id']).split('_')[-1]
        # use the dictionary to find the corresponding author name
        try:
            author_name = name_id[uid]
        except:
            # this would happen if the poster created the post in posterior months
            continue
        if advanced_clean(author_name):
            drop_indices.append(i)
    
    dataframe = dataframe.drop(drop_indices)
    
    return dataframe.reset_index(drop=True)

def full_clean (dataframe):
    # shallow clean
    dataframe = dataframe[dataframe.author != '[deleted]']
    dataframe = dataframe[dataframe.author != 'AutoModerator']
    dataframe = dataframe[dataframe.author != 'VisualMod']
    
    # deep clean 
    dataframe['is_bot']  = dataframe['author'].progress_apply(advanced_clean)

    dataframe = dataframe[dataframe.is_bot == False]
    
    # further clean
    dataframe['is_bot'] = dataframe.author.isin(bot_list.Username)

    dataframe = dataframe[dataframe.is_bot == False]
    
    dataframe['is_bot'] = dataframe.author.isin(popular_bot.author)

    dataframe = dataframe[dataframe.is_bot == False]
    
    # clean by parent_id
    dataframe = clean_by_parent(dataframe, create_name_id_dict(dataframe))
    
    return dataframe

# Create daily subdataframe

In [5]:
def workout_time(year, month):
  """
  A function that works out the timestamps of each day in a month
  Args:
      year (int): the year
      month (int): the month

  Returns:
      day_list: the timestamp of each day in a month
  """
  max_day = 31
  month_list = [4, 6, 9, 11]

  if year == 2020 and month == 2:
    max_day = 29
  elif month == 2:
    max_day = 28
  elif month in month_list:
    max_day = 30
  
  start_date = datetime(year, month, 1, 0) # always start at the first day on each month

  day_list = []

  for i in range(max_day):
    count_time = start_date + relativedelta(days=i)
    count_time = count_time.timestamp()
    day_list.append(count_time)
  
  return day_list

In [6]:
def create_subframe(year, month, dataframe):
  """
  A funtion that creates sub_dataframes from the parent dataframe based on number of days
  Args:
      year (int): the year
      month (int): the month
      dataframe(pandas.dataframe): the MONTH dataframe we are examine

  Returns:
      df_list: a list of DAY dataframe
  """
  df_list = []

  day_list = workout_time(year, month)

  for i in range(len(day_list)-1):
    left = day_list[i]
    right = day_list[i+1]

    day_df = dataframe[(dataframe.created_utc >= left) & (dataframe.created_utc < right)]
    
    df_list.append(day_df)
  
  return df_list

In [7]:
def create_t3_name_id_dict(dataframe):    #create id:author dataframe
    name_id = {}
    for i in range(dataframe.shape[0]):
        key = dataframe.id.iloc[i]
        value = dataframe.author.iloc[i]

        key_value_pair = {key:value}

        name_id.update(key_value_pair)
    return name_id


In [8]:
def merge_df(comment_df, post_df):
    col_zero = np.zeros(post_df.shape[0])
    
    post_df['parent_id'] = pd.Series(col_zero)
    post_df['link_id'] = pd.Series(col_zero)
    concat_df = pd.concat([comment_df, post_df])
    return concat_df

# For t3_df:

In [9]:
def create_level_1(dataframe):     #create the output dataframe
    comments = dataframe[dataframe['parent_id'] == dataframe['link_id']]
    level_1_comments = comments[comments['parent_id'] != 0]
    t_3_posts_authors  = level_1_comments.groupby('parent_id')['author'].apply(list).reset_index(name='author')
    t_3_posts_authors['sub_comments'] = t_3_posts_authors['author'].progress_apply(lambda x:len(x))
    
    posts = comments[comments['parent_id'] == 0]
    t3_name_id = {}
    for i in range(posts.shape[0]):
        key = posts.id.iloc[i]
        value = posts.author.iloc[i]

        key_value_pair = {key:value}

        t3_name_id.update(key_value_pair)    
    return t3_name_id, t_3_posts_authors

# For t1_df:

In [10]:

def create_t1_df(dataframe):
    """
    A function that creates an output dataframe for us to create the edge list later
    Args:
        daydf (pandas.dataframe): a DAY dataframe to be examined

    Returns:
        main_authors: a list of authors the gives the main post
        output_df: the output dataframe we want
    """
    level_1_comments = dataframe[dataframe['parent_id'] == dataframe['link_id']]

    sub_comments = dataframe[~dataframe.index.isin(level_1_comments.index)] #setting the sub comments being those with index that are not in level_1_comments

    t1_df  = sub_comments.groupby('parent_id')['author'].apply(list).reset_index(name='author')
    t1_df['sub_comments'] = t1_df['author'].progress_apply(lambda x:len(x))
    t1_df.sort_values('sub_comments', inplace=True, ascending=False)

    t1_name_id = {}
    for i in range(sub_comments.shape[0]):
        key = sub_comments.id.iloc[i]
        value = sub_comments.author.iloc[i]

        key_value_pair = {key:value}

        t1_name_id.update(key_value_pair)    

    return t1_name_id, t1_df

In [11]:
def is_main(dataframe, t1_name_id, t3_name_id):
    main_list = []
    parent_author_list = []
    dataframe = dataframe.reset_index(drop = True)
    
    for index, row in dataframe.iterrows():
        uid = row['parent_id'].split('_')[-1]
        
        pre = row['parent_id'].split('_')[0]
        if pre == 't1':
            main_list.append(0)
            try:
                parent_author = t1_name_id[uid]
                parent_author_list.append(parent_author)
            except:
                parent_author_list.append(np.nan)
        else:
            main_list.append(1)
            try:
                parent_author = t3_name_id[uid]
                parent_author_list.append(parent_author)
            except:
                parent_author_list.append(np.nan)        

        if index == 100000: print(index)
    dataframe['main_post'] =  main_list
    dataframe['parent_author'] = parent_author_list
    return dataframe

In [12]:
def get_edge_list(outputdf, daydf): 
    source_list = []
    target_list = []
    is_main_list = []
    non_singleton = set()
    potential_singleton = set()
    non_single_post = set()
    
    authordf = set(daydf[daydf['parent_id']==0]['author'])
    for index, row in outputdf.iterrows():
        target = row['parent_author']
        non_single_post.add(target)
        author_list = row['author']
        post_type = row['parent_id'].split('_')[0]
        if post_type == 't1':
            main_post = 0
        else:
            main_post = 1
        if target != np.nan:
            non_singleton.add(target)
            for author in author_list:
                source_list.append(author)
                target_list.append(target)
                non_singleton.add(author)
                is_main_list.append(main_post)
        else:
            for author in author_list:
                potential_singleton.add(author)
    single_main = authordf - non_single_post
    singletons = set(potential_singleton) - set(non_singleton)
    singletons = singletons.union(single_main)
    for singleton in singletons:
        source_list.append(singleton)
        target_list.append(singleton)
        is_main_list.append(np.nan)
    
    edge_list = pd.DataFrame(list(zip(source_list, target_list, is_main_list)), columns = ['Source', 'Target', 'is_main'])
    
    return edge_list

In [18]:
#month walker
def month_walker(comment_df, post_df, year, month):
    month_info = dict()
    concat_df = merge_df(comment_df, post_df)
    #clean dataframe
    cleaned_df = full_clean(concat_df)
    #get the output df list
    df_list = create_subframe(year, month, cleaned_df)
    day = 0
    for daydf in df_list:
        day += 1
        #get t3_df
        t3_name_id, t3_posts_authors = create_level_1(daydf)
        
        #get t1_df
        t1_name_id, t1_df = create_t1_df(daydf)
        
        #concat the t1 and t3 df to get the complete dataframe
        full_df = pd.concat([t1_df, t3_posts_authors])
        full_df = full_df.reset_index(drop=True)
        complete_df = is_main(full_df, t1_name_id, t3_name_id)
        #get edge_list   
        edge_list = get_edge_list(complete_df, daydf)
        edge_list = edge_list[['Source', 'Target']]
        edge_list = edge_list.groupby(edge_list.columns.tolist()).size().reset_index().rename(columns={0:'weight'})
        edge_list['inverse_weight'] = 1/edge_list['weight']
        #compute network parameters (potentially merge on name)

        G = nx.from_pandas_edgelist(edge_list, source = "Source", target = "Target", edge_attr= ["weight", "inverse_weight"], create_using = nx.DiGraph()) #Returns a graph from Pandas DataFrame containing an edge list.
        
        # get the list of post author names
        daydf = daydf[daydf['parent_id']==0]
        post_authors = daydf['author'].to_numpy()
        
        # these two attr only for post nodes
        avg_neighbour_deg = nx.average_neighbor_degree(G, weight="weight", nodes=post_authors)
        avg_neighbour_deg = pd.DataFrame(avg_neighbour_deg.items(), columns=['author', 'avg_neighbout_degree'])
        
        cluster_coeff = nx.clustering(G, nodes=post_authors, weight='weight')
        cluster_coeff = pd.DataFrame(cluster_coeff.items(), columns=['author', 'cluster_coeff'])
        
        post_node_merged_df = avg_neighbour_deg.merge(cluster_coeff)
        
        
        # these three attr for all nodes
        in_deg_cent = nx.in_degree_centrality(G)
        in_deg_cent = pd.DataFrame(in_deg_cent.items(), columns=['author', 'in_deg_cent'])
        
        out_deg_cent = nx.out_degree_centrality(G)
        out_deg_cent = pd.DataFrame(out_deg_cent.items(), columns=['author', 'out_deg_cent'])
        
        eigenvec_cent = nx.eigenvector_centrality(G, max_iter = 500, weight='weight')
        eigenvec_cent = pd.DataFrame(eigenvec_cent.items(), columns=['author', 'eigenvec_cent'])
        
        all_node_merged_df = in_deg_cent.merge(out_deg_cent)
        all_node_merged_df = all_node_merged_df.merge(eigenvec_cent)
        
        all_node_merged_df = all_node_merged_df.merge(post_node_merged_df, on=['author'], how='left')
        
        # these attr is done within for-loop for every post node
        ego_graph_density_dict = dict()
        closeness_cent_dict = dict()
        D = G.reverse()
        for node in post_authors:
            ego_graph = nx.ego_graph(D, node, undirected=False, distance='inverse_weight')
            ego_graph_density = nx.density(ego_graph)

            closeness_cent = nx.closeness_centrality(G, u = node, distance='inverse_weight')
            
            ego_graph_density_dict.update({node:ego_graph_density})
            closeness_cent_dict.update({node:closeness_cent})
        
        closeness_cent = pd.DataFrame(closeness_cent_dict.items(), columns=['author', 'closeness_cent'])
        ego_graph_density = pd.DataFrame(ego_graph_density_dict.items(), columns=['author', 'ego_graph_density'])
        post_node_merged_df = closeness_cent.merge(ego_graph_density)
        
        # this gives a dataframe for all the nodes' attr. The missing attr for post nodes shoulg have value nan.
        all_node_merged_df = all_node_merged_df.merge(post_node_merged_df, on=['author'], how='left') 
        
        
        #this saves the df to the month network list
        month_info.update({day : all_node_merged_df})
    return month_info

----------
<h1>Test</h1>

In [13]:
fp_comment = '/Users/victor/Desktop/CS/UST_summer/01-20_comments.csv'
# fp_post = '/Users/jerrylin/Downloads/all_posts_clean_1.csv'
# post = pd.read_csv(fp_post, low_memory = False)[['author', 'id', 'created_utc']]
comment = pd.read_csv(fp_comment, low_memory = False)[['author', 'id', 'created_utc', 'link_id', 'parent_id']]


In [14]:
comment.head()

Unnamed: 0,author,id,created_utc,link_id,parent_id
0,__TSLA__,fconp1p,1577836804,t3_ei988r,t1_fcocbfb
1,lerakk,fconpq2,1577836818,t3_ei8tl1,t1_fcon93n
2,optionseller,fconr5l,1577836849,t3_ei96zu,t3_ei96zu
3,Wolf_Of_1337_Street,fconrdc,1577836853,t3_eiatp9,t3_eiatp9
4,G00dAndPl3nty,fconsd2,1577836873,t3_efwy0l,t1_fcazzwu


In [None]:
month_info = month_walker(comment, post, 2020, 1)

In [None]:

with open('jan2020dailynetwork.pickle', 'wb') as handle:
    pickle.dump(month_info, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
month_info[1].to_csv('/Users/jerrylin/Downloads/01012020.csv')

In [None]:
month_info[1]

---------
<h1>Run with 1.5 yrs files</h1>

In [23]:
fp_comment = '/Users/victor/Desktop/CS/UST_summer/03-20_comments.csv'
comment = pd.read_csv(fp_comment, low_memory = False)[['author', 'id', 'created_utc', 'link_id', 'parent_id']]
fp_post = '/Users/victor/Desktop/CS/UST_summer/all_posts_clean_1.csv'
post = pd.read_csv(fp_post, low_memory = False)[['author', 'id', 'created_utc']]

In [24]:
month_info = month_walker(comment, post, 2020, 3)

  0%|          | 0/3321731 [00:00<?, ?it/s]

  0%|          | 0/455 [00:00<?, ?it/s]

  0%|          | 0/13993 [00:00<?, ?it/s]

  0%|          | 0/566 [00:00<?, ?it/s]

  0%|          | 0/25583 [00:00<?, ?it/s]

  0%|          | 0/540 [00:00<?, ?it/s]

  0%|          | 0/23145 [00:00<?, ?it/s]

  0%|          | 0/431 [00:00<?, ?it/s]

  0%|          | 0/20994 [00:00<?, ?it/s]

  0%|          | 0/463 [00:00<?, ?it/s]

  0%|          | 0/21881 [00:00<?, ?it/s]

  0%|          | 0/514 [00:00<?, ?it/s]

  0%|          | 0/24729 [00:00<?, ?it/s]

  0%|          | 0/402 [00:00<?, ?it/s]

  0%|          | 0/12741 [00:00<?, ?it/s]

  0%|          | 0/366 [00:00<?, ?it/s]

  0%|          | 0/13979 [00:00<?, ?it/s]

  0%|          | 0/719 [00:00<?, ?it/s]

  0%|          | 0/33265 [00:00<?, ?it/s]

  0%|          | 0/581 [00:00<?, ?it/s]

  0%|          | 0/27739 [00:00<?, ?it/s]

  0%|          | 0/585 [00:00<?, ?it/s]

  0%|          | 0/28400 [00:00<?, ?it/s]

  0%|          | 0/1012 [00:00<?, ?it/s]

  0%|          | 0/43730 [00:00<?, ?it/s]

  0%|          | 0/971 [00:00<?, ?it/s]

  0%|          | 0/44731 [00:00<?, ?it/s]

  0%|          | 0/599 [00:00<?, ?it/s]

  0%|          | 0/22589 [00:00<?, ?it/s]

  0%|          | 0/747 [00:00<?, ?it/s]

  0%|          | 0/24172 [00:00<?, ?it/s]

  0%|          | 0/1716 [00:00<?, ?it/s]

  0%|          | 0/46271 [00:00<?, ?it/s]

  0%|          | 0/788 [00:00<?, ?it/s]

  0%|          | 0/36742 [00:00<?, ?it/s]

  0%|          | 0/914 [00:00<?, ?it/s]

  0%|          | 0/44263 [00:00<?, ?it/s]

  0%|          | 0/863 [00:00<?, ?it/s]

  0%|          | 0/41379 [00:00<?, ?it/s]

  0%|          | 0/851 [00:00<?, ?it/s]

  0%|          | 0/39481 [00:00<?, ?it/s]

  0%|          | 0/452 [00:00<?, ?it/s]

  0%|          | 0/18619 [00:00<?, ?it/s]

  0%|          | 0/446 [00:00<?, ?it/s]

  0%|          | 0/21138 [00:00<?, ?it/s]

  0%|          | 0/644 [00:00<?, ?it/s]

  0%|          | 0/37351 [00:00<?, ?it/s]

  0%|          | 0/651 [00:00<?, ?it/s]

  0%|          | 0/36368 [00:00<?, ?it/s]

  0%|          | 0/673 [00:00<?, ?it/s]

  0%|          | 0/37067 [00:00<?, ?it/s]

  0%|          | 0/723 [00:00<?, ?it/s]

  0%|          | 0/38308 [00:00<?, ?it/s]

  0%|          | 0/612 [00:00<?, ?it/s]

  0%|          | 0/31341 [00:00<?, ?it/s]

  0%|          | 0/363 [00:00<?, ?it/s]

  0%|          | 0/14696 [00:00<?, ?it/s]

  0%|          | 0/322 [00:00<?, ?it/s]

  0%|          | 0/15016 [00:00<?, ?it/s]

  0%|          | 0/464 [00:00<?, ?it/s]

  0%|          | 0/23103 [00:00<?, ?it/s]

In [25]:
with open('mar2020dailynetwork.pickle', 'wb') as handle:
    pickle.dump(month_info, handle, protocol=pickle.HIGHEST_PROTOCOL)