In [1]:
import pandas as pd
import numpy as np
from os import listdir
from os.path import isfile, join
from datetime import datetime
import sys

def dailyCount(tws, verbose = True):
    '''
    input_file: data frame, which contains columns: user_id_str, created_at
    verobse: bool: print out some extra info 
    rtype:   data frame with three columns -- users_id_str, created_date, count
    '''
    
    tws = tws.loc[:,['user_id_str','created_at']]
    tws['created_at'] = pd.to_datetime(tws['created_at']).dt.date
    
    #loop through the file, count of tweets based on key (user_id, date)
    all_users = {}
    for i in range(len(tws)):
        (u,k) = tws.iloc[i]
        if u in all_users:
            if k in all_users[u]:
                all_users[u][k] += 1
            else:
                all_users[u][k] =1
        else:
            all_users[u] = {}
            all_users[u][k] = 1
            
    #convert dictionary to dataframe      
    all_users_df = None
    for u in all_users:
        aa = all_users[u]
        df = pd.DataFrame.from_dict(aa, orient='index')
        df['user_id_str'] =  u
        df['created_date'] = df.index
        df['count'] = df.iloc[:,0]
        df = df.loc[:,["user_id_str","created_date","count"]]
        all_users_df = pd.concat([all_users_df,df])           
        
    #output as csv file
    if verbose:
        print ("dataframe shape:", all_users_df.shape,
               "number of users:" + str(len(all_users_df.loc[:,'user_id_str'].unique())), 
               "number of tweets:" + str(sum(all_users_df.loc[:,'count'])))    
    return all_users_df    
def dailyCount_helper(files, i):
    start_time = datetime.now()
    f  = files[i]
    input_file = join("../../data/friends_info/edgelist_Feb27/timelines_csv_simplified/", f)
    tws = pd.read_csv(input_file, dtype =str, index_col = None) # no index
    #tws = tws.iloc[:1000]
    result = dailyCount(tws)
    result.to_csv("../../data/friends_info/edgelist_Feb27/timelines_csv_dailycounts/"+f, index = False, header=False) 
    #without header, index, easier to combine.
    print(files[i]+" is done.  time lapse: "+str(datetime.now() - start_time))
    
'''
if __name__ == '__main__':
    i = sys.argv[1]
    files = listdir("../../data/friends_info/edgelist_Feb27/timelines_csv_simplified/")
    files.sort()    
    dailyCount_helper(files, i)   
'''

'\nif __name__ == \'__main__\':\n    i = sys.argv[1]\n    files = listdir("../../data/friends_info/edgelist_Feb27/timelines_csv_simplified/")\n    files.sort()    \n    dailyCount_helper(files, i)   \n'

In [2]:
from joblib import Parallel, delayed
import multiprocessing
# what are your inputs, and what operation do you want to
# perform on each input. For example...

files = listdir("../../data/friends_info/edgelist_Feb27/timelines_csv_simplified/")
files.sort()
num_cores = multiprocessing.cpu_count()
results = Parallel(n_jobs=3) (delayed(dailyCount_helper)(files, i) for i in range(len(files)))

dataframe shape: (1148811, 3) number of users:3476 number of tweets:6598916
t01_part3.csv is done.  time lapse: 0:14:23.670432
dataframe shape: (1738252, 3) number of users:5088 number of tweets:9648925
t01_part4.csv is done.  time lapse: 0:24:57.855429
dataframe shape: (2836409, 3) number of users:6750 number of tweets:15250554
t01_part2.csv is done.  time lapse: 0:42:20.720249
dataframe shape: (3812705, 3) number of users:9123 number of tweets:21913328
t01_part1.csv is done.  time lapse: 1:03:17.325535
dataframe shape: (2300468, 3) number of users:5771 number of tweets:12087287
t02_part2.csv is done.  time lapse: 0:30:09.380160
dataframe shape: (1300582, 3) number of users:3612 number of tweets:6867563
t02_part3.csv is done.  time lapse: 0:14:18.819836
dataframe shape: (1872823, 3) number of users:5037 number of tweets:10112052
t02_part4.csv is done.  time lapse: 0:26:58.150911
dataframe shape: (4147442, 3) number of users:10093 number of tweets:21866483
t02_part1.csv is done.  time 