In [1]:
import pandas as pd
import numpy as np

In [2]:
def get_list(file):
    """
    get the twitter users and their tags from a csv file
    :param file: a csv file (handed as-is to pd.read_csv) which has a
                 "Source" column (twitter users, ie "@aaa") and a "Tags" column
    :rtype: a tuple of two lists (name_list, tags),
            ie (["@aaa","@bbb","@ccc"], ["tag1","tag2","tag3"])
    :raises SystemExit: with status 1 when the csv can't be read or one of
                        the two columns is missing
    """
    try:
        # try to read csv file
        df = pd.read_csv(file)
        name_list = df["Source"].tolist()
        tags = df["Tags"].tolist()
    except Exception as err:
        print("Input file is not csv or doesn't exist such csv")
        # the message above is generic, so also show what actually failed
        # (missing file, parse error, missing column, ...)
        print("Reason: {!r}".format(err))
        # `exit` is an interactive helper added by `site` (and replaced by
        # IPython inside notebooks); raising SystemExit is the reliable way
        # to stop with a status code
        raise SystemExit(1)
    return (name_list, tags)

In [3]:
def pre_processor(file):
    """
    reshape and combine all csvs which are generated by the twitter crawler,
    the combination of csvs is stored as one file named final.csv

    For every user of the source list the file "csv/<user>.csv" is read and
    reshaped; a user whose csv can't be read or reshaped is skipped and the
    reason is printed.
    :param file: a csv file holding the source list (read with get_list)
    :rtype: None
    """
    # column layout of the final output
    header_list = ['Hit Record Unique ID',
                   "URL to article/Tweet",
                   "Source",
                   "Location",
                   "Hit Type",
                   "Passed through tags",
                   "Associated Publisher",
                   "Referring Hit Record Unique ID",
                   "Authors",
                   "Plain Text of Article or Tweet",
                   "Date",
                   "Mentions",
                   "Hashtags",
                   "Found URL"]

    # get the source list and tags list from the file given by the caller
    (list_name, tags) = get_list(file)

    # the empty frame comes first so the output always has the header
    # columns in header_list order, even when every user is skipped
    frames = [pd.DataFrame(columns=header_list)]
    # reshape each twitter user's tweets and collect the reshaped frames
    for source, tag in zip(list_name, tags):
        try:
            txtname = source.split('@')[1]
            print(txtname)
            tweet = pd.read_csv("csv/" + txtname + ".csv", low_memory=False)
            print(len(tweet))
            frames.append(_reshape_tweets(tweet, source, tag))
        except Exception as err:
            # best effort: one bad user must not stop the others, but the
            # skip has to be visible instead of silently dropped
            print("Skipped {}: {!r}".format(source, err))

    # combine once at the end; DataFrame.append no longer exists in
    # pandas 2.0 and growing a frame inside the loop is quadratic
    combined = pd.concat(frames, sort=False)
    # store dataframe as csv
    combined.to_csv('final.csv', index=False, encoding='utf-8-sig')


def _reshape_tweets(tweet, source, tag):
    """
    map one user's crawler csv onto the final column layout
    :param tweet: dataframe read from "csv/<user>.csv"
    :param source: the twitter user as written in the source list
    :param tag: the tag of that user from the source list
    :rtype: a dataframe with one row per row of tweet
    """
    return pd.DataFrame({'Hit Record Unique ID': tweet["id"],
                         "URL to article/Tweet": tweet["link"],
                         "Source": source,
                         "Location": tweet["place"],
                         "Hit Type": "Twitter Handle",
                         "Passed through tags": tag,
                         "Associated Publisher": np.nan,
                         "Referring Hit Record Unique ID": np.nan,
                         "Authors": tweet["name"],
                         "Plain Text of Article or Tweet": tweet["tweet"],
                         "Date": tweet["date"],
                         "Mentions": tweet["mentions"],
                         "Hashtags": tweet["hashtags"],
                         "Found URL": tweet["urls"]})
      


In [4]:
if __name__ == '__main__':
    # Column layout of the combined output. Unlike the reshaped per-user
    # frames built further down, this layout also carries a "Name" column.
    header_list = [
        "Hit Record Unique ID",
        "URL to article/Tweet",
        "Source",
        "Location",
        "Name",
        "Hit Type",
        "Passed through tags",
        "Associated Publisher",
        "Referring Hit Record Unique ID",
        "Authors",
        "Plain Text of Article or Tweet",
        "Date",
        "Mentions",
        "Hashtags",
        "Found URL",
    ]

    # Header-only frame; the per-user tweets are added to it by a later cell.
    Final = pd.DataFrame(columns=header_list)


In [5]:
# Twitter users and their tags, one entry per row of the source list.
list_name, tags = get_list("twitter.csv")

# Python type per column name.
# NOTE(review): presumably intended as the `dtype=` mapping for the per-user
# crawler csvs, but no visible cell passes it to pd.read_csv - confirm.
dt_set = dict(
    id=int,
    conversation_id=int,
    created_at=object,
    date=object,
    time=object,
    timezone=int,
    user_id=int,
    username=object,
    name=object,
    place=float,
    tweet=object,
    language=object,
    mentions=object,
    urls=object,
    photos=object,
    replies_count=int,
    retweets_count=int,
    likes_count=int,
    hashtags=object,
    cashtags=object,
    link=object,
    retweet=float,
    quote_url=float,
    video=int,
    thumbnail=float,
    near=float,
    geo=float,
    source=float,
    user_rt_id=float,
    user_rt=float,
    retweet_id=float,
    reply_to=object,
    retweet_date=float,
    translate=float,
    trans_src=float,
    trans_dest=float,
)

In [8]:
# Reshape every user's crawler csv ("csv/<user>.csv") into the final column
# layout. The reshaped frames are collected first and combined once after the
# loop: DataFrame.append no longer exists in pandas 2.0, and growing a frame
# inside the loop is quadratic.
frames = []
for source, tag in zip(list_name, tags):
    try:
        txtname = source.split('@')[1]
        print(txtname)
        tweet = pd.read_csv("csv/" + txtname + ".csv", low_memory=False)
        print(len(tweet))
        retweet = pd.DataFrame({'Hit Record Unique ID': tweet["id"].tolist(),
                                "URL to article/Tweet": tweet["link"].tolist(),
                                "Source": source,
                                "Location": tweet["place"].tolist(),
                                "Hit Type": "Twitter Handle",
                                "Passed through tags": tag,
                                "Associated Publisher": np.nan,
                                "Referring Hit Record Unique ID": np.nan,
                                "Authors": tweet["name"].tolist(),
                                "Plain Text of Article or Tweet": tweet["tweet"].tolist(),
                                "Date": tweet["date"].tolist(),
                                "Mentions": tweet["mentions"].tolist(),
                                "Hashtags": tweet["hashtags"].tolist(),
                                "Found URL": tweet["urls"].tolist()})
        frames.append(retweet)
    except Exception as err:
        # best effort: one unreadable or malformed csv must not stop the
        # others, but the skip has to be visible instead of silently dropped
        print("Skipped {}: {!r}".format(source, err))

# Final.iloc[0:0] keeps Final's columns (and their order) but none of its
# rows, so running this cell again rebuilds Final instead of adding every
# tweet a second time.
Final = pd.concat([Final.iloc[0:0]] + frames, sort=False)
        


Claudia_Kealoha
elderofziyon
68265
WarpedMirrorPMB
35845
GippersChutzpah
Jonathan_Elk
2215
3moYahya
1410
Amani90__
13562
RedaMansour
7234
z7654321
28648
Amani90__
13562
littlebytesnews
1012798
wisdomforwomen
58541
RachelSteinmetz
22310
IsraelNewsLinks
890272
BobSeaPort
10081
Pucemargine
10951
genevaaccord
4792
Spokoiny
2626
HananyaNaftali
36718
adambasciano
4570
AvivEzra
1543
DaniellaNLevy
958
a_zionist
66852
IsraelandStufff
14870


In [9]:
# Write the combined table without the index column; the utf-8-sig codec
# puts a byte order mark at the start of the file.
Final.to_csv(
    path_or_buf='final.csv',
    encoding='utf-8-sig',
    index=False,
)
# Bare last expression so the notebook renders the frame.
Final

Unnamed: 0,Hit Record Unique ID,URL to article/Tweet,Source,Location,Name,Hit Type,Passed through tags,Associated Publisher,Referring Hit Record Unique ID,Authors,Plain Text of Article or Tweet,Date,Mentions,Hashtags,Found URL
0,1320749411086073857,https://twitter.com/elderofziyon/status/132074...,@elderofziyon,,,Twitter Handle,Twitter Journalists,,,Elder of Ziyon 🇮🇱,Peter Beinart has literally no idea what he is...,2020-10-26,[],[],[]
1,1320742324687785985,https://twitter.com/elderofziyon/status/132074...,@elderofziyon,,,Twitter Handle,Twitter Journalists,,,Elder of Ziyon 🇮🇱,NYPD bias crimes word cloud shows JEWISH as by...,2020-10-26,[],[],[]
2,1320733975896346627,https://twitter.com/elderofziyon/status/132073...,@elderofziyon,,,Twitter Handle,Twitter Journalists,,,Elder of Ziyon 🇮🇱,"@ShreeParadkar You call these academics ""moder...",2020-10-26,"['shreeparadkar', 'torontostar']",[],[]
3,1320731645381615620,https://twitter.com/elderofziyon/status/132073...,@elderofziyon,,,Twitter Handle,Twitter Journalists,,,Elder of Ziyon 🇮🇱,New York Police Department word cloud of bias ...,2020-10-26,[],[],[]
4,1320731313272475649,https://twitter.com/elderofziyon/status/132073...,@elderofziyon,,,Twitter Handle,Twitter Journalists,,,Elder of Ziyon 🇮🇱,"@Vandalay_Inc Works for me, weird.",2020-10-26,['vandalay_inc'],[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14865,509440986770964481,https://twitter.com/IsraelandStufff/status/509...,@IsraelandStufff,,,Twitter Handle,Twitter Journalists,,,✡Israel and Stuff✡,@JIDFfan @israel_shield He's a coward....,2014-09-09,"['jidffan', 'israel_shield']",[],[]
14866,509439738617401344,https://twitter.com/IsraelandStufff/status/509...,@IsraelandStufff,,,Twitter Handle,Twitter Journalists,,,✡Israel and Stuff✡,@Rashir @PPasmanick Kick him around all you w...,2014-09-09,"['rashir', 'ppasmanick']",[],[]
14867,509430457134759936,https://twitter.com/IsraelandStufff/status/509...,@IsraelandStufff,,,Twitter Handle,Twitter Journalists,,,✡Israel and Stuff✡,"A CNN Reporter, BBC Reporter, and an Israeli c...",2014-09-09,[],[],['https://www.facebook.com/ISRAELandStuff/phot...
14868,509429305479532544,https://twitter.com/IsraelandStufff/status/509...,@IsraelandStufff,,,Twitter Handle,Twitter Journalists,,,✡Israel and Stuff✡,#Jerusalem rejects New Zealand’s new envoy to ...,2014-09-09,[],"['jerusalem', 'israel']",['http://fb.me/7aRwJjS9Z']
