In [9]:
import pandas as pd
import numpy as np
import re
import os

In [10]:
# Input location. `path` deliberately ends with a separator because the rest
# of the notebook builds file names with plain `path + name` concatenation.
sep = os.path.sep
path = os.path.join('Datasets', 'Twitter', '')
filename = 'twitter-larger.tsv'

In [11]:
def load_data():
    """Read the raw tweet table located at ``path + filename``.

    The file is parsed with ``pd.read_table`` without a header row, and the
    columns are then labelled ``timestamp``, ``user`` and ``tweet`` (in that
    order).

    Returns:
        pandas.DataFrame: the loaded table with the three labelled columns.

    Raises:
        ValueError: if the file does not contain exactly three columns
            (raised by the column assignment).
    """
    tweets = pd.read_table(path + filename, header=None)
    tweets.columns = ['timestamp', 'user', 'tweet']
    return tweets

In [12]:
def create_user_hashtag_df(df):
    """Explode tweets into one row per (user, hashtag, timestamp).

    A hashtag is any whitespace-delimited token starting with ``#``. Trailing
    non-word characters are stripped, the leading ``#`` is removed and the
    tag is lower-cased. Tokens that reduce to nothing (e.g. a bare ``#``) are
    ignored.

    Each distinct normalised tag is emitted once per tweet: de-duplication is
    done *after* lower-casing, so ``#Foo #foo`` yields a single ``foo`` row.
    Tags of one tweet are emitted in sorted order so the output is
    reproducible between runs.

    Args:
        df: DataFrame with ``user``, ``tweet`` and ``timestamp`` columns.
            Non-string tweets (e.g. NaN for a missing value) are skipped.

    Returns:
        pandas.DataFrame: columns ``user``, ``hashtag``, ``timestamp``;
        empty (but with those columns) when no hashtag is found.
    """
    columns = ['user', 'hashtag', 'timestamp']

    def extract_hash_tags(tweet):
        """Return the sorted, de-duplicated, normalised hashtags of a tweet."""
        # Missing tweets arrive as NaN (a float), which has no .split().
        if not isinstance(tweet, str):
            return []
        tags = set()
        for token in tweet.split():
            if not token.startswith("#"):
                continue
            # Strip trailing punctuation first, then drop the leading '#'.
            tag = re.sub(r"(\W+)$", "", token)[1:].lower()
            if tag:
                tags.add(tag)
        return sorted(tags)

    users_hashtag = [
        (user, hashtag, timestamp)
        for user, tweet, timestamp in zip(df['user'], df['tweet'], df['timestamp'])
        for hashtag in extract_hash_tags(tweet)
    ]

    return pd.DataFrame(users_hashtag, columns=columns)

In [13]:
def create_raw_source_target_df(df):
    """Build user-user edges from shared hashtag usage.

    For every hashtag (in order of first appearance), the first row using
    that hashtag is paired with each later row using it. Each pair becomes
    one edge whose endpoints are the two users in sorted order and whose
    timestamp is the later of the two rows' timestamps.

    NOTE(review): only the *first* row of each hashtag is paired with the
    others; later rows are never paired among themselves. The previous
    nested-loop implementation looked as if all pairs were intended, but it
    consumed its working copy while handling the first row. The emitted
    edges are kept exactly as before here -- confirm the intended semantics
    before changing them. Edges where source == target are also possible
    when the same user appears more than once for a hashtag.

    Args:
        df: DataFrame with ``user``, ``hashtag`` and ``timestamp`` columns.

    Returns:
        pandas.DataFrame: columns ``source``, ``target``, ``hashtag``,
        ``timestamp``; empty (with those columns) when no pair exists.
    """
    columns = ['source', 'target', 'hashtag', 'timestamp']
    users_hashtag = []

    # One pass over the frame; sort=False keeps first-appearance group order
    # and rows inside a group keep their original order.
    for hashtag, group in df.groupby('hashtag', sort=False):
        users = group['user'].tolist()
        timestamps = group['timestamp'].tolist()
        first_user, first_ts = users[0], timestamps[0]
        for other_user, other_ts in zip(users[1:], timestamps[1:]):
            source, target = sorted((first_user, other_user))
            users_hashtag.append((source, target, hashtag, max(first_ts, other_ts)))

    return pd.DataFrame(users_hashtag, columns=columns)

In [14]:
def create_clean_graph(df, output_dir=None, basename=None):
    """De-duplicate edges, keep the earliest of each, and write them to CSV.

    Rows are sorted by ``timestamp`` with a stable sort, then only the first
    row for every (source, target) pair is kept. Two files are written:

    * ``graph_with_hashtag_<basename>.csv`` -- source, target, hashtag,
      timestamp
    * ``graph_<basename>.csv`` -- source, target, timestamp

    Duplicates are detected on the two columns themselves rather than on a
    concatenated string, so pairs such as ('ab', 'c') and ('a', 'bc') are no
    longer mistaken for the same edge.

    Args:
        df: DataFrame with ``source``, ``target``, ``hashtag`` and
            ``timestamp`` columns.
        output_dir: directory to write into; defaults to the module-level
            ``path``.
        basename: stem used in the output file names; defaults to the
            module-level ``filename`` without its extension.

    Returns:
        pandas.DataFrame: the de-duplicated edges without the ``hashtag``
        column.
    """
    if output_dir is None:
        output_dir = path
    if basename is None:
        # splitext handles extensions of any length, unlike a fixed [:-4].
        basename = os.path.splitext(filename)[0]

    # Stable sort so that, among equal timestamps, the row that came first in
    # the input is the one that survives de-duplication.
    edges = df.sort_values(by='timestamp', kind='mergesort').reset_index(drop=True)
    edges = edges.drop_duplicates(subset=['source', 'target'], keep='first')
    edges = edges[['source', 'target', 'hashtag', 'timestamp']]

    edges.to_csv(os.path.join(output_dir, 'graph_with_hashtag_' + basename + '.csv'), index=False)
    edges = edges.drop(columns=['hashtag'])
    edges.to_csv(os.path.join(output_dir, 'graph_' + basename + '.csv'), index=False)

    return edges

In [15]:
def run():
    """Run the pipeline end to end: load tweets, extract hashtags, build the
    raw edge list, then de-duplicate it and write the graph CSV files.

    Returns nothing; the outputs are the files written by
    ``create_clean_graph``.
    """
    tweets = load_data()
    hashtag_usage = create_user_hashtag_df(tweets)
    raw_edges = create_raw_source_target_df(hashtag_usage)
    create_clean_graph(raw_edges)

In [16]:
# Execute the whole pipeline; create_clean_graph writes the two graph CSV files under `path`.
run()