In [1]:
# Import pandas to work with DataFrames
import pandas as pd
# Import matplotlib for plotting graphs
import matplotlib.pyplot as plt
import scipy
# Import json to change the data format
import json
# Import glob to select multiple files
import glob
# Import shutil to copy the content of a source file to a destination file
import shutil
# Import os to use operating system functions from Python
import os
# Import datetime to interact with dates as date objects
import datetime

In [3]:
# Work from the folder that holds the raw tweet files: the '*.txt' glob and
# the relative csv file names used later all resolve against this directory.
os.chdir('/scratch/s5724090/TweetData/TwitterGEDv2')

In [4]:
# Convert the json data to a DataFrame.
# Every .txt file in the working directory is read line by line, with one JSON
# object expected per line; lines that cannot be parsed are reported and skipped.
data_list = []

# glob returns matches in arbitrary order; sorting keeps the row order of the
# resulting DataFrame reproducible from one run to the next.
file_list = sorted(glob.glob('*.txt'))

for file in file_list:
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding (assumes the tweet dumps are UTF-8 JSON - TODO confirm).
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines carry no record; skip them rather than report an error
            if not line.strip():
                continue
            try:
                data = json.loads(line)
                data_list.append(data)
            except json.JSONDecodeError:
                print("Error decoding JSON on line:", line)

# Flatten nested objects into dotted column names (e.g. 'user.id_str')
groningen_complete = pd.json_normalize(data_list)

In [5]:
# Select the necessary columns for social network analysis and rename them
# to make them unambiguous.
# Maps each flattened source column to the name used from here on; the key
# order also fixes the column order of the selection.
sna_renames = {
    'retweeted_status.user.id_str': 'retweeted_user_id',
    'retweeted_status.user.screen_name': 'retweeted_user_handle',
    'retweeted_status.user.name': 'retweeted_user_display_name',
    'retweeted_status.user.description': 'retweeted_user_bio',
    'user.id_str': 'retweeter_id',
    'user.screen_name': 'retweeter_handle',
    'user.name': 'retweeter_display_name',
    'user.description': 'retweeter_bio',
}
# 'created_at' and 'text' are selected as well but keep their original names
sna_selection = list(sna_renames) + ['created_at', 'text']
sna_columns = groningen_complete[sna_selection].rename(columns=sna_renames)

In [8]:
# Reduce created_at to calendar dates, stored as timezone-naive datetimes at
# midnight. This gives the same values as formatting to '%Y-%m-%d' and parsing
# the strings again, without the string round-trip over every row.
sna_columns['created_at'] = (
    pd.to_datetime(sna_columns['created_at'])
    .dt.tz_localize(None)  # drop any timezone but keep the wall-clock time
    .dt.normalize()        # truncate the time of day to 00:00:00
)

In [9]:
# Make a function to make a new dataframe that is filtered on the time period you need based on the first year
def filtered_df(dataframe, first_year):
    """Return the rows of `dataframe` that fall in one yearly window.

    The window runs from 16 November of `first_year` up to and including
    15 November of the following year.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a datetime column named 'created_at'.
    first_year : int
        Calendar year in which the window starts.

    Returns
    -------
    pandas.DataFrame
        The rows of `dataframe` whose 'created_at' lies inside the window.
    """
    startdate = pd.to_datetime(f'{first_year}-11-16')
    # Exclusive upper bound (midnight after the last day) so that timestamps
    # carrying a time of day on 15 November are still inside the window.
    end_exclusive = pd.to_datetime(f'{first_year + 1}-11-16')
    created = dataframe['created_at']
    return dataframe.loc[(created >= startdate) & (created < end_exclusive)]

In [10]:
# One frame per yearly window, for starting years 2012 through 2021.
# The names are bound in the same order as the years are generated.
(
    df_12_13, df_13_14, df_14_15, df_15_16, df_16_17,
    df_17_18, df_18_19, df_19_20, df_20_21, df_21_22,
) = (filtered_df(sna_columns, start_year) for start_year in range(2012, 2022))

## Making a nodelist and edgelist to use in Gephi
#### A nodelist needs to contain a unique node ID and a label, and it may contain attributes.
#### The unique IDs will be the retweeted user's ID and the retweeter's ID. The display names will also be necessary in Gephi.
#### Here, I will make CSV files for the nodes and the edges of each yearly network, in a format that
#### Gephi can work with.

## Creating the edgelists

In [11]:
# Build the edgelist of a dataframe and save it to a csv file
def create_edgelist(dataframe, filename):
    """Write a Source/Target edgelist built from `dataframe` to a csv file.

    Each row with both ids present becomes one edge, with 'retweeter_id' as
    Source and 'retweeted_user_id' as Target. Rows missing either id are
    dropped.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns 'retweeter_id' and 'retweeted_user_id'.
    filename : str
        Name of the csv file. Every character that is not alphanumeric, a
        space, a dot or an underscore is replaced by '_' (this includes path
        separators, so the file is written to the current working directory).

    Returns
    -------
    None
        The return value of `DataFrame.to_csv` when given a file name.
    """
    filename = "".join([c if c.isalnum() or c in (' ', '.', '_') else "_" for c in filename])
    # Edgelist
    edgelist = (
        dataframe[['retweeter_id', 'retweeted_user_id']]
        .rename(columns={'retweeter_id': 'Source', 'retweeted_user_id': 'Target'})
        .dropna()
        # drop=True discards the old index whatever it is called; dropping a
        # column literally named 'index' fails when the index has a name.
        .reset_index(drop=True)
    )
    return edgelist.to_csv(filename, index=False)

In [12]:
# Write one edgelist csv per yearly frame (output file name -> frame)
edge_exports = {
    'gephi_12_13_edges.csv': df_12_13,
    'gephi_13_14_edges.csv': df_13_14,
    'gephi_14_15_edges.csv': df_14_15,
    'gephi_15_16_edges.csv': df_15_16,
    'gephi_16_17_edges.csv': df_16_17,
    'gephi_17_18_edges.csv': df_17_18,
    'gephi_18_19_edges.csv': df_18_19,
    'gephi_19_20_edges.csv': df_19_20,
    'gephi_20_21_edges.csv': df_20_21,
    'gephi_21_22_edges.csv': df_21_22,
}
for edges_csv, yearly_frame in edge_exports.items():
    create_edgelist(yearly_frame, edges_csv)

## Creating the nodelists

In [14]:
def create_nodelist(dataframe, filename):
    """Write a nodelist with one row per user id in `dataframe` to a csv file.

    Users are collected from both the retweeter columns and the retweeted-user
    columns. The output has the columns 'Id' and 'display name'.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns 'retweeter_id', 'retweeter_display_name',
        'retweeted_user_id' and 'retweeted_user_display_name'.
    filename : str
        Path of the csv file to write.

    Returns
    -------
    None
        The return value of `DataFrame.to_csv` when given a file name.
    """
    retweeters = dataframe[['retweeter_id', 'retweeter_display_name']].rename(
        columns={'retweeter_id': 'Id', 'retweeter_display_name': 'display name'})
    retweeted = dataframe[['retweeted_user_id', 'retweeted_user_display_name']].rename(
        columns={'retweeted_user_id': 'Id', 'retweeted_user_display_name': 'display name'})
    nodelist = (
        pd.concat([retweeters, retweeted])
        # Rows without a retweeted user have no id there; a node without an
        # Id is not a usable node, so leave those rows out.
        .dropna(subset=['Id'])
        # Node ids must be unique: when the same id occurs with several
        # display names, keep the first one encountered.
        .drop_duplicates(subset='Id')
        .reset_index(drop=True)
    )
    return nodelist.to_csv(filename, index=False)

In [15]:
# Write one nodelist csv per yearly frame (output file name -> frame)
node_exports = {
    'gephi_12_13_nodes.csv': df_12_13,
    'gephi_13_14_nodes.csv': df_13_14,
    'gephi_14_15_nodes.csv': df_14_15,
    'gephi_15_16_nodes.csv': df_15_16,
    'gephi_16_17_nodes.csv': df_16_17,
    'gephi_17_18_nodes.csv': df_17_18,
    'gephi_18_19_nodes.csv': df_18_19,
    'gephi_19_20_nodes.csv': df_19_20,
    'gephi_20_21_nodes.csv': df_20_21,
    'gephi_21_22_nodes.csv': df_21_22,
}
for nodes_csv, yearly_frame in node_exports.items():
    create_nodelist(yearly_frame, nodes_csv)

## Non-longitudinal network
### To appreciate the added value of a longitudinal analysis, as opposed to analyzing a single network that represents 10 years, it is interesting to see what the network would have looked like as one network instead of 10. Thus, we create a nodelist and an edgelist for the dataframe as it was before it was split on the basis of time.

In [16]:
# Edgelist for the complete, unsplit dataframe
create_edgelist(dataframe=sna_columns, filename='gephi_non_split_edges.csv')

In [17]:
# Nodelist for the complete, unsplit dataframe
create_nodelist(dataframe=sna_columns, filename='gephi_non_split_nodes.csv')