In [None]:
#### Network Analysis of TG channels ####


import os
from datetime import datetime

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns


# Read every CSV file in the working directory whose name starts with
# 'messages' and stack them into one DataFrame.
# - sorted(): os.listdir() returns entries in arbitrary order, so sorting keeps
#   the row order of `df` reproducible between runs and machines.
# - the '.csv' check keeps pd.read_csv away from other entries that merely
#   share the prefix (folders, notebooks, ...).
files = sorted(
    f for f in os.listdir()
    if f.startswith('messages') and f.lower().endswith('.csv')
)
if not files:
    # pd.concat([]) would otherwise fail with the unhelpful
    # "No objects to concatenate".
    raise FileNotFoundError(
        f"No 'messages*.csv' files found in {os.getcwd()!r}"
    )
# ignore_index=True gives the combined frame one continuous 0..n-1 index
# instead of repeating each file's own row labels.
df = pd.concat([pd.read_csv(f) for f in files], ignore_index=True)


In [None]:
# Summary statistics for the 'Forwarder' column.
forwarder_col = df['Forwarder']
forwarder_counts = forwarder_col.value_counts()

# How many distinct forwarders there are
print("Number of unique forwarders:", forwarder_col.nunique())

# How often each distinct forwarder occurs
print("Number of times each forwarder occurs:")
display(forwarder_counts)

# Keep the same counts as a two-column frame: one row per forwarder
temp = pd.DataFrame({
    'Forwarder': forwarder_counts.index,
    'Count': forwarder_counts.values,
})


In [None]:

# Count the rows for every (Sender, Forwarder) combination and flatten the
# grouped result into a regular frame with a 'count' column.
pair_counts = df.groupby(['Sender', 'Forwarder'])['Forwarder'].count()
xy_gr = pair_counts.reset_index(name="count")


### Which sender/forwarder pairs communicate with each other the most
top_pairs = xy_gr.sort_values(by='count', ascending=False).head(5)
display(top_pairs)

# One string key per pair, built as "<Sender>|<Forwarder>"
xy_gr = xy_gr.assign(pairs=xy_gr['Sender'] + "|" + xy_gr['Forwarder'])


In [None]:

######## Gephi Data ########

# Edges DF
# rename() without inplace returns a new frame, so xy_gr keeps its original
# column names. (Assigning xy_gr to gephi_edges and renaming in place would
# rename xy_gr's columns too, because both names point at the same object.)
# NOTE(review): Gephi's spreadsheet import presumably only treats a column
# named "Weight" as the edge weight; 'count_weight' would then arrive as a
# plain attribute - confirm this is intended.
gephi_edges = xy_gr.rename(columns={'Sender': 'Source',
                                    'Forwarder': 'Target',
                                    'count': 'count_weight'})
display(gephi_edges.head(2))

# Create the folder "network_processing_gephi" to store the Gephi data.
# exist_ok=True makes re-running the cell safe without a separate
# "does it exist?" check.
gephi_dir = 'network_processing_gephi'
os.makedirs(gephi_dir, exist_ok=True)

gephi_edges.to_csv(os.path.join(gephi_dir, "edges_gephi.csv"), index=False)


In [None]:
# Nodes DF: one row per distinct name that occurs in the edge list.
# NOTE(review): this cell was originally headed "Long Aleph ID + Email", but
# only the 'Label' and 'ID' columns are built here - confirm whether further
# node attributes were intended.

# Collect every name that appears as a Source or as a Target.
# drop_duplicates() matters: a name that is both a sender and a forwarder must
# become ONE node. Without it the name receives two IDs, and joining the IDs
# back onto the edges would duplicate every edge that starts at that name.
node_labels = pd.concat(
    [gephi_edges['Source'], gephi_edges['Target']], ignore_index=True
).drop_duplicates()
nodes = pd.DataFrame({'Label': node_labels.values})
nodes['ID'] = nodes.index + 1  # 1-based node IDs

# Add the node IDs to the edges dataframe - for both endpoints of every edge.
# Looking the IDs up with map() never changes the number of edge rows, and
# assign() overwrites existing columns, so re-running this cell is safe.
id_by_label = nodes.set_index('Label')['ID']
gephi_edges = gephi_edges.assign(
    # 'Label' repeats 'Source'; it is kept so the frame still carries every
    # column it had when the IDs were attached with a merge on nodes['Label'].
    Label=gephi_edges['Source'],
    Source_ID=gephi_edges['Source'].map(id_by_label),
    Target_ID=gephi_edges['Target'].map(id_by_label),
)



In [None]:
### Text Analysis ###

# Number of messages per day by each sender.
# Reduce every timestamp to its calendar day so that all messages sent on the
# same day share one grouping key.
df['Date'] = pd.to_datetime(df['Date']).dt.date
date_gr = (
    df.groupby(['Date', 'Sender'])['Sender']
    .count()
    .reset_index(name="count")
)
display(date_gr.head(5))

# Visualize the number of messages per day, one line per sender.
# seaborn (sns) and matplotlib.pyplot (plt) come from the import block at the
# top of the notebook.
# An explicit Axes is used instead of the implicit pyplot "current figure" so
# that every call below clearly targets this one chart.
fig, ax = plt.subplots(figsize=(15, 6))
sns.lineplot(data=date_gr, x='Date', y='count', hue='Sender', ax=ax)
ax.set(title='Messages per day by sender',
       xlabel='Date',
       ylabel='Number of messages')
ax.tick_params(axis='x', labelrotation=45)  # slanted date labels
plt.show()


In [None]:
# Visualize the number of messages per day by each sender.
# This draws the same chart as the previous cell from the `date_gr` frame.
# seaborn (sns) and matplotlib.pyplot (plt) come from the import block at the
# top of the notebook, so they are not imported again here.

# TODO: Rewrite this for vega_datasets library
# NOTE(review): vega_datasets presumably only provides sample data; the
# plotting library meant here is likely Altair / Vega-Lite - confirm.
fig, ax = plt.subplots(figsize=(15, 6))
sns.lineplot(data=date_gr, x='Date', y='count', hue='Sender', ax=ax)
ax.set(title='Messages per day by sender',
       xlabel='Date',
       ylabel='Number of messages')
ax.tick_params(axis='x', labelrotation=45)  # slanted date labels
plt.show()
