In [None]:
import os
import pandas as pd
# Read the tab-delimited file into a DataFrame and preview its first 100 rows.
df = pd.read_csv('/content/Trial.txt', delimiter='\t')
df.head(100)

In [None]:
# Remove the 'id' column. Rebinding df (instead of dropping in place) together
# with errors='ignore' keeps this cell safe to re-run: a second execution does
# not raise KeyError when 'id' is already gone.
df = df.drop(columns=['id'], errors='ignore')
df.head()

In [None]:
# Count of unique authors across the full dataset.
# Uses df because new_df is only created in a later cell; referencing it here
# raises NameError when the notebook is run top to bottom on a fresh kernel.
unique_authors_count = df['author'].nunique()
print(f"Total count of unique authors: {unique_authors_count}")

In [None]:
# Per-author row counts, restricted to authors with at least 1000 rows.
author_occurrences = df['author'].value_counts().to_dict()

filtered_author_occurrences = {}
for name, n_rows in author_occurrences.items():
    if n_rows >= 1000:
        filtered_author_occurrences[name] = n_rows

# Rank the remaining authors from most to fewest rows.
ranked = sorted(filtered_author_occurrences.items(), key=lambda pair: pair[1], reverse=True)
sorted_author_occurrences = dict(ranked)
print(len(sorted_author_occurrences))

# Report each retained author with its row count.
for name, n_rows in sorted_author_occurrences.items():
    print(f"{name}: {n_rows}")


In [None]:
authors_to_keep = set(sorted_author_occurrences.keys())

# Create a new DataFrame containing only rows whose author is in 'authors_to_keep'.
# .copy() makes new_df an independent frame, so the column assignments made in
# later cells do not raise SettingWithCopyWarning.
new_df = df[df['author'].isin(authors_to_keep)].copy()
new_df

In [None]:
# Fragments to strip from the address columns, applied in list order.
# NOTE(review): the lone-backslash entry runs before 'O\\', 'o\\' and '\\\\',
# so by the time those are tried no backslash is left for them to match —
# confirm whether that ordering is intended.
char_replace = ['<"', '">', '\\', 'O\\', 'o\\', '\\\\','"']

for column in ('author', 'recipient', 'copied'):
    # Cast to str, then remove every fragment as a literal (non-regex) match.
    cleaned = new_df[column].astype(str)
    for fragment in char_replace:
        cleaned = cleaned.str.replace(fragment, '', regex=False)
    new_df[column] = cleaned

In [None]:
def remove_duplicates(text):
    """Lower-case the whitespace-separated tokens of ``text`` and drop repeats.

    Tokens are compared after lower-casing, the first occurrence of each token
    is kept, and the survivors are re-joined with single spaces.

    Args:
        text: String to normalise.

    Returns:
        The lower-cased, de-duplicated tokens joined by a single space
        (an empty string when ``text`` contains no tokens).
    """
    # dict.fromkeys preserves first-seen order with O(1) membership checks,
    # avoiding a list scan per token (quadratic in the number of tokens).
    unique_parts = dict.fromkeys(part.lower() for part in text.split())
    return ' '.join(unique_parts)
# Normalise every address column with remove_duplicates.
for column in ('author', 'recipient', 'copied'):
    new_df[column] = new_df[column].apply(remove_duplicates)


In [None]:
# Function to convert each row into individual edges
def expand_row(row, edge_type):
    """Split one row's comma-separated contact field into edge tuples.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with an 'author' entry
            and an ``edge_type`` entry.
        edge_type: Name of the field to expand, e.g. 'recipient' or 'copied';
            it is also stored as the third element of every edge.

    Returns:
        A list of ``(author, contact, edge_type)`` tuples with surrounding
        whitespace stripped. The list is empty when the field is missing
        (NaN/None). Blank contacts, as produced by an empty field or by
        doubled/trailing commas, are skipped.
    """
    if pd.isna(row[edge_type]):
        return []
    author = row['author'].strip()  # identical for every edge of this row
    contacts = (contact.strip() for contact in row[edge_type].split(','))
    # Drop empty names so no edge ever points at a '' node.
    return [(author, contact, edge_type) for contact in contacts if contact]

# Create edges based on 'recipient' and 'copied' fields.
# Built with a comprehension rather than DataFrame.apply: apply was used purely
# for its side effect and left a Series of None as the cell's output (some older
# pandas releases could also call the function twice on the first row).
# Order is unchanged: every 'recipient' edge first, then every 'copied' edge.
edges = [
    edge
    for edge_type in ('recipient', 'copied')
    for _, row in new_df.iterrows()
    for edge in expand_row(row, edge_type)
]


In [None]:
# Report how many (author, contact, edge_type) tuples were collected in edges.
print(len(edges))

In [None]:
import networkx as nx
import plotly.graph_objects as go
import matplotlib.pyplot as plt

# Directed multigraph: every edge tuple becomes its own edge, so repeated
# author -> contact pairs are stored as parallel edges and add to node degree.
G = nx.MultiDiGraph()

for author, contact, edge_type in edges:
    # Skip the literal string 'nan' (presumably missing values that were
    # stringified by the earlier astype(str) step — TODO confirm).
    if author.lower() != 'nan' and contact.lower() != 'nan':
        G.add_edge(author, contact, relation=edge_type, weight=1)

# graph that only includes nodes with degree > 1
filtered_nodes = [node for node in G.nodes if G.degree(node) > 1]
H = G.subgraph(filtered_nodes)

# Fixed seed so the force-directed layout (and therefore the figure) is the
# same on every run instead of changing with each execution.
pos = nx.spring_layout(H, seed=42)

# Edge segments as flat coordinate lists; None separates consecutive segments
# so a single Scatter trace can draw all of them.
edge_x, edge_y = [], []
for edge in H.edges():
    x0, y0 = pos[edge[0]]
    x1, y1 = pos[edge[1]]
    edge_x.extend([x0, x1, None])
    edge_y.extend([y0, y1, None])

# Degree within the filtered subgraph, computed once and reused for both the
# hover text and the marker colour.
degrees = dict(H.degree())

node_x, node_y, node_info = [], [], []
for node in H.nodes():
    x, y = pos[node]
    node_x.append(x)
    node_y.append(y)
    info = f"{node}<br># of connections: {degrees[node]}"
    node_info.append(info)

node_trace = go.Scatter(x=node_x, y=node_y, mode='markers', hoverinfo='text',
                        text=node_info, marker=dict(showscale=True, colorscale='YlGnBu', size=10,
                                                    color=[degrees[n] for n in H.nodes()], line_width=2))

fig = go.Figure(data=[go.Scatter(x=edge_x, y=edge_y, mode='lines', line=dict(color='grey', width=0.3)),
                      node_trace],
                layout=go.Layout(showlegend=False, hovermode='closest', xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                                 yaxis=dict(showgrid=False, zeroline=False, showticklabels=False), plot_bgcolor='white'))

fig.show()