In [None]:
import random
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

In [None]:
# Load the edge list. The CSV has no header row, so the two column names
# ('Follower', 'Target') are supplied explicitly.
twitter_df = pd.read_csv('twitter_graph.csv', header=None, names=['Follower', 'Target'])

# A notebook cell only renders its *last* expression automatically, so the
# preview has to be displayed explicitly or it is silently discarded.
# `display` is injected into the namespace by the IPython kernel.
display(twitter_df.head(10))

twitter_df.describe()

In [None]:
#top 50 users with the highest number of followings
# Count how often each user appears in the 'Follower' column and keep the 50
# largest counts (value_counts() returns counts in descending order).
following_counts = twitter_df['Follower'].value_counts().iloc[:50]

# Build the frame directly instead of going through reset_index(): the column
# names reset_index() yields for a value_counts() result changed in pandas 2.0,
# so looking up an 'index' column raises KeyError on current pandas.
# The user ids are converted to strings so they act as labels on the x axis.
F_vc = pd.DataFrame(
    {'Follower': following_counts.to_numpy()},
    index=pd.Index(following_counts.index.astype(str), name='index'),
)

_, axes = plt.subplots(1, 1, figsize=(6, 4))

F_vc.plot(ax=axes, color='orangered')

axes.set(xlabel="User id", ylabel="Number of followings")
axes.set_title('Top 50 users with highest number of followings')
# Only one series is drawn, so the legend carries no information.
axes.get_legend().remove()

plt.tight_layout()

In [None]:
# Count how often each user appears in the 'Target' column and keep the 50
# largest counts (value_counts() returns counts in descending order).
follower_counts = twitter_df['Target'].value_counts().iloc[:50]

# Build the frame directly instead of going through reset_index(): the column
# names reset_index() yields for a value_counts() result changed in pandas 2.0,
# so looking up an 'index' column raises KeyError on current pandas.
# The user ids are converted to strings so they act as labels when plotted.
F_vc = pd.DataFrame(
    {'Target': follower_counts.to_numpy()},
    index=pd.Index(follower_counts.index.astype(str), name='index'),
)

In [None]:
#top 50 users with highest number of followers
# Draw the counts held in F_vc on a single 6x4 inch axes.
_, axes = plt.subplots(figsize=(6, 4))
F_vc.plot(color='orangered', ax=axes)

# Title and axis labels so the figure can be read on its own.
axes.set_title('Top 50 users with highest number of followers')
axes.set_xlabel("User id")
axes.set_ylabel("Number of Followers")

# Only one series is drawn, so the legend is removed.
axes.get_legend().remove()

plt.tight_layout()

In [None]:
# Directed graph whose edges run from the 'Follower' column to the 'Target'
# column of twitter_df.
G = nx.from_pandas_edgelist(
    twitter_df,
    source='Follower',
    target='Target',
    create_using=nx.DiGraph(),
)

def plot_degree_dist(G):
    """Show a histogram of the degree of every node in ``G``.

    The degree of each node is read with ``G.degree(node)`` and the values are
    passed to ``plt.hist`` with its default binning; the figure is then shown.
    """
    node_degrees = []
    for node in G.nodes():
        node_degrees.append(G.degree(node))

    plt.hist(node_degrees)
    plt.show()


plot_degree_dist(G)

#a function to create a list of the frequency of each degree value.
def degree_histogram_directed(G, in_degree=False, out_degree=False):
    """Return the degree histogram of ``G`` as a list of counts.

    Element ``d`` of the returned list is the number of nodes whose degree
    equals ``d``.

    Parameters
    ----------
    G : graph-like
        Object exposing ``nodes()``, ``degree()``, ``in_degree()`` and
        ``out_degree()``; the degree accessors must yield ``(node, degree)``
        pairs.
    in_degree : bool, default False
        Count in-degrees. Takes precedence over ``out_degree`` when both are
        true.
    out_degree : bool, default False
        Count out-degrees.
        When neither flag is set, the values from ``G.degree()`` are used.

    Returns
    -------
    list of int
        Histogram of length ``max degree + 1``; an empty list when ``G`` has
        no nodes.
    """
    # The lookup table gets its own name so the boolean flags are not rebound
    # to dictionaries half-way through the function.
    if in_degree:
        degree_by_node = dict(G.in_degree())
        degseq = [degree_by_node.get(node, 0) for node in G.nodes()]
    elif out_degree:
        degree_by_node = dict(G.out_degree())
        degseq = [degree_by_node.get(node, 0) for node in G.nodes()]
    else:
        degseq = [degree for _, degree in G.degree()]

    # max() raises ValueError on an empty sequence, so an empty graph is
    # answered with an empty histogram instead.
    if not degseq:
        return []

    freq = [0] * (max(degseq) + 1)
    for degree in degseq:
        freq[degree] += 1
    return freq

In [None]:
#plot in degree distribution
# in_degree_freq[d] is the number of nodes with in-degree d, so the x values
# are simply the list positions.
in_degree_freq = degree_histogram_directed(G, in_degree=True)
degrees = range(len(in_degree_freq))

plt.figure(figsize=(6, 4))
plt.loglog(
    degrees,
    in_degree_freq,
    'o',
    label='in-degree',
    color='orangered',
    markersize=2,
)
plt.xlabel('Degree')
plt.ylabel('Frequency')

In [None]:
#plot out degree distribution
# out_degree_freq[d] is the number of nodes with out-degree d, so the x values
# are simply the list positions.
out_degree_freq = degree_histogram_directed(G, out_degree=True)
# The x range is derived from the out-degree histogram itself; it was
# previously built from in_degree_freq, which belongs to a different cell and
# generally has a different length.
degrees = range(len(out_degree_freq))

plt.figure(figsize=(6, 4))
plt.loglog(degrees, out_degree_freq, 'o', label='out-degree', color='orangered', markersize=2)
plt.xlabel('Degree')
plt.ylabel('Frequency')

In [None]:
#save the graph so we don't have to re-run the code
nx.write_gexf(G, "test.gexf")

#graph statistics
print("Number of nodes:" , nx.number_of_nodes(G))
print("Number of edges:" , nx.number_of_edges(G))
print("Density" , nx.density(G))
print("Degree assortativity:", nx.degree_assortativity_coefficient(G))
print("Average Clustering Coefficient", nx.average_clustering(G))

# The component functions return generators; printing them directly only shows
# "<generator object ...>". Materialise them once, largest first, and report
# the component sizes rather than dumping every node set.
sccs = sorted(nx.strongly_connected_components(G), key=len, reverse=True)
wccs = sorted(nx.weakly_connected_components(G), key=len, reverse=True)
print("Number of SCC", len(sccs))
print("Strongly connected components (sizes of the 10 largest)", [len(c) for c in sccs[:10]])
print("Number of WCC", len(wccs))
print("Weakly connected components (sizes of the 10 largest)", [len(c) for c in wccs[:10]])

# Path-length statistics are undefined when some node cannot reach another:
# networkx raises NetworkXError for a digraph that is not (strongly) connected.
# They are therefore computed on the largest strongly connected component,
# which is the whole graph whenever G itself is strongly connected.
largest_scc = G.subgraph(sccs[0])
print("average shortest path (largest SCC):", nx.average_shortest_path_length(largest_scc))
print("Diameter (largest SCC)", nx.diameter(largest_scc))

In [None]:
# pandas (pd) and networkx (nx) are already imported in the notebook's first
# cell, so they are not imported again here.

# read in your dataset
df = pd.read_csv('twitter500.csv', header=None, names=['source', 'target', 'label'])
# create a graph from the dataset (uses the 'source' and 'target' columns,
# which are from_pandas_edgelist's default column names)
G = nx.from_pandas_edgelist(df)

# Per-node lookup tables, computed once for the whole graph.
node_degree = dict(G.degree())
degree_centrality = nx.degree_centrality(G)

# Map every row's endpoints to their own node values. Assigning a scalar to
# df[...] inside a loop overwrites the entire column on each pass, which left
# every row holding the value of the last node visited.
df['source_degree'] = df['source'].map(node_degree)
df['source_dc'] = df['source'].map(degree_centrality)
df['target_degree'] = df['target'].map(node_degree)
df['target_dc'] = df['target'].map(degree_centrality)

# calculate common neighbors for each edge and store in a new column
df['common_neighbors'] = [
    len(list(nx.common_neighbors(G, u, v)))
    for u, v in zip(df['source'], df['target'])
]

# save the updated DataFrame to a new CSV file
df.to_csv('data_250.csv', index=False)
df.head(10)

In [None]:
# Keep the 'label' column on its own as a one-column DataFrame.
target = df.loc[:, ['label']]
target

In [None]:
#dropping target variables
# errors='ignore' keeps this cell re-runnable: once 'label' has been removed,
# running the cell again is a no-op instead of raising KeyError.
df = df.drop(columns=['label'], errors='ignore')
# Show a preview rather than rendering the whole frame.
df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Feature matrix: every remaining column of df.
X = df.to_numpy()
# `target` is a one-column DataFrame, so to_numpy() yields shape (n, 1).
# scikit-learn estimators expect a 1-D label vector and emit a
# DataConversionWarning for column vectors, hence the ravel().
Y = target.to_numpy().ravel()

# 80/20 split with a fixed seed so the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)
print ('Train set:', X_train.shape,  Y_train.shape)
print ('Test set:', X_test.shape,  Y_test.shape)

In [None]:
from sklearn.metrics import (
    confusion_matrix,
    ConfusionMatrixDisplay,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef,
)
from sklearn.neural_network import MLPClassifier

#Neural Networks (MLP)
# Seeded multi-layer perceptron, trained on the training split.
MLP = MLPClassifier(random_state=0, max_iter=600)
MLP.fit(X_train, Y_train)
Y_pred = MLP.predict(X_test)

#Confusion Matrix
cm = confusion_matrix(Y_test, Y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=MLP.classes_)
disp.plot(cmap="Blues")

#Evaluation
# Each score is printed as a percentage of the test split.
print("\nMLP")
print('Confusion Matrix:', cm)
for metric_name, metric_fn in (
    ('Accuracy: ', accuracy_score),
    ('Precision: ', precision_score),
    ('Recall: ', recall_score),
    ('F-score: ', f1_score),
):
    print(metric_name, metric_fn(Y_test, Y_pred)*100)

In [None]:
from sklearn.metrics import (
    confusion_matrix,
    ConfusionMatrixDisplay,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef,
)
from sklearn.naive_bayes import GaussianNB

#Naive Bayes
# Gaussian naive Bayes, trained on the training split.
NB = GaussianNB()
NB.fit(X_train, Y_train)
Y_pred = NB.predict(X_test)

#Confusion Matrix
# Computed once and reused for both the plot and the printed report.
cm = confusion_matrix(Y_test, Y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=NB.classes_)
disp.plot(cmap="Blues")

#Evaluation
# Each score is printed as a percentage of the test split.
print("\n Naive Bayes")
print('Confusion Matrix:', cm)
for metric_name, metric_fn in (
    ('Accuracy: ', accuracy_score),
    ('Precision: ', precision_score),
    ('Recall: ', recall_score),
    ('F-score: ', f1_score),
):
    print(metric_name, metric_fn(Y_test, Y_pred)*100)

In [None]:
from sklearn.metrics import (
    confusion_matrix,
    ConfusionMatrixDisplay,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef,
)
from sklearn.neighbors import KNeighborsClassifier

#K Nearest Neighbors
# 100-neighbour classifier using Euclidean distance, trained on the training split.
KNN = KNeighborsClassifier(n_neighbors=100, metric='euclidean')
KNN.fit(X_train, Y_train)
Y_pred = KNN.predict(X_test)

#Confusion Matrix
cm = confusion_matrix(Y_test, Y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=KNN.classes_)
disp.plot(cmap="Blues")

#Evaluation
# Each score is printed as a percentage of the test split.
print("\nK Nearest Neighbors")
print('Confusion Matrix:', cm)
for metric_name, metric_fn in (
    ('Accuracy: ', accuracy_score),
    ('Precision: ', precision_score),
    ('Recall: ', recall_score),
    ('F-score: ', f1_score),
):
    print(metric_name, metric_fn(Y_test, Y_pred)*100)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Imported here as in the other classifier cells, so this cell does not depend
# on one of them having been executed first.
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, precision_score, recall_score, f1_score

#Random Forest
# random_state is fixed (same seed as the MLP and the train/test split) so the
# forest, and therefore the reported scores, are reproducible between runs.
RF = RandomForestClassifier(n_estimators= 100, random_state=0).fit(X_train, Y_train)
Y_pred = RF.predict(X_test)

#Confusion Matrix
cm = confusion_matrix(Y_test, Y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=RF.classes_)
disp.plot(cmap="Blues")

#Evaluation
# Each score is printed as a percentage of the test split.
print("\nRandom Forest")
print('Confusion Matrix:' , cm)
print('Accuracy: ', accuracy_score(Y_test, Y_pred)*100)
print('Precision: ' , precision_score(Y_test, Y_pred)*100)
print('Recall: ', recall_score(Y_test, Y_pred)*100)
print('F-score: ' ,(f1_score(Y_test, Y_pred)*100))