-
Notifications
You must be signed in to change notification settings - Fork 0
/
experiments.py
138 lines (115 loc) · 5.36 KB
/
experiments.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
from sklearn.model_selection import train_test_split
import utils
import preprocessing
import evaluation
import visualization
from feature_extractions import network_feature_extraction, nlp_feature_extractions
import graph
import numpy as np
import XGBoost
import pathlib
import os
import shutil
import pandas as pd
# Directory that contains this script; every output path is built from it.
SOURCE = pathlib.Path(__file__).parent

# --- load the data and reset the outputs folder ---
posts_data = utils.read_to_df()
users_data = utils.concat_posts_per_user(posts_data)

# Each run starts from an empty 'outputs' directory: remove a leftover one
# (with everything in it) and create it again.
path_object = SOURCE / 'outputs'
if path_object.exists():
    shutil.rmtree(path_object)
os.makedirs(path_object)

# --- pre-process the text of both frames ---
print("pre processing data...")
posts_data, users_data = (
    preprocessing.preprocess_text(posts_data),
    preprocessing.preprocess_text(users_data),
)
# --- create network with topics ---
# Topics are derived from the per-user frame, written to a CSV file under
# 'outputs', and that same file is then read back to build the graph.
print("create network")
# NOTE(review): the meaning of the positional arguments 0.1 and 5 is defined by
# graph.get_topics and is not visible from this script — confirm before tuning.
topics = graph.get_topics(users_data, 0.1, 5)
network_file_name = SOURCE / 'outputs/bullies_network.csv'
graph.create_csv_network_from_topics(network_file_name, topics)
network_graph = graph.create_graph(network_file_name)
# --- pre-process the network ---
print("pre processing network...")
# TODO(review): the original note here said "change back to 0.1", but the value
# passed is already 0.1 — confirm the intended threshold and drop this note.
network_graph = preprocessing.preprocess_graph(network_graph, 0.1)
# Called for its side effects only; the return value (if any) is not used.
graph.graph_attributes(network_graph)
# --- extract nlp features ---
print("extract nlp features...")
feature_list = ['post_length', 'tfidf', 'topics', 'screamer', 'words', 'off_dis', 'not_off_dis']
X_nlp_features = nlp_feature_extractions.extract_features(users_data, feature_list)

# Binary target shared by all three experiments: 1 where cb_level == 3, else 0.
# NOTE(review): the labels follow the row order of users_data, while the
# feature frames below are the result of merges on 'writer'. They are paired
# purely by position later on, so this assumes the merges neither drop nor
# reorder users — confirm against the feature-extraction helpers.
y_nlp = (users_data['cb_level'] == 3).astype(int)

# Per-writer features computed from the post-level frame; added to both the
# nlp and the network feature sets.
X_users = nlp_feature_extractions.extract_number_of_posts(posts_data)
X_nlp = X_nlp_features.merge(X_users, on='writer')

# --- extract network features ---
print("extract network features...")
feature_list = ['clustering_coefficient', 'degree', 'betweenness', 'closeness']
X_network_features = network_feature_extraction.extract_features(users_data, feature_list, network_graph)
y_network = y_nlp.copy()
X_network = X_network_features.merge(X_users, on='writer')

# --- combined data frame ---
# Merge the raw network features into X_nlp rather than merging X_nlp with
# X_network: both of those already contain the X_users columns, so merging
# them with each other would duplicate every X_users column (pandas would
# suffix them '_x' / '_y') and feed the same feature to the model twice.
X_combined = X_nlp.merge(X_network_features, on='writer')
y_combined = y_nlp.copy()

# --- remove the writer column: it is a join key, not a feature ---
X_nlp = X_nlp.drop(columns=['writer'])
X_network = X_network.drop(columns=['writer'])
X_combined = X_combined.drop(columns=['writer'])
# --- create train set and test set ---
# All three feature sets are split with identical settings: 20% of the rows
# are held out for testing and the shuffle seed is fixed.
split_kwargs = {'test_size': 0.2, 'random_state': 42}
X_nlp_train, X_nlp_test, y_nlp_train, y_nlp_test = train_test_split(
    X_nlp, y_nlp, **split_kwargs)
X_network_train, X_network_test, y_network_train, y_network_test = train_test_split(
    X_network, y_network, **split_kwargs)
X_combined_train, X_combined_test, y_combined_train, y_combined_test = train_test_split(
    X_combined, y_combined, **split_kwargs)
def _train_and_report(X_train, y_train, X_test, y_test, label, threshold=0.5):
    """Train one XGBoost wrapper, score the test set and print its performances.

    Args:
        X_train: training features, passed to the wrapper's ``train``.
        y_train: training labels, passed to the wrapper's ``train``.
        X_test: test features, passed to the wrapper's ``predict``.
        y_test: true test labels, compared against the thresholded predictions.
        label: name printed in the ``'<label> performances:'`` header.
        threshold: scores strictly above this value are labelled 1, others 0.

    Returns:
        A tuple ``(model, classifier, scores, predictions, performances)``:
        the wrapper object, whatever its ``train`` returned, the raw output of
        ``predict``, the 0/1 predictions and the mapping returned by
        ``evaluation.get_performances``.
    """
    model = XGBoost.XGBoost()
    classifier = model.train(X_train, y_train)
    # Presumably class-1 probabilities (the script names them "prob") —
    # confirm against the XGBoost wrapper module.
    scores = model.predict(X_test)
    predictions = np.where(scores > threshold, 1, 0)
    performances = evaluation.get_performances(y_test, predictions)
    print(f'{label} performances:')
    for metric, value in performances.items():
        print(metric, value)
    return model, classifier, scores, predictions, performances


print("train models...")

# --- nlp model ---
xgb_obj, xgb_classifier, y_nlp_prob_xgb, y_nlp_pred_xgb, performances_nlp = _train_and_report(
    X_nlp_train, y_nlp_train, X_nlp_test, y_nlp_test, 'nlp')

# --- network model ---
(xgb_obj, xgb_classifier_net, y_network_prob_xgb, y_network_pred_xgb,
 performances_network) = _train_and_report(
    X_network_train, y_network_train, X_network_test, y_network_test, 'network')

# --- nlp combined with network model ---
# xgb_obj is rebound on every run, so it ends up holding the combined model.
(xgb_obj, xgb_classifier_combined, y_combined_prob_xgb, y_combined_pred_xgb,
 performances_combined) = _train_and_report(
    X_combined_train, y_combined_train, X_combined_test, y_combined_test, 'combined')
# --- evaluation for nlp ---
# Each section below feeds the raw (un-thresholded) model output and the true
# test labels to evaluation.get_roc_auc, then plots the returned triple under
# its own title.
print("evaluation for nlp only")
roc_auc_nlp, fpr_nlp, tpr_nlp = evaluation.get_roc_auc(y_nlp_test, y_nlp_prob_xgb)
visualization.plot_roc_curve(roc_auc_nlp, fpr_nlp, tpr_nlp, 'nlp ROC/AUC')
# --- evaluation for network ---
print("evaluation for network only")
roc_auc_network, fpr_network, tpr_network = evaluation.get_roc_auc(y_network_test, y_network_prob_xgb)
visualization.plot_roc_curve(roc_auc_network, fpr_network, tpr_network, 'network ROC/AUC')
# --- evaluation for nlp combined with network ---
print("evaluation for nlp and network combined")
roc_auc_combined, fpr_combined, tpr_combined = evaluation.get_roc_auc(y_combined_test, y_combined_prob_xgb)
visualization.plot_roc_curve(roc_auc_combined, fpr_combined, tpr_combined, 'combined ROC/AUC')
# --- comparison for all three ---
# Uses the performance mappings computed from the thresholded predictions,
# not the ROC values above.
visualization.plot_models_compare_performance(performances_nlp, performances_network, performances_combined)
#################### Network analysis ##################
# Everything below works on the pre-processed network_graph and is
# independent of the trained models.
# --- find communities ---
print('find communities...')
communities = graph.find_communities(network_graph)
graph.show_communities_graph(network_graph,communities)
# --- find top ten from centrality ---
print('Top 10...')
# NOTE(review): unlike the other graph helpers, this call is not given
# network_graph — presumably it relies on state or files produced by an
# earlier graph call; confirm in the graph module.
top_10_between, top_10_closeness, top_10_degree = graph.top_10_centrality()
# --- find correlation between the centrality rankings ---
print('find correlation...')
graph.correlation(top_10_between, top_10_closeness, top_10_degree)
# --- check if the graph is scale free ---
print('Check degree distribution...')
graph.check_power_law(network_graph)