In [10]:
import pandas as pd
import numpy as np
import csv

In [7]:
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn import svm
import lightgbm as lgb
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score


This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


# Import Text Features

In [201]:
df_text = pd.read_csv('text_features.csv')
del df_text['Unnamed: 0']
df_text['node1'] = df_text['node1'].astype(str)
df_text['node2'] = df_text['node2'].astype(str)

# Import Graph Features

In [172]:
import re
def number(x):
    return re.sub("[^0-9]", "", x)

def clean_df_nodes(d):
    d.iloc[:,0] = d.iloc[:,0].apply(lambda x : number(x))
    d.iloc[:,1] = d.iloc[:,1].apply(lambda x : number(x))
    d.columns = ['source', 'target']
    return d

In [189]:
with open("training_labels_0_307756.txt", "r") as f:
    reader = csv.reader(f, delimiter=' ')
    training_labels_1 = list(reader)
    
with open("training_labels_307756_615512.txt", "r") as f:
    reader = csv.reader(f, delimiter=' ')
    training_labels_2  = list(reader)
    
with open("training_graph_features_0_307756.txt", "r") as f:
    reader = csv.reader(f, delimiter=' ')
    training_feat_1  = list(reader)
    
with open("training_graph_features_307756_615512.txt", "r") as f:
    reader = csv.reader(f, delimiter=' ')
    training_feat_2  = list(reader)

with open("training_labels_0_307756.txt", "r") as f:
    reader = csv.reader(f)
    training_nodes_1  = list(reader)

df_nodes_1 = pd.read_csv('training_nodes_0_307756.csv', sep=' ', header=None)
df_nodes_1 = clean_df_nodes(df_nodes_1)
df_nodes_2 = pd.read_csv('training_nodes_307756_615512.csv', sep=' ', header=None)
df_nodes_2 = clean_df_nodes(df_nodes_2)
df_nodes = pd.concat([df_nodes_1, df_nodes_2], axis=0)
df_nodes.reset_index(inplace=True)


df_1 = pd.DataFrame.from_records(np.array(training_feat_1))
df_2 = pd.DataFrame.from_records(np.array(training_feat_2))
df_feat = pd.concat([df_1, df_2], axis=0)
df_feat.reset_index(inplace=True)

df_labels = pd.DataFrame.from_records(np.array(training_labels_1), columns=['label'])
df_labels.label = df_labels.label.apply(lambda x: int(float(x)))

df_graph = pd.concat([df_nodes, df_feat, df_labels], axis=1)
del df_graph['index']
df_graph.columns = ['node1', 'node2', 'graph_0', 'graph_1', 'graph_2', 'graph_3', 'graph_4', 'graph_5', 'label']


# Merging Features

In [204]:
df_merged = df_text.merge(df_graph, on=['node1', 'node2'])

In [205]:
df_merged.shape

(62, 14)

In [182]:
del df_graph['index']

df_graph.columns = ['source', 'target', 'graph_0', 'graph_1', 'graph_2', 'graph_3', 'graph_4', 'graph_5', 'label']


In [150]:
with open("training_nodes_0_307756.csv", "r") as f:
    reader = csv.reader(f)
    node_info  = list(reader)

In [158]:
d = pd.read_csv('training_nodes_0_307756.csv', sep=' ', header=None)

In [169]:
d.iloc[:,0] = d.iloc[:,0].apply(lambda x : number(x))
d.iloc[:,1] = d.iloc[:,1].apply(lambda x : number(x))
d.columns = ['source', 'target']

In [171]:
d.shape

(31, 2)

In [63]:
df_l = pd.DataFrame.from_records(np.array(training_labels_1))
df_l.iloc[:,0] = df_l.iloc[:,0].apply(lambda x: int(float(x)))


In [64]:
df_l

Unnamed: 0,0
0,1
1,1
2,1
3,1
4,0
...,...
180,1
181,0
182,1
183,1


In [61]:
float('1.000000000000000000e+00')

1.0

In [4]:
X_train = df_text.iloc[]

Unnamed: 0.1,Unnamed: 0,node1,node2,author_overlap,title_overlap,abstract_overlap,cos_sim,is_link
615507,615507,207180,9912293,0,0,2,0.044665,1
615508,615508,208203,9912293,0,0,4,0.08444,1
615509,615509,9509019,9912293,1,0,4,0.017692,0
615510,615510,9903127,9912293,1,0,0,0.0,0
615511,615511,9903147,9912293,0,0,3,0.021835,0


In [None]:
kf = StratifiedKFold(n_splits=3, shuffle=True)

In [8]:
def train_predict_save(model, X_train, X_val, y_train, y_val):
    fscore_train = f1_score(y_train,
                        model.predict(X_train))
    fscore_val = f1_score(y_val,
                        model.predict(X_val))

    return(fscore_train, fscore_val)

def eval_model(model, X_train, y_train, idx):
    predicts_t = []
    predicts_v = []
    for train_features, training_labels in kf.split(train_features, training_labels):
        n = int(0.1*(len(train_index)+len(test_index)))
        random.shuffle(test_index)
        test_index_new = test_index[:n]
        train_index_new = np.union1d(test_index[n:], train_index)

        X_train, X_val = train_features.iloc[train_index_new], train_features.iloc[test_index_new]
        y_train, y_val = training_labels.iloc[train_index_new], training_labels.iloc[test_index_new]
        
        if idx == 0:
            model.fit(X_train, y_train, eval_set=[(X_val, y_val)],
                    early_stopping_rounds=50, verbose=None)
        else:
            model.fit(X_train, y_train)
        fscore_t, fscore_v = train_predict_save(model, X_train, X_val, y_train, y_val)
        predicts_t.append(fscore_t)
        predicts_v.append(fscore_v)
        
    return(predicts_t, predicts_v)

In [None]:
modelGB = lgb.LGBMClassifier(objective='binary', reg_lambda=config.reg_lambda_gb,
                           n_estimators=config.n_estimator_GB  )
modelRF = RandomForestClassifier(n_estimators=500)
modelSVM = svm.LinearSVC()
modelL = LogisticRegression()

res_t = {}
res_v = {}
for idx, model in enumerate([modelGB, modelRF, modelSVM, modelL]):
    print(idx)
    (fscore_t, fscore_v) = eval_model(model, train_features, training_labels, idx)
    res_t[idx] = fscore_t
    res_v[idx] = fscore_v
