In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier
from sklearn.metrics import accuracy_score

In [20]:
def prep_data(path, train_percent):
    """Load a link-prediction dataset, split it, and min-max scale selected features.

    The CSV at ``path`` must contain the columns listed in ``to_normalize`` and
    ``to_drop`` below, plus the ``link_exists`` label column.

    The split is performed BEFORE scaling, and the min/max statistics are taken
    from the training split only. Computing them on the full dataset would leak
    information about the test rows into the training features. As a
    consequence, scaled test values may fall outside the [0, 1] interval.

    Parameters
    ----------
    path : str
        Path of the CSV file to read.
    train_percent : float
        Fraction of the rows used for training; the remainder is the test set.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, y_test)`` -- note that this order differs
        from the one returned by ``train_test_split``.
    """
    to_normalize = ['neighbours_1', 'neighbours_2', 'common_neigbhours', 'total_neigbhours',
                    'prefferential_attachment', 'friends_measure', 'shortest_path']

    # Node identifiers and the ego-network density features are excluded from
    # the feature matrix in this experiment.
    to_drop = ['node_1', 'node_2', 'density_ego_with_node_1', 'density_ego_with_node_2',
               'density_ego_without_node_1', 'density_ego_without_node_2']

    data = pd.read_csv(path)
    data = data.drop(to_drop, axis='columns')

    label = data['link_exists']
    features = data.drop(['link_exists'], axis='columns')

    x_train, x_test, y_train, y_test = train_test_split(
        features, label, test_size=1 - train_percent, random_state=42)

    # Work on copies so the assignments below never write into a view of `features`.
    x_train = x_train.copy()
    x_test = x_test.copy()

    # Fit the scaling statistics on the training split only.
    col_min = x_train[to_normalize].min()
    col_range = x_train[to_normalize].max() - col_min
    # A constant column has a zero range; dividing by 1 instead maps it to 0.0
    # rather than producing NaN (0 / 0), which the classifiers cannot handle.
    col_range = col_range.replace(0, 1)

    x_train[to_normalize] = (x_train[to_normalize] - col_min) / col_range
    x_test[to_normalize] = (x_test[to_normalize] - col_min) / col_range

    return x_train, y_train, x_test, y_test


In [21]:
# Results table: one row per (dataset, model, training fraction) evaluation.
model_performance_data = pd.DataFrame(
    columns=[
        'Dataset',
        'Model',
        'Percentage of data in training',
        'Train Accuracy',
        'Test Accuracy',
    ]
)

In [22]:
def train_model(model, x_train, y_train, x_test, y_test):
    """Fit ``model`` on the training split and score it on both splits.

    ``model`` must provide ``fit(X, y)`` and ``predict(X)``.

    Returns
    -------
    tuple
        ``(train_accuracy, test_accuracy)`` as computed by ``accuracy_score``.
    """
    model.fit(x_train, y_train)

    train_predictions = model.predict(x_train)
    train_accuracy = accuracy_score(y_train, train_predictions)

    test_predictions = model.predict(x_test)
    test_accuracy = accuracy_score(y_test, test_predictions)

    return train_accuracy, test_accuracy

In [23]:
def model_performance(dataset, model, name, x_train, y_train, x_test, y_test, percentage_of_data_in_training):
    """Train and score ``model``, then append the result to ``model_performance_data``.

    The appended row holds, in order: ``dataset``, ``name``,
    ``percentage_of_data_in_training``, the train accuracy and the test accuracy.
    The function mutates the module-level ``model_performance_data`` frame and
    returns nothing.
    """
    train_acc, test_acc = train_model(model, x_train, y_train, x_test, y_test)

    # Use the current row count as the label of the new row.
    next_row = len(model_performance_data.index)
    model_performance_data.loc[next_row] = [
        dataset,
        name,
        percentage_of_data_in_training,
        train_acc,
        test_acc,
    ]

In [24]:
# Evaluate every classifier on every dataset and training fraction, then save the table.
data = ['data/CondMat.csv', 'data/GenRel.csv', 'data/ErdosRenyi.csv', 'data/BarabasiAlbert.csv']

percentages = [0.1, 0.3, 0.5, 0.7, 0.9]

# (label, estimator class, constructor arguments) in evaluation order.
# A fresh estimator is built for every (dataset, fraction) pair.
# NOTE(review): the label 'KNeigborsClassifier' is misspelled; it is kept as-is
# because it is written to the results file.
classifier_specs = [
    ('GaussianNB', GaussianNB, {}),
    ('DecisionTreeClassifier', DecisionTreeClassifier, {'criterion': 'entropy', 'random_state': 42}),
    ('KNeigborsClassifier', KNeighborsClassifier, {}),
    ('AdaBoostClassifier', AdaBoostClassifier, {'algorithm': 'SAMME', 'random_state': 42}),
    ('BaggingClassifier', BaggingClassifier, {'random_state': 42}),
]

for dataset in data:
    for percent in percentages:
        x_train, y_train, x_test, y_test = prep_data(dataset, percent)
        for model_label, estimator_cls, estimator_kwargs in classifier_specs:
            model = estimator_cls(**estimator_kwargs)
            model_performance(dataset, model, model_label, x_train, y_train, x_test, y_test, percent)

model_performance_data.to_csv('performance/model_performance_without_ego.csv', index=False)

Nemamo problema sa overfit-om i underfit-om zato što je rezultat predviđanja sličan na skupu za treniranje i na skupu za testiranje.

Modeli daju bolja predviđanja nad veštački generisanim mrežama (Erdős–Rényi i Barabási–Albert) i nad većim mrežama, zato što veličina skupa podataka za treniranje utiče na tačnost modela. To se vidi i na pojedinačnim mrežama: što veći deo podataka koristimo za treniranje, to je predviđanje bolje.

Modeli bazirani na stablima odlučivanja