In [0]:
# Mount Google Drive into the Colab filesystem at /content/drive/ so the
# project folder on Drive can be reached from the notebook.
from google.colab import drive
drive.mount('/content/drive/')

In [0]:
cd drive/My Drive/mln_project

In [0]:
!pip install node2vec

In [0]:
import pandas as pd
import numpy as np
import networkx as nx
import math
from scipy import spatial
import pickle
from sklearn.metrics import accuracy_score
import seaborn as sns
from node2vec import Node2Vec
import random

In [0]:
# Read the results file without treating any line as a header, so the file's
# first line ends up as row 0 of the array (presumably the column names, since
# the loops further down start at row 1 - verify against the CSV).
file2 = 'RegularSeasonCompactResults.csv'
df = pd.read_csv(file2, sep=',', header=None).to_numpy()

In [0]:
# Peek at the first five rows and the overall shape of the loaded array.
print(df[:5])
print(df.shape)

In [0]:
# Directed graph over the training rows (everything except row 0 and the last
# 2000 rows) whose first column is >= 2015. Each kept row adds an edge from the
# id in column 2 to the id in column 4 (winner -> loser, going by the wid/lid
# names used for these columns in the feature cells).
train_graph = nx.DiGraph()
train_graph.add_edges_from(
    (int(df[row][2]), int(df[row][4]))
    for row in range(1, df.shape[0] - 2000)
    if int(df[row][0]) >= 2015
)

In [0]:
# Size of the training graph: node count, then edge count.
print(train_graph.number_of_nodes(), train_graph.number_of_edges())

In [0]:
# One empty result list per node of the training graph; the feature cells
# append a 1/0 entry per game to these lists.
players_win_map = {node: [] for node in train_graph.nodes()}
print(len(players_win_map))
print(players_win_map)
  

In [0]:
# PageRank score per node, followed by a quick min / max / mean summary.
pr = nx.pagerank(train_graph, alpha=0.85)
print('min', min(pr.values()))
print('max', max(pr.values()))
print('mean', float(sum(pr.values())) / len(pr))

In [0]:
# HITS scores; nx.hits returns a (hubs, authorities) pair of dicts, so hits[0]
# summarised below is the hub score.
hits = nx.hits(train_graph, max_iter=100, tol=1e-08, nstart=None, normalized=True)
hub_scores = hits[0].values()
print('min', min(hub_scores))
print('max', max(hub_scores))
print('mean', float(sum(hub_scores)) / len(hits[0]))

In [0]:
# Katz centrality per node, followed by a quick min / max / mean summary.
# Called through the public top-level function instead of the `nx.katz`
# submodule attribute, which is only reachable as a side effect of star imports.
katz = nx.katz_centrality(train_graph, alpha=0.005, beta=1)
print('min', min(katz.values()))
print('max', max(katz.values()))
print('mean', float(sum(katz.values())) / len(katz))

In [0]:
# Set up node2vec on the training graph: 20-dimensional embeddings from
# 50 walks per node of length 16 (parameter meaning per the node2vec package - verify).
node2vec = Node2Vec(train_graph, dimensions = 20, walk_length = 16, num_walks = 50)
# Train the embedding model; the keyword arguments are presumably forwarded to
# gensim's Word2Vec - confirm against the installed node2vec version.
model = node2vec.fit(window=7, min_count=1)

In [0]:
def compute_shortest_path_length(a,b):
    """Length of the shortest directed path a -> b in ``train_graph``, ignoring a direct edge.

    A direct a -> b edge is taken out of the graph for the duration of the
    search so it cannot trivially yield 1, and is always put back afterwards.

    Returns:
        The path length, or -1 when no such path exists or either node is not
        in the graph.
    """
    had_direct_edge = train_graph.has_edge(a, b)
    if had_direct_edge:
        train_graph.remove_edge(a, b)
    try:
        return nx.shortest_path_length(train_graph, source=a, target=b)
    except (nx.NetworkXNoPath, nx.NodeNotFound):
        return -1
    finally:
        # Restore the edge even when the search raised. Previously a "no path"
        # exception skipped the re-add and permanently removed the edge from
        # train_graph, silently changing every later graph-based feature.
        if had_direct_edge:
            train_graph.add_edge(a, b)

def calc_adar_in(a,b):
    """Adamic-Adar style score over the successors shared by a and b in ``train_graph``.

    Every shared successor contributes 1 / log10(number of its predecessors).
    A successor with fewer than two predecessors is skipped, because
    log10(1) == 0 would otherwise turn the whole score into inf.

    Returns:
        The summed score, or 0 when there is no shared successor or a node is
        not in the graph.
    """
    try:
        shared = set(train_graph.successors(a)) & set(train_graph.successors(b))
    except nx.NetworkXError:
        # successors() raises this for a node that is not in the graph.
        return 0

    score = 0
    for node in shared:
        n_pred = len(list(train_graph.predecessors(node)))
        if n_pred > 1:
            score += 1 / np.log10(n_pred)
    return score

def jaccard_for_followees(a,b):
    """Jaccard similarity of the successor sets of a and b in ``train_graph``.

    Returns:
        |succ(a) & succ(b)| / |succ(a) | succ(b)|, or 0 when either node has no
        successors or is not in the graph.
    """
    try:
        succ_a = set(train_graph.successors(a))
        succ_b = set(train_graph.successors(b))
    except nx.NetworkXError:
        # successors() raises this for a node that is not in the graph.
        return 0
    # Logical `or`: the former `x == 0 | y == 0` parsed as a chained comparison
    # because `|` binds tighter than `==`.
    if not succ_a or not succ_b:
        return 0
    return len(succ_a & succ_b) / len(succ_a | succ_b)
#for followers
def jaccard_for_followers(a,b):
    """Jaccard similarity of the predecessor sets of a and b in ``train_graph``.

    Returns:
        |pred(a) & pred(b)| / |pred(a) | pred(b)|, or 0 when either node has no
        predecessors or is not in the graph.
    """
    try:
        # Both lookups go through train_graph; the earlier reference to an
        # undefined `g` raised NameError, which the bare except turned into a
        # constant 0 for every pair.
        pred_a = set(train_graph.predecessors(a))
        pred_b = set(train_graph.predecessors(b))
    except nx.NetworkXError:
        # predecessors() raises this for a node that is not in the graph.
        return 0
    if not pred_a or not pred_b:
        return 0
    return len(pred_a & pred_b) / len(pred_a | pred_b)

In [0]:
player_pair_history = {}
train_data = []
train_label = []


for i in range(1, df.shape[0] - 2000):
  if int(df[i][0]) < 2015:
    continue
  ind = random.randint(0, 10)
  if ind < 5:
    wh = int(df[i][2])
    bl = int(df[i][4])
  else:
    wh = int(df[i][4])
    bl = int(df[i][2])
  features = []
  # features_neg = []
  wid = int(df[i][2])
  lid = int(df[i][4])
  # add pagerank for white and black
  features.append(pr[wh] - pr[bl])
  # features_neg.append(pr[lid] - pr[wid])
  
  # add katz for white and black
  features.append(katz[wh] - katz[bl])
  # features_neg.append(katz[lid] - katz[wid])
  
  # add hits for white and black
  features.append(hits[0][wh] - hits[0][bl])
  features.append(hits[1][wh] - hits[1][bl])

  # features_neg.append(- hits[0][wid] + hits[0][lid])
  # features_neg.append( - hits[1][wid] + hits[1][lid])  

  #add shortest distance between white and black
  features.append(compute_shortest_path_length(wh, bl))
  # features_neg.append(compute_shortest_path_length(lid, wid))
  
  #add adamic adar between white and black
  features.append(calc_adar_in(wh, bl))
  # features_neg.append(calc_adar_in(lid, wid))

  #jaccard coeff between white and black for players defeated
  features.append(jaccard_for_followees(wh, bl))
  features.append(jaccard_for_followers(wh, bl))

  # features_neg.append(jaccard_for_followees(lid, wid))
  # features_neg.append(jaccard_for_followers(lid, wid))
  
  # white and black similarity
  v1 = model[str(wh)]
  v2 = model[str(bl)]
  features.append(spatial.distance.cosine(v1, v2))




  # non graph related features
 
  #performance of white against black previously
  wh_wins = 0
  bl_wins = 0
  
  if (wh, bl) in player_pair_history:
    wh_wins = player_pair_history[(wh, bl)]
    if wh == wid:
      player_pair_history[(wh, bl)] = wh_wins + 1
  else:
    if wh == wid:
      player_pair_history[(wh, bl)] = 1

  if (bl, wh) in player_pair_history:
    bl_wins = player_pair_history[(bl, wh)]
    if bl == wid:
      player_pair_history[(bl, wh)] = bl_wins + 1
  else:
    if bl == wid:
      player_pair_history[(bl, wh)] = 1
  features.append(wh_wins - bl_wins)
  
  if (train_graph.out_degree(wh) + train_graph.in_degree(wh)) != 0:
    per_win_wh = train_graph.out_degree(wh)/(train_graph.out_degree(wh) + train_graph.in_degree(wh))
  else:
    per_win_wh = 0
  if (train_graph.out_degree(bl) + train_graph.in_degree(bl)) != 0:
    per_win_bl = train_graph.out_degree(bl)/(train_graph.out_degree(bl) + train_graph.in_degree(bl))
  else:
      per_win_bl = 0
  features.append(per_win_wh - per_win_bl)

  #performance of white in last 10 games
  if wh in players_win_map:
    history = players_win_map[wh]
    if len(history) == 0:
      # features.append(0)
      
      x = 0
    else:
      
      if len(history) < 10:
        
        perf = np.count_nonzero(history == 1)
        perf = perf/len(history)
      else:
        perf = np.count_nonzero(history[-10:] == 1)
        perf = perf/10
      
      # features.append(perf)
      x = perf
      if wid == wh:
        history.append(1)
      else:
        history.append(0)
      players_win_map[wh] = history
  else:
    # features.append(0)
    x = 1
    players_win_map[wh] = [1]
  
  
  #performance of black in last 10 games
  if bl in players_win_map:
    history = players_win_map[bl]
    if len(history) == 0:
      # features.append(0)
      # features_neg.append(0)
      y = 0
    else:
      # history = np.array(history)
      perf = np.count_nonzero(history == 1)
      if len(history) < 10:
        perf = np.count_nonzero(history == 1)
        perf = perf/len(history)
        
      else:
        perf = np.count_nonzero(history[-10:] == 1)
        perf = perf/10
      
      # features.append(perf)
      # features_neg.append(perf) 
      y = perf
      if bl == lid:
        history.append(0)
      else:
        history.append(1)
      players_win_map[bl] = history
  else:
    # features.append(0)
    # features_neg.append(0)
    y = 0
    players_win_map[bl] = [0]

  # features_neg.append(x)
  features.append(x - y)

  train_data.append(features)
  # train_data.append(features_neg)
  if int(df[i][2]) == wh:
    train_label.append(1)
  else:
    train_label.append(0)

  

In [0]:
print(len(train_data))
print(len(train_data[1]))
print(len(train_label))

In [0]:
test_data = []
test_label = []


for i in range(df.shape[0] - 2000, df.shape[0]):
  if int(df[i][0]) < 2015:
    continue
  ind = random.randint(0, 10)
  if ind < 5:
    wh = int(df[i][2])
    bl = int(df[i][4])
  else:
    wh = int(df[i][4])
    bl = int(df[i][2])
  features = []
  # features_neg = []
  wid = int(df[i][2])
  lid = int(df[i][4])
  # add pagerank for white and black
  features.append(pr[wh] - pr[bl])
  # features_neg.append(pr[lid] - pr[wid])
  
  # add katz for white and black
  features.append(katz[wh] - katz[bl])
  # features_neg.append(katz[lid] - katz[wid])
  
  # add hits for white and black
  features.append(hits[0][wh] - hits[0][bl])
  features.append(hits[1][wh] - hits[1][bl])

  # features_neg.append(- hits[0][wid] + hits[0][lid])
  # features_neg.append( - hits[1][wid] + hits[1][lid])  

  #add shortest distance between white and black
  features.append(compute_shortest_path_length(wh, bl))
  # features_neg.append(compute_shortest_path_length(lid, wid))
  
  #add adamic adar between white and black
  features.append(calc_adar_in(wh, bl))
  # features_neg.append(calc_adar_in(lid, wid))

  #jaccard coeff between white and black for players defeated
  features.append(jaccard_for_followees(wh, bl))
  features.append(jaccard_for_followers(wh, bl))

  # features_neg.append(jaccard_for_followees(lid, wid))
  # features_neg.append(jaccard_for_followers(lid, wid))
  
  # white and black similarity
  v1 = model[str(wh)]
  v2 = model[str(bl)]
  features.append(spatial.distance.cosine(v1, v2))




  # non graph related features
  #performance of white in last 10 games
  if wh in players_win_map:
    history = players_win_map[wh]
    if len(history) == 0:
      # features.append(0)
      
      x = 0
    else:
      
      if len(history) < 10:
        
        perf = np.count_nonzero(history == 1)
        perf = perf/len(history)
      else:
        perf = np.count_nonzero(history[-10:] == 1)
        perf = perf/10
      
      # features.append(perf)
      x = perf
      if wid == wh:
        history.append(1)
      else:
        history.append(0)
      players_win_map[wh] = history
  else:
    # features.append(0)
    x = 1
    players_win_map[wh] = [1]
  
  
  #performance of black in last 10 games
  if bl in players_win_map:
    history = players_win_map[bl]
    if len(history) == 0:
      # features.append(0)
      # features_neg.append(0)
      y = 0
    else:
      # history = np.array(history)
      perf = np.count_nonzero(history == 1)
      if len(history) < 10:
        perf = np.count_nonzero(history == 1)
        perf = perf/len(history)
        
      else:
        perf = np.count_nonzero(history[-10:] == 1)
        perf = perf/10
      
      # features.append(perf)
      # features_neg.append(perf) 
      y = perf
      if bl == lid:
        history.append(0)
      else:
        history.append(1)
      players_win_map[bl] = history
  else:
    # features.append(0)
    # features_neg.append(0)
    y = 0
    players_win_map[bl] = [0]

  # features_neg.append(x)
  features.append(x - y)
  # features_neg.append(y - x)

  #performance of white against black previously
  wh_wins = 0
  bl_wins = 0
  
  if (wh, bl) in player_pair_history:
    wh_wins = player_pair_history[(wh, bl)]
    if wh == wid:
      player_pair_history[(wh, bl)] = wh_wins + 1
  else:
    if wh == wid:
      player_pair_history[(wh, bl)] = 1

  if (bl, wh) in player_pair_history:
    bl_wins = player_pair_history[(bl, wh)]
    if bl == wid:
      player_pair_history[(bl, wh)] = bl_wins + 1
  else:
    if bl == wid:
      player_pair_history[(bl, wh)] = 1
  features.append(wh_wins - bl_wins)

  if (train_graph.out_degree(wh) + train_graph.in_degree(wh)) != 0:
    per_win_wh = train_graph.out_degree(wh)/(train_graph.out_degree(wh) + train_graph.in_degree(wh))
  else:
    per_win_wh = 0
  if (train_graph.out_degree(bl) + train_graph.in_degree(bl)) != 0:
    per_win_bl = train_graph.out_degree(bl)/(train_graph.out_degree(bl) + train_graph.in_degree(bl))
  else:
      per_win_bl = 0
  features.append(per_win_wh - per_win_bl)

  #performance of white in last 10 games
  if wh in players_win_map:
    history = players_win_map[wh]
    if len(history) == 0:
      # features.append(0)
      
      x = 0
    else:
      
      if len(history) < 10:
        
        perf = np.count_nonzero(history == 1)
        perf = perf/len(history)
      else:
        perf = np.count_nonzero(history[-10:] == 1)
        perf = perf/10
      
      # features.append(perf)
      x = perf
      if wid == wh:
        history.append(1)
      else:
        history.append(0)
      players_win_map[wh] = history
  else:
    # features.append(0)
    x = 1
    players_win_map[wh] = [1]
  
  
  #performance of black in last 10 games
  if bl in players_win_map:
    history = players_win_map[bl]
    if len(history) == 0:
      # features.append(0)
      # features_neg.append(0)
      y = 0
    else:
      # history = np.array(history)
      perf = np.count_nonzero(history == 1)
      if len(history) < 10:
        perf = np.count_nonzero(history == 1)
        perf = perf/len(history)
        
      else:
        perf = np.count_nonzero(history[-10:] == 1)
        perf = perf/10
      
      # features.append(perf)
      # features_neg.append(perf) 
      y = perf
      if bl == lid:
        history.append(0)
      else:
        history.append(1)
      players_win_map[bl] = history
  else:
    # features.append(0)
    # features_neg.append(0)
    y = 0
    players_win_map[bl] = [0]

  # features_neg.append(x)
  features.append(x - y)

  test_data.append(features)
  # test_data.append(features_neg)
  if int(df[i][2]) == wh:
    test_label.append(1)
  else:
    test_label.append(0)

  

In [0]:
# Number of test rows and the length of the first test feature vector.
print(len(test_data))
print(len(test_data[0]))

In [0]:
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression

# Standardise both splits with statistics learned from the training rows only,
# then fit a logistic regression on the scaled training data.
scaler = preprocessing.StandardScaler()
train_data = scaler.fit_transform(train_data)
test_data = scaler.transform(test_data)
lr = LogisticRegression()
lr.fit(train_data, train_label)

In [0]:
# precision_score was used here before any cell imported it, which raises
# NameError on a fresh kernel; import everything this cell needs up front.
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, roc_auc_score

# Evaluate the logistic regression on the test rows.
predictions = lr.predict_proba(test_data)
test_pred = lr.predict(test_data)
print(accuracy_score(test_label, test_pred))
print("Precision = {}".format(precision_score(test_label, test_pred, average='weighted')))

# Confusion matrix as an annotated heatmap (rows: true label, columns: predicted).
cfm = confusion_matrix(test_label, test_pred)
df_cm = pd.DataFrame(cfm, range(2), range(2))
sns.set(font_scale = 0.8) # for label size
sns.heatmap(df_cm, annot=True, fmt = "d") # font size

# ROC AUC from the predicted probability of label 1.
print(roc_auc_score(test_label, predictions[:,1]))

In [0]:
test_label = np.array(test_label)
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Label 1 marks rows where the first-listed team (wh) is the column-2 team,
# i.e. the winner, so the curve built from predictions[:, 1] is the "Win" curve
# and the one built from predictions[:, 0] is the "Loss" curve. The legend
# entries used to be the other way round.
fpr_1, tpr_1, _ = roc_curve(test_label, predictions[:,1])
fpr_0, tpr_0, _ = roc_curve(1 - test_label, predictions[:, 0])
plt.figure()
lw = 2
plt.plot(fpr_0, tpr_0, color='red',
         lw=lw, label='Loss')
plt.plot(fpr_1, tpr_1, color='blue',
         lw=lw, label='Win')
# Diagonal = chance level.
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic curve')
plt.legend(loc="lower right")

plt.savefig('roc_BB.png', dpi = 500)
plt.show()

In [0]:
from sklearn.ensemble import RandomForestClassifier

# Depth-limited random forest; the printed value is its accuracy on the
# training rows themselves.
clf = RandomForestClassifier(max_depth=8, random_state=0).fit(train_data, train_label)
print(clf.score(train_data, train_label))


In [0]:

from sklearn.metrics import precision_score

# Evaluate the current `clf` on the test rows: accuracy, ROC AUC from the
# probability of label 1, and weighted precision.
predictions = clf.predict_proba(test_data)
test_pred = clf.predict(test_data)
print(accuracy_score(test_pred, test_label))
print(roc_auc_score(test_label, predictions[:, 1]))
print("Precision = {}".format(precision_score(test_label, test_pred, average='weighted')))

In [0]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the current test_pred as an annotated heatmap, written
# to cm_BB.png.
cfm = confusion_matrix(test_label, test_pred)
df_cm = pd.DataFrame(cfm, index=range(2), columns=range(2))
sns.set(font_scale=0.8)  # label size
sns.heatmap(df_cm, annot=True, fmt="d")
plt.savefig('cm_BB.png', dpi=500)

In [0]:
train_data = np.array(train_data)
import matplotlib.pyplot as plt

from sklearn.datasets import make_classification

# Impurity-based importances of the fitted forest in `clf`, with the spread
# across its individual trees used as error bars.
importances = clf.feature_importances_
std = np.std([tree.feature_importances_ for tree in clf.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Names in the order the training features are appended. Successors (outgoing
# edges) come before predecessors (incoming), and the head-to-head difference
# comes before the overall winning percentage; the previous list had both
# pairs swapped, so the ranking was printed against the wrong names.
feature_name = [
    "Page Rank Diff",
    "Katz Centrality Diff",
    "Hub Score Diff",
    "Authority Score Diff",
    "Shortest Path",
    "Adamic Adar",
    "JC outgoing",
    "JC incoming",
    "Node2vec cosine distance",
    "wins against opponent diff",
    "overall winning % diff",
    "% wins in recent(10) games diff",
]
assert len(feature_name) == train_data.shape[1], "feature_name is out of sync with the feature vectors"

# Print the feature ranking
print("Feature ranking:")
for f in range(train_data.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]) + " " + feature_name[indices[f]])

# Plot the impurity-based feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(train_data.shape[1]), importances[indices],
        color="r", yerr=std[indices], align="center")
plt.xticks(range(train_data.shape[1]), indices)
plt.xlim([-1, train_data.shape[1]])
plt.show()

In [0]:
from sklearn.metrics import accuracy_score, precision_score, roc_auc_score
from sklearn.neural_network import MLPClassifier

# Fit a multi-layer perceptron and report its training accuracy.
clf = MLPClassifier(random_state=1, max_iter=300).fit(train_data, train_label)
print(clf.score(train_data, train_label))

# Score the MLP on the test rows. test_pred must be recomputed here: without
# it, accuracy and precision were calculated from the predictions left behind
# by the previously evaluated model.
predictions = clf.predict_proba(test_data)
test_pred = clf.predict(test_data)
print(roc_auc_score(test_label, predictions[:,1], average = "macro", multi_class='ovr'))
print(accuracy_score(test_label, test_pred))
print("Precision = {}".format(precision_score(test_label, test_pred, average='weighted')))

In [0]:
import xgboost as xgb
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score

# Two-class soft-probability booster: predict() yields one probability column
# per class.
param = {
    'eta': 0.3,
    'max_depth': 3,
    'objective': 'multi:softprob',
    'num_class': 2}

steps = 20  # number of boosting rounds
train_d = xgb.DMatrix(train_data, label = train_label)
test_d = xgb.DMatrix(test_data, label= test_label)
# Kept under its own name: binding the booster to `model` replaced the node2vec
# embedding model that the feature-building cells read, so re-running those
# cells after this one failed.
xgb_model = xgb.train(param, train_d, steps)

preds = xgb_model.predict(test_d)
best_preds = np.argmax(preds, axis=1)  # most probable class per row

print("Precision = {}".format(precision_score(test_label, best_preds, average='weighted')))
print("Recall = {}".format(recall_score(test_label, best_preds, average='macro')))
print("Accuracy = {}".format(accuracy_score(test_label, best_preds)))
print(roc_auc_score(test_label, preds[:,1], average = "macro", multi_class='ovr'))

In [0]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of best_preds against the test labels as an annotated heatmap.
cfm = confusion_matrix(test_label, best_preds)
df_cm = pd.DataFrame(cfm, index=range(2), columns=range(2))
sns.set(font_scale=0.8)  # label size
sns.heatmap(df_cm, annot=True, fmt="d")


In [0]:
def degree_distribution(G):
    """Return the degree histogram of G on log10 axes.

    The distinct degree values and the fraction of nodes having each of them
    are computed, the smallest distinct degree is left out, and both are
    passed through log10.

    Returns:
        (x, y): x is a column vector of shape (k, 1) with log10(degree),
        y a flat array of length k with log10(fraction of nodes).
    """
    all_degrees = [deg for _, deg in G.degree()]
    # Not used further; like before, this raises ValueError for a graph without nodes.
    largest = max(all_degrees)

    distinct, counts = np.unique(all_degrees, return_counts=True)
    # Leave out the lowest distinct degree together with its count.
    distinct = distinct[1:]
    counts = counts[1:]

    log_degree = np.log10(distinct).reshape(-1, 1)
    log_fraction = np.log10(counts / G.number_of_nodes())
    return log_degree, log_fraction

In [0]:
import matplotlib.pyplot as plt

# Scatter of the log10 degree distribution of the training graph.
x, y = degree_distribution(train_graph)

(plt1,) = plt.plot(x, y, 'o', label='BasketBall Game')
plt.title("Degree distribution in loglog scale")
plt.xlabel("Degree")
plt.ylabel("Fraction of Nodes")
plt.legend(handles=[plt1])
plt.show()

In [0]:
# Largest out-degree in the training graph.
degree_sequence = sorted([d for n, d in train_graph.out_degree()], reverse=True)
# print("Degree sequence", degree_sequence)
dmax = max(degree_sequence)
print(dmax)

In [0]:
# Largest in-degree in the training graph.
degree_sequence = sorted([d for n, d in train_graph.in_degree()], reverse=True)
# print("Degree sequence", degree_sequence)
dmax = max(degree_sequence)
print(dmax)

In [0]:
# Average clustering coefficient of the training graph.
print(nx.average_clustering(train_graph))

In [0]:
import matplotlib.pyplot as plt
import networkx as nx

def plot_degree_dist(G):
    """Draw a histogram of the node degrees of G, save it as 'dd_BB.png' (dpi 500) and show it."""
    node_degrees = []
    for node in G.nodes():
        node_degrees.append(G.degree(node))

    plt.hist(node_degrees)
    plt.title('Degree Distribution')
    plt.xlabel('Degree')
    plt.ylabel('Number of Nodes')
    plt.savefig('dd_BB.png', dpi=500)
    plt.show()

plot_degree_dist(train_graph)