In [1]:
import numpy as np
import pandas as pd

treino_path = 'nba_treino.csv'
teste_path = 'nba_teste.csv'

#Utils

In [2]:
def readAndStandardizeInput(file):
  df = pd.read_csv(file)
  # Standardization (mean 0 std 1)
  df.loc[:,'GP':'TOV'] = (df.loc[:,'GP':'TOV'] - df.loc[:,'GP':'TOV'].mean()) / df.loc[:,'GP':'TOV'].std()
  return df

In [3]:
def readInputFromFileList(files):
  dfs = []
  for file in files:
    df = pd.read_csv(file)
    dfs.append(df)
  df = pd.concat(dfs)
  df = df.reset_index(drop=True)

  return df

In [4]:
def _removeOutliers(df, nstds = 3):
  mean = df.loc[:,'GP':'TOV'].mean()
  std = df.loc[:,'GP':'TOV'].std()
  minLimit = mean - nstds*std
  maxLimit = mean + nstds*std
  maxOutlier = (df.loc[:,'GP':'TOV'] > maxLimit).sum(axis=1)
  minOutlier = (df.loc[:,'GP':'TOV'] < minLimit).sum(axis=1)
  outliers = maxOutlier + minOutlier
  return df[outliers < 1].copy()

In [5]:
def treatData(df, standardize = True, removeOutliers = True, ndesv = 3):
  df_copy = df.copy()
  if removeOutliers:
   df_copy = _removeOutliers(df_copy)

  if(standardize):
    # Standardization (mean 0 std 1)
    df_copy.loc[:,'GP':'TOV'] = (df_copy.loc[:,'GP':'TOV'] - df_copy.loc[:,'GP':'TOV'].mean()) / df_copy.loc[:,'GP':'TOV'].std()
  return df_copy

In [6]:
def calculateOneToManyEuclidianDistances(arr, arrs):
  return np.linalg.norm(arrs - arr, axis=1)

# KNN

In [7]:
def knnClassifier(train, instance, k):
  predicted_instance = instance.loc['GP':'TOV']
  distances = calculateOneToManyEuclidianDistances(predicted_instance, train.loc[:,'GP':'TOV'])
  sorting = np.argsort(distances)
  neighbors = train.loc[:,'TARGET_5Yrs'][sorting]
  kneighbors = neighbors[:k]
  if(kneighbors.mean() > 0.5):
    return 1
  else:
    return 0

In [8]:
def knnPredictMany(train, test, k):
  predicted = []
  for i in range(test.shape[0]):
    predicted.append(knnClassifier(train_df, test.loc[i,'GP':'TOV'], k))
  predicted = np.array(predicted)
  return predicted

In [9]:
from sklearn.neighbors import KNeighborsClassifier
def knnScikitPredictMany(train, test, k):
  neigh = KNeighborsClassifier(n_neighbors=k)
  neigh.fit(np.array(train.loc[:,'GP':'TOV']), np.array(train.loc[:,'TARGET_5Yrs']))
  return neigh.predict(np.array(test.loc[:,'GP':'TOV']))

In [10]:
def evaluate(pred, correct):
  TP = 0
  TN = 0
  FP = 0
  FN = 0
  for i in range(len(pred)):
    if(correct[i] == 1):
      if(pred[i] == 0):
        FN = FN + 1
      if(pred[i] == 1):
        TP = TP + 1
    if(correct[i] == 0):
      if(pred[i] == 0):
        TN = TN + 1
      if(pred[i] == 1):
        FP = FP + 1
  return (TP,TN,FP,FN)

In [11]:
def printMetricsAndConfusionMatrix(predicted, actual):
  (TP,TN,FP,FN) = evaluate(predicted, actual)
  data = {'1': [TP, FN],
        '0': [FP , TN]}
  confusion_matrix = pd.DataFrame(data, index=[1, 0])
  confusion_matrix.columns.name = 'Actual'
  confusion_matrix.index.name = 'Predicted'
  accuracy =  (TP + TN)/(TP + FP + TN + FN)
  recall = TP/(TP+TN)
  precision = TP/(TP+FP)
  f1 = (2*precision*recall)/(precision+recall)
  print(confusion_matrix)
  print("Accuracy -", accuracy)
  print("Recall -", recall)
  print("Precision -", precision)
  print("F1 -", f1)

In [12]:
print("KNN")

KNN


In [13]:
train_df = readAndStandardizeInput(treino_path)
test_df = readAndStandardizeInput(teste_path)
nTestInstances = test_df.shape[0]

In [14]:
actual = np.array(test_df.loc[:,'TARGET_5Yrs'])

In [15]:
kValues = [2, 10, 50, 80]

In [16]:
print("Implemententado")
for k in kValues:
  print('k -',k)
  predicted = knnPredictMany(train_df,test_df,k)
  printMetricsAndConfusionMatrix(predicted,actual)
  print()

Implemententado
k - 2
Actual      1   0
Predicted        
1          84  27
0          84  73
Accuracy - 0.585820895522388
Recall - 0.535031847133758
Precision - 0.7567567567567568
F1 - 0.626865671641791

k - 10
Actual       1   0
Predicted         
1          125  44
0           43  56
Accuracy - 0.6753731343283582
Recall - 0.6906077348066298
Precision - 0.7396449704142012
F1 - 0.7142857142857143

k - 50
Actual       1   0
Predicted         
1          128  52
0           40  48
Accuracy - 0.6567164179104478
Recall - 0.7272727272727273
Precision - 0.7111111111111111
F1 - 0.7191011235955056

k - 80
Actual       1   0
Predicted         
1          127  49
0           41  51
Accuracy - 0.664179104477612
Recall - 0.7134831460674157
Precision - 0.7215909090909091
F1 - 0.7175141242937854



In [17]:
print("__________________________________")
print("Scikit Learn")
for k in kValues:
  print('k -',k)
  predicted_scikit = knnScikitPredictMany(train_df,test_df,k)
  printMetricsAndConfusionMatrix(predicted_scikit,actual)
  print()

__________________________________
Scikit Learn
k - 2
Actual      1   0
Predicted        
1          84  27
0          84  73
Accuracy - 0.585820895522388
Recall - 0.535031847133758
Precision - 0.7567567567567568
F1 - 0.626865671641791

k - 10
Actual       1   0
Predicted         
1          125  44
0           43  56
Accuracy - 0.6753731343283582
Recall - 0.6906077348066298
Precision - 0.7396449704142012
F1 - 0.7142857142857143

k - 50
Actual       1   0
Predicted         
1          128  52
0           40  48
Accuracy - 0.6567164179104478
Recall - 0.7272727272727273
Precision - 0.7111111111111111
F1 - 0.7191011235955056

k - 80
Actual       1   0
Predicted         
1          127  49
0           41  51
Accuracy - 0.664179104477612
Recall - 0.7134831460674157
Precision - 0.7215909090909091
F1 - 0.7175141242937854



# K-Means


In [18]:
print("________________________________________________________")
print("K-means")

________________________________________________________
K-means


In [19]:
def KMeansClassifyCluster(k, nrows, centroids, all_dataset_df_without_predicted):
  distances = np.zeros((k, nrows))
  for centroid_i, centroid in enumerate(centroids):
    distances[centroid_i,:] = calculateOneToManyEuclidianDistances(centroid, all_dataset_df_without_predicted)
  classification = np.argmin(distances, axis=0)
  return classification

In [20]:
def KMeansCluster(k,all_dataset_df):
  all_dataset_df_without_predicted = all_dataset_df.loc[:,'GP':'TOV'].copy()
  nrows = all_dataset_df_without_predicted.shape[0]
  centroids = rng.choice(all_dataset_df_without_predicted, size=k, replace=False)
  classification = None
  new_classification = KMeansClassifyCluster(k, nrows, centroids, all_dataset_df_without_predicted)

  while not(np.array_equal(classification, new_classification)):
    classification = new_classification
    for centroid_i in range(k):
      indexes = np.where(classification == centroid_i)[0]
      centroid = all_dataset_df_without_predicted.iloc[indexes].mean().values
      centroids[centroid_i,:] = centroid
    new_classification = KMeansClassifyCluster(k, nrows, centroids, all_dataset_df_without_predicted)
  return centroids

In [21]:
def getMetrics(TP, TN, FP, FN):
  accuracy =  (TP + TN)/(TP + FP + TN + FN)
  recall = TP/(TP+TN)
  precision = TP/(TP+FP)
  f1 = (2*precision*recall)/(precision+recall)
  return (accuracy, recall, precision, f1)

In [22]:
def evaluateKMeans(centroids, all_dataset_df, k):
  ans = all_dataset_df.loc[:,'TARGET_5Yrs']
  associated = np.zeros([k])
  total = all_dataset_df.shape[0]

  all_dataset_df_without_predicted = all_dataset_df.loc[:,'GP':'TOV'].copy()
  nrows = all_dataset_df.shape[0]
  classification = KMeansClassifyCluster(k, nrows, centroids, all_dataset_df_without_predicted)

  TP, TN, FP, FN = 0, 0, 0, 0
  print("count - most frequent - precision")
  correct = 0
  for centroid_i in range(k):
      indexes = np.where(classification == centroid_i)[0]
      associated[centroid_i] = np.round(ans.iloc[indexes].mean())
      current_total = indexes.shape[0]
      current_correct = (ans.iloc[indexes] == associated[centroid_i]).sum()
      correct = correct + current_correct
      print(indexes.shape[0], '-', associated[centroid_i], '-', current_correct/current_total)
      if(associated[centroid_i] == 1):
        TP = TP + current_correct
        FP = FP + current_total - current_correct
      else:
        TN = TN + current_correct
        FN = FN + current_total - current_correct
  print("Summary")

  (accuracy, recall, precision, f1) =getMetrics(TP, TN, FP, FN)
  print("Accuracy -", accuracy)
  print("Recall -", recall)
  print("Precision -", precision)
  print("F1 -", f1)

In [23]:
print("Implemententado")
all_dataset_df = readInputFromFileList([treino_path,teste_path])
k = 2
standardize = False
removeOutliers = True
ndesv = 3
df = treatData(all_dataset_df,standardize = standardize, removeOutliers = removeOutliers, ndesv = ndesv)
rng = np.random.default_rng(seed = 150)
centroids = KMeansCluster(k, df)
df_std = treatData(all_dataset_df,standardize = standardize, removeOutliers = False)
print("k -",k)
evaluateKMeans(centroids, df_std, k)

Implemententado
k - 2
count - most frequent - precision
589 - 0.0 - 0.5704584040747029
751 - 1.0 - 0.7696404793608522
Summary
Accuracy - 0.682089552238806
Recall - 0.6323851203501094
Precision - 0.7696404793608522
F1 - 0.6942942942942943


In [24]:
from sklearn.cluster import KMeans

In [25]:
print("__________________________________")
print("Scikit Learn")
all_dataset_df = readInputFromFileList([treino_path,teste_path])
k = 2
standardize = False
removeOutliers = True
ndesv = 3
df = treatData(all_dataset_df,standardize = standardize, removeOutliers = removeOutliers, ndesv = ndesv)
kmeans = KMeans(n_clusters=k, random_state=0).fit(df.loc[:,'GP':'TOV'])
scikit_centroids = kmeans.cluster_centers_
df_std = treatData(all_dataset_df,standardize = standardize, removeOutliers = False)
print("k -",k)

evaluateKMeans(scikit_centroids, df_std, k)

__________________________________
Scikit Learn




k - 2
count - most frequent - precision
751 - 1.0 - 0.7696404793608522
589 - 0.0 - 0.5704584040747029
Summary
Accuracy - 0.682089552238806
Recall - 0.6323851203501094
Precision - 0.7696404793608522
F1 - 0.6942942942942943


In [26]:
print("_________________________________________________")
print("distancias centroides")
print(np.linalg.norm(centroids[0] - scikit_centroids[1]))
print(np.linalg.norm(centroids[1] - scikit_centroids[0]))
print("_________________________________________________")

_________________________________________________
distancias centroides
3.04837100093636e-14
2.9765949421506847e-14
_________________________________________________


In [27]:
print("Implemententado")
all_dataset_df = readInputFromFileList([treino_path,teste_path])
k = 3
standardize = False
removeOutliers = True
ndesv = 3
df = treatData(all_dataset_df,standardize = standardize, removeOutliers = removeOutliers, ndesv = ndesv)
rng = np.random.default_rng(seed = 150)
centroids = KMeansCluster(k, df)
df_std = treatData(all_dataset_df,standardize = standardize, removeOutliers = False)
print("k -",k)
evaluateKMeans(centroids, df_std, k)

Implemententado
k - 3
count - most frequent - precision
510 - 1.0 - 0.7666666666666667
364 - 0.0 - 0.6236263736263736
466 - 1.0 - 0.6502145922746781
Summary
Accuracy - 0.6873134328358209
Recall - 0.753528773072747
Precision - 0.7110655737704918
F1 - 0.731681602530311


In [28]:
print("__________________________________")
print("Scikit Learn")
all_dataset_df = readInputFromFileList([treino_path,teste_path])
k = 3
standardize = False
removeOutliers = True
ndesv = 3
df = treatData(all_dataset_df,standardize = standardize, removeOutliers = removeOutliers, ndesv = ndesv)
kmeans = KMeans(n_clusters=k, random_state=0).fit(df.loc[:,'GP':'TOV'])
scikit_centroids = kmeans.cluster_centers_
df_std = treatData(all_dataset_df,standardize = standardize, removeOutliers = False)
print("k -",k)
evaluateKMeans(scikit_centroids, df_std, k)

__________________________________
Scikit Learn




k - 3
count - most frequent - precision
509 - 1.0 - 0.768172888015717
365 - 0.0 - 0.6246575342465753
466 - 1.0 - 0.6502145922746781
Summary
Accuracy - 0.6880597014925374
Recall - 0.7527114967462039
Precision - 0.7117948717948718
F1 - 0.731681602530311


In [29]:
print("_________________________________________________")
print("distancias centroides")
print(np.linalg.norm(centroids[0] - scikit_centroids[0]))
print(np.linalg.norm(centroids[1] - scikit_centroids[1]))
print(np.linalg.norm(centroids[2] - scikit_centroids[2]))
print("_________________________________________________")

_________________________________________________
distancias centroides
0.0482444070817285
0.055375801841407855
2.5453735000760633e-14
_________________________________________________
