In [30]:
import pandas as pd
# Read the 2013 season player statistics and show which columns are available.
DATA_FILE = "nba_2013.csv"
nba = pd.read_csv(DATA_FILE)
column_names = nba.columns.values
print(column_names)

['player' 'pos' 'age' 'bref_team_id' 'g' 'gs' 'mp' 'fg' 'fga' 'fg.' 'x3p'
 'x3pa' 'x3p.' 'x2p' 'x2pa' 'x2p.' 'efg.' 'ft' 'fta' 'ft.' 'orb' 'drb'
 'trb' 'ast' 'stl' 'blk' 'tov' 'pf' 'pts' 'season' 'season_end']


In [31]:
# Reference player: the first row whose "player" value is LeBron James.
is_lebron = nba["player"] == "LeBron James"
selected_player = nba[is_lebron].iloc[0]

# Numeric stat columns that take part in the distance computation.
distance_columns = [
    'age', 'g', 'gs', 'mp',
    'fg', 'fga', 'fg.',
    'x3p', 'x3pa', 'x3p.',
    'x2p', 'x2pa', 'x2p.',
    'efg.',
    'ft', 'fta', 'ft.',
    'orb', 'drb', 'trb',
    'ast', 'stl', 'blk', 'tov', 'pf', 'pts',
]
import math
def euclidean_distance(row, reference=None, columns=None):
    """Return the Euclidean distance between ``row`` and a reference record.

    Args:
        row: Record indexable by column name (e.g. a DataFrame row passed in
            by ``DataFrame.apply(..., axis=1)``).
        reference: Record to measure against, indexable the same way.
            Defaults to the notebook-level ``selected_player``.
        columns: Names of the fields to compare. Defaults to the
            notebook-level ``distance_columns``.

    Returns:
        float: Square root of the sum of squared per-column differences.
        The result is NaN when any compared value is NaN.
    """
    # Resolve the defaults at call time so the one-argument form used with
    # ``nba.apply(euclidean_distance, axis=1)`` keeps working unchanged.
    if reference is None:
        reference = selected_player
    if columns is None:
        columns = distance_columns
    squared_total = sum((row[k] - reference[k]) ** 2 for k in columns)
    return math.sqrt(squared_total)

# Distance from every player (one row each) to the selected player, on the raw stats.
lebron_distance = nba.apply(lambda player_row: euclidean_distance(player_row), axis=1)
print(lebron_distance)

0      3475.792868
1              NaN
2              NaN
3      1189.554979
4      3216.773098
          ...     
476    1948.158130
477    1851.909840
478     949.668916
479    2699.963932
480    3075.753429
Length: 481, dtype: float64


In [32]:
# Using normalized data to reduce bias of columns with larger values:
# each stat column is rescaled to zero mean and unit standard deviation.
nba_numeric = nba[distance_columns]
nba_normalized = (nba_numeric - nba_numeric.mean()) / nba_numeric.std()

from scipy.spatial import distance
# Missing values become 0, which is the column mean after standardization.
# Plain assignment instead of inplace=True keeps the cell safe to re-run.
nba_normalized = nba_normalized.fillna(0)
lebron_normalized = nba_normalized[nba["player"] == "LeBron James"]
# The boolean filter yields a one-row DataFrame (2-D). scipy's distance
# functions are documented for 1-D vectors, so pass the row itself as a Series
# rather than relying on implicit squeezing of the 2-D input.
lebron_vector = lebron_normalized.iloc[0]
euclidean_distances = nba_normalized.apply(lambda row: distance.euclidean(row, lebron_vector), axis=1)
print(euclidean_distances)

0      13.131600
1      12.180993
2      11.850978
3       6.473960
4      12.182853
         ...    
476     8.083717
477     8.543626
478     6.254191
479    11.273059
480    11.901991
Length: 481, dtype: float64


In [33]:
import pandas as pd
# Pair each distance with its row label, ordered from nearest to farthest.
distance_frame = pd.DataFrame(
    data={"dist": euclidean_distances, "idx": euclidean_distances.index}
).sort_values("dist")

# Position 0 is LeBron himself (distance 0), so position 1 is the closest other player.
runner_up_row = distance_frame.iloc[1]
second_smallest = runner_up_row["idx"]
# The row lookup above returns the label as a float, hence the int() cast.
most_similar_to_lebron = nba.loc[int(second_smallest), "player"]
print(most_similar_to_lebron)

Carmelo Anthony


In [34]:
# Row labels of the four nearest entries (the first one is LeBron himself).
top_four = distance_frame["idx"].iloc[:4]
top4_to_lebron = nba.loc[top_four, "player"]
print(top4_to_lebron)

225       LeBron James
17     Carmelo Anthony
136        Monta Ellis
128       Goran Dragic
Name: player, dtype: object


In [35]:
# Replace missing numeric values with their column mean.
# numeric_only=True is required because the frame also holds text columns
# (player, pos, team, ...); without it, pandas >= 2.0 raises a TypeError
# when asked for the mean of those columns. Text columns are left untouched.
# Assignment instead of inplace=True keeps the cell safe to re-run.
nba = nba.fillna(nba.mean(numeric_only=True))

In [36]:
import random
from numpy.random import permutation

# Seeded generator so the shuffle, the split and the reported error are
# reproducible across kernel restarts.
from numpy.random import RandomState
SPLIT_SEED = 1

# Shuffle the row labels, then hold out the first third as the test set.
random_indices = RandomState(SPLIT_SEED).permutation(nba.index)
test_cutoff = len(nba) // 3
# Slice from position 0: starting at 1 silently dropped the first shuffled
# row from both the test and the training set.
test = nba.loc[random_indices[:test_cutoff]]
train = nba.loc[random_indices[test_cutoff:]]
assert len(test) + len(train) == len(nba), "train/test split lost or duplicated rows"

# Predictors: every stat column except the target.
x_columns = ['age', 'g', 'gs', 'mp', 'fg', 'fga', 'fg.', 'x3p', 'x3pa', 'x3p.', 'x2p', 'x2pa', 'x2p.', 'efg.', 'ft', 'fta', 'ft.', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf']
# Kept as a one-element list so train[y_column] is a single-column DataFrame.
y_column = ["pts"]

from sklearn.neighbors import KNeighborsRegressor
# Predict points from the five nearest training players.
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(train[x_columns], train[y_column])
predictions = knn.predict(test[x_columns])

In [37]:
# Mean squared error and its square root for the predictions on the held-out rows.
actual = test[y_column]
squared_errors = (predictions - actual) ** 2
mse = squared_errors.sum() / len(predictions)
rmse = mse ** 0.5
print(mse)
print(rmse)

pts    3891.866164
dtype: float64
pts    62.384823
dtype: float64
