In [2]:
import pandas as pd
import math
from scipy.spatial import distance

# Load the player statistics from nba_2013.csv (path is relative to the working directory)
nba = pd.read_csv("nba_2013.csv")

# Print the raw array of column names to see which fields are available
print(nba.columns.values)

['player' 'pos' 'age' 'bref_team_id' 'g' 'gs' 'mp' 'fg' 'fga' 'fg.' 'x3p'
 'x3pa' 'x3p.' 'x2p' 'x2pa' 'x2p.' 'efg.' 'ft' 'fta' 'ft.' 'orb' 'drb'
 'trb' 'ast' 'stl' 'blk' 'tov' 'pf' 'pts' 'season' 'season_end']


In [3]:
# Reference player: the first row whose "player" value is "LeBron James"
selected_player = nba.loc[nba["player"] == "LeBron James"].iloc[0]

# Numeric columns that take part in the distance computation
distance_columns = [
    'age', 'g', 'gs', 'mp',
    'fg', 'fga', 'fg.',
    'x3p', 'x3pa', 'x3p.',
    'x2p', 'x2pa', 'x2p.',
    'efg.',
    'ft', 'fta', 'ft.',
    'orb', 'drb', 'trb',
    'ast', 'stl', 'blk', 'tov', 'pf',
    'pts',
]

def euclidean_distance(row, reference=None, columns=None):
    """Return the Euclidean distance between ``row`` and a reference row.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series or a dict)
        Must support ``row[k]`` for every key in ``columns``.
    reference : mapping-like, optional
        The row to measure against. When omitted, the notebook-level
        ``selected_player`` is used, which keeps
        ``nba.apply(euclidean_distance, axis=1)`` working unchanged.
    columns : iterable of str, optional
        Keys to include in the distance. When omitted, the notebook-level
        ``distance_columns`` is used.

    Returns
    -------
    float
        Square root of the summed squared differences over ``columns``.
        The result is NaN if any compared value is NaN.
    """
    # Fall back to the notebook globals only when the caller passed nothing,
    # so the function can also be used (and tested) without them.
    if reference is None:
        reference = selected_player
    if columns is None:
        columns = distance_columns

    squared_sum = 0
    for k in columns:
        squared_sum += (row[k] - reference[k]) ** 2
    return math.sqrt(squared_sum)

# Call euclidean_distance once per row (axis=1) to get every player's distance to the selected player
lebron_distance = nba.apply(euclidean_distance, axis=1)


In [4]:
# Standardise every distance column (subtract the column mean, divide by the
# column standard deviation) so that large-valued columns do not dominate.
nba_numeric = nba[distance_columns]
nba_normalized = (nba_numeric - nba_numeric.mean()) / nba_numeric.std()

# Fill in the NA values in nba_normalized. After standardisation each column
# is centred on 0, so filling with 0 places missing entries at the column mean.
nba_normalized = nba_normalized.fillna(0)

# Find the normalized vector for LeBron James. `.iloc[0]` turns the one-row
# DataFrame into a 1-D Series: scipy's distance functions require 1-D vectors
# and recent SciPy releases raise "Input vector should be 1-D." otherwise.
lebron_normalized = nba_normalized[nba["player"] == "LeBron James"].iloc[0]

# Find the distance between LeBron James and every row of nba_normalized.
euclidean_distances = nba_normalized.apply(
    lambda row: distance.euclidean(row, lebron_normalized),
    axis=1,
)




In [5]:
# Pair every distance with its row label and order the rows by ascending distance
distance_frame = pd.DataFrame(
    {"dist": euclidean_distances, "idx": euclidean_distances.index}
).sort_values("dist")

# The row at position 1 is the second-smallest distance; its "idx" value is the
# label of that row in nba (cast to int because the row lookup yields a float).
second_smallest = distance_frame.iloc[1]["idx"]
most_similar_to_lebron = nba.loc[int(second_smallest), "player"]

In [6]:
# Show the first five rows of distance_frame
distance_frame.head(5)

Unnamed: 0,dist,idx
225,0.0,225
17,4.171854,17
136,4.206786,136
128,4.382582,128
185,4.489928,185


In [7]:
# Same five rows selected by position: iloc[0:5] takes positions 0-4, like head(5)
distance_frame.iloc[0:5]

Unnamed: 0,dist,idx
225,0.0,225
17,4.171854,17
136,4.206786,136
128,4.382582,128
185,4.489928,185


In [8]:
# Label-based slice: .loc includes both endpoints, so labels 15, 16 and 17 are returned
nba.loc[15:17]

Unnamed: 0,player,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,...,drb,trb,ast,stl,blk,tov,pf,pts,season,season_end
15,Ryan Anderson,PF,25,NOP,22,14,795,155,354,0.438,...,76,142,17,10,7,20,47,436,2013-2014,2013
16,Giannis Antetokounmpo,SF,19,MIL,77,23,1897,173,418,0.414,...,261,339,150,60,61,122,173,525,2013-2014,2013
17,Carmelo Anthony,PF,29,NYK,77,77,2982,743,1643,0.452,...,477,622,242,95,51,198,224,2112,2013-2014,2013


In [9]:
# Position-based slice: .iloc excludes the end, so only positions 15 and 16 are returned
nba.iloc[15:17]

Unnamed: 0,player,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,...,drb,trb,ast,stl,blk,tov,pf,pts,season,season_end
15,Ryan Anderson,PF,25,NOP,22,14,795,155,354,0.438,...,76,142,17,10,7,20,47,436,2013-2014,2013
16,Giannis Antetokounmpo,SF,19,MIL,77,23,1897,173,418,0.414,...,261,339,150,60,61,122,173,525,2013-2014,2013


In [32]:
import random
from numpy.random import permutation

# kNN cannot compute distances over missing values, so replace them with 0 first
nba = nba.fillna(0)
# Randomly shuffle the index of nba (unseeded, so the split differs between runs)
random_indices = permutation(nba.index)
# Set a cutoff for how many items we want in the test set (in this case 1/3 of the items)
test_cutoff = math.floor(len(nba)/3)
# Generate the test set by taking the first 1/3 of the randomly shuffled indices.
# The slice starts at position 0: starting at 1 would leave the first shuffled
# row out of both the test set and the train set.
test = nba.loc[random_indices[:test_cutoff]]
# Generate the train set with the rest of the data
train = nba.loc[random_indices[test_cutoff:]]

# Every row must land in exactly one of the two splits
assert len(test) + len(train) == len(nba)

In [33]:
# Shuffled index labels at positions 1 through 4 (the slice starts at 1, so position 0 is not shown)
random_indices[1:5]

array([191, 348,  62, 381])

In [40]:
# Plain [] slicing with integers selects rows by position, end exclusive: positions 0-3
nba[0:4]

Unnamed: 0,player,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,...,drb,trb,ast,stl,blk,tov,pf,pts,season,season_end
0,Quincy Acy,SF,23,TOT,63,0,847,66,141,0.468,...,144,216,28,23,26,30,122,171,2013-2014,2013
1,Steven Adams,C,20,OKC,81,20,1197,93,185,0.503,...,190,332,43,40,57,71,203,265,2013-2014,2013
2,Jeff Adrien,PF,27,TOT,53,12,961,143,275,0.52,...,204,306,38,24,36,39,108,362,2013-2014,2013
3,Arron Afflalo,SG,28,ORL,73,73,2552,464,1011,0.459,...,230,262,248,35,3,146,136,1330,2013-2014,2013


In [41]:
# .loc with a list selects the rows whose index labels are 0, 1, 2 and 3
nba.loc[[0,1,2,3]]

Unnamed: 0,player,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,...,drb,trb,ast,stl,blk,tov,pf,pts,season,season_end
0,Quincy Acy,SF,23,TOT,63,0,847,66,141,0.468,...,144,216,28,23,26,30,122,171,2013-2014,2013
1,Steven Adams,C,20,OKC,81,20,1197,93,185,0.503,...,190,332,43,40,57,71,203,265,2013-2014,2013
2,Jeff Adrien,PF,27,TOT,53,12,961,143,275,0.52,...,204,306,38,24,36,39,108,362,2013-2014,2013
3,Arron Afflalo,SG,28,ORL,73,73,2552,464,1011,0.459,...,230,262,248,35,3,146,136,1330,2013-2014,2013


In [42]:
# .iloc with a list selects the rows at positions 0, 1, 2 and 3, whatever their index labels are
nba.iloc[[0,1,2,3]]

Unnamed: 0,player,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,...,drb,trb,ast,stl,blk,tov,pf,pts,season,season_end
308,Quincy Acy,SF,23,TOT,63,0,847,66,141,0.468,...,144,216,28,23,26,30,122,171,2013-2014,2013
191,Steven Adams,C,20,OKC,81,20,1197,93,185,0.503,...,190,332,43,40,57,71,203,265,2013-2014,2013
348,Jeff Adrien,PF,27,TOT,53,12,961,143,275,0.52,...,204,306,38,24,36,39,108,362,2013-2014,2013
62,Arron Afflalo,SG,28,ORL,73,73,2552,464,1011,0.459,...,230,262,248,35,3,146,136,1330,2013-2014,2013


In [44]:
# Index the shuffled array with a list of positions: its first four values
random_indices[[0,1,2,3]]

array([308, 191, 348,  62])

In [46]:
# Use the first four shuffled values as index labels (.loc is label-based)
nba.loc[random_indices[[0,1,2,3]]]

Unnamed: 0,player,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,...,drb,trb,ast,stl,blk,tov,pf,pts,season,season_end
308,Tony Mitchell,PF,21,DET,21,0,79,5,12,0.417,...,11,26,2,6,3,4,9,22,2013-2014,2013
191,Manny Harris,SG,24,LAL,9,0,180,28,70,0.4,...,27,34,11,4,1,9,10,73,2013-2014,2013
348,Nikola Pekovic,C,28,MIN,54,54,1663,379,701,0.541,...,262,468,50,30,23,84,129,944,2013-2014,2013
62,Corey Brewer,SF,27,MIN,81,81,2609,388,807,0.481,...,144,207,135,151,30,105,210,998,2013-2014,2013


In [47]:
# Use the same four values as row positions instead (.iloc is position-based);
# this matches the .loc lookup only while index labels coincide with positions
nba.iloc[random_indices[[0,1,2,3]]]

Unnamed: 0,player,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,...,drb,trb,ast,stl,blk,tov,pf,pts,season,season_end
380,Tony Mitchell,PF,21,DET,21,0,79,5,12,0.417,...,11,26,2,6,3,4,9,22,2013-2014,2013
326,Manny Harris,SG,24,LAL,9,0,180,28,70,0.4,...,27,34,11,4,1,9,10,73,2013-2014,2013
234,Nikola Pekovic,C,28,MIN,54,54,1663,379,701,0.541,...,262,468,50,30,23,84,129,944,2013-2014,2013
10,Corey Brewer,SF,27,MIN,81,81,2609,388,807,0.481,...,144,207,135,151,30,105,210,998,2013-2014,2013


In [37]:
from sklearn.neighbors import KNeighborsRegressor

# Predictor columns fed to the model
x_columns = [
    'age', 'g', 'gs', 'mp',
    'fg', 'fga', 'fg.',
    'x3p', 'x3pa', 'x3p.',
    'x2p', 'x2pa', 'x2p.',
    'efg.',
    'ft', 'fta', 'ft.',
    'orb', 'drb', 'trb',
    'ast', 'stl', 'blk', 'tov', 'pf',
]
# Target column, kept as a one-element list so train[y_column] is a DataFrame
y_column = ["pts"]

# Build a 5-nearest-neighbour regressor and fit it on the training rows
# (fit returns the estimator itself, so the calls can be chained)
knn = KNeighborsRegressor(n_neighbors=5).fit(train[x_columns], train[y_column])
# Predict the target for the held-out test rows with the fitted model
predictions = knn.predict(test[x_columns])

In [39]:
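# Note: kNN cannot compute distances when a feature value is NA, which is why
# missing values are filled with nba.fillna(0) before the train/test split.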