In [53]:
import pandas as pd
import math
from scipy.spatial import distance

# Read the player statistics CSV (path is relative to the notebook's working directory).
nba = pd.read_csv(filepath_or_buffer="nba_2013.csv")

# List the column names that are available for the analysis.
print(nba.columns.values)

['player' 'pos' 'age' 'bref_team_id' 'g' 'gs' 'mp' 'fg' 'fga' 'fg.' 'x3p'
 'x3pa' 'x3p.' 'x2p' 'x2pa' 'x2p.' 'efg.' 'ft' 'fta' 'ft.' 'orb' 'drb'
 'trb' 'ast' 'stl' 'blk' 'tov' 'pf' 'pts' 'season' 'season_end']


In [54]:
# Reference player: the first row whose "player" value is LeBron James.
selected_player = nba.loc[nba["player"] == "LeBron James"].iloc[0]

# Numeric columns that take part in the distance calculation.
distance_columns = [
    'age', 'g', 'gs', 'mp', 'fg', 'fga', 'fg.',
    'x3p', 'x3pa', 'x3p.', 'x2p', 'x2pa', 'x2p.', 'efg.',
    'ft', 'fta', 'ft.', 'orb', 'drb', 'trb',
    'ast', 'stl', 'blk', 'tov', 'pf', 'pts',
]

def euclidean_distance(row, target=None, columns=None):
    """Return the Euclidean distance between ``row`` and ``target``.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series or dict)
        The record to measure; must support ``row[column]`` for every
        column in ``columns``.
    target : mapping-like, optional
        The record to measure against. Defaults to the module-level
        ``selected_player`` so the function can still be passed directly
        to ``DataFrame.apply``.
    columns : iterable of column names, optional
        Columns that take part in the distance. Defaults to the
        module-level ``distance_columns``.

    Returns
    -------
    float
        Square root of the summed squared differences. A NaN in any of the
        compared values propagates to the result.
    """
    # Resolve the defaults at call time so the globals only need to exist
    # when the caller does not pass explicit values.
    if target is None:
        target = selected_player
    if columns is None:
        columns = distance_columns

    squared_total = sum((row[k] - target[k]) ** 2 for k in columns)
    return math.sqrt(squared_total)

# Apply the distance function row by row: one distance per player.
lebron_distance = nba.apply(euclidean_distance, axis="columns")


In [55]:
from scipy.spatial import distance

# Standardize every distance column (subtract the column mean, divide by the
# column standard deviation) so that large-valued stats do not dominate.
nba_numeric = nba[distance_columns]
nba_normalized = (nba_numeric - nba_numeric.mean()) / nba_numeric.std()

# Distances cannot be computed on missing values; replace them with 0.
nba_normalized = nba_normalized.fillna(0)

# Normalized stats for LeBron James as a 1-D Series. The boolean mask alone
# yields a one-row DataFrame (2-D), which scipy's euclidean() does not accept
# as a vector, so take the first matching row explicitly.
lebron_normalized = nba_normalized[nba["player"] == "LeBron James"].iloc[0]

# Distance between LeBron James and every player (including himself).
euclidean_distances = nba_normalized.apply(
    lambda row: distance.euclidean(row, lebron_normalized), axis=1
)




In [56]:
# Pair every distance with its row label and order from nearest to farthest.
distance_frame = pd.DataFrame(
    data={"dist": euclidean_distances, "idx": euclidean_distances.index}
).sort_values("dist")

# The second entry is taken as the closest other player; this assumes the
# first entry is the reference player at distance 0.
second_smallest = distance_frame.iloc[1]["idx"]
most_similar_to_lebron = nba.loc[int(second_smallest), "player"]

In [57]:
# Preview the first five rows of distance_frame (the five smallest distances
# once the frame has been sorted by "dist").
distance_frame.head(5)

Unnamed: 0,dist,idx
225,0.0,225
17,4.171854,17
136,4.206786,136
128,4.382582,128
185,4.489928,185


In [58]:
# Position-based slice of the first five rows; selects the same rows as head(5).
distance_frame.iloc[0:5]

Unnamed: 0,dist,idx
225,0.0,225
17,4.171854,17
136,4.206786,136
128,4.382582,128
185,4.489928,185


In [96]:
import random
from numpy.random import permutation

# Replace missing values with 0. This mutates nba in place on purpose: the
# train/test frames below are sliced from the filled table.
nba.fillna(0, inplace=True)

# Randomly shuffle the index labels of nba.
random_indices = permutation(nba.index)

# Number of rows in the test set: one third of the data, rounded down.
test_cutoff = len(nba) // 3

# Test set: the first third of the shuffled labels. The slice starts at 0 so
# that every row ends up in exactly one of the two sets.
test = nba.loc[random_indices[:test_cutoff]]
# Train set: all remaining shuffled labels.
train = nba.loc[random_indices[test_cutoff:]]

In [116]:
# Wrap the shuffled labels in a Series aligned to nba's own index so they can
# be placed next to the original rows.
random_indices_col = pd.Series(data=random_indices, index=nba.index)

# Side-by-side view: two identifying columns plus the shuffled label per row.
learnloc = pd.concat(
    [nba.loc[:, ["player", "fga"]], random_indices_col],
    axis=1,
)

In [117]:
# Store the shuffled labels as a column of nba; values are assigned by
# position, so row i receives random_indices[i]. Note this mutates nba.
nba["random_indices_col"] =  random_indices 

In [118]:
# First row of learnloc (an integer slice inside [] selects rows, not columns).
learnloc[0:1]

Unnamed: 0,player,fga,0
0,Quincy Acy,141,111


In [128]:
# First row of nba via an integer slice inside [] (selects rows, not columns).
nba[0:1]

Unnamed: 0,player,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,...,trb,ast,stl,blk,tov,pf,pts,season,season_end,random_indices_col
0,Quincy Acy,SF,23,TOT,63,0,847,66,141,0.468,...,216,28,23,26,30,122,171,2013-2014,2013,111


In [107]:
# First entry of the shuffled label array.
random_indices[0]

111

In [129]:
# Label-based lookup: the row whose index label is 0. Passing a list keeps
# the result a one-row DataFrame instead of a Series.
nba.loc[[0]]

Unnamed: 0,player,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,...,trb,ast,stl,blk,tov,pf,pts,season,season_end,random_indices_col
0,Quincy Acy,SF,23,TOT,63,0,847,66,141,0.468,...,216,28,23,26,30,122,171,2013-2014,2013,111


In [130]:
# Position-based lookup: the first row, whatever its index label is.
nba.iloc[[0]]

Unnamed: 0,player,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,...,trb,ast,stl,blk,tov,pf,pts,season,season_end,random_indices_col
111,Quincy Acy,SF,23,TOT,63,0,847,66,141,0.468,...,216,28,23,26,30,122,171,2013-2014,2013,111


In [131]:
# Label-based lookup using the first shuffled value as an index label.
nba.loc[random_indices[[0]]]

Unnamed: 0,player,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,...,trb,ast,stl,blk,tov,pf,pts,season,season_end,random_indices_col
111,Samuel Dalembert,C,32,DAL,80,68,1614,214,377,0.568,...,541,38,41,94,90,210,529,2013-2014,2013,342


In [132]:
# Position-based lookup using the first shuffled value as a row position.
# NOTE(review): this matches the .loc lookup only while index labels equal
# row positions (presumably the default integer index) — confirm.
nba.iloc[random_indices[[0]]]

Unnamed: 0,player,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,...,trb,ast,stl,blk,tov,pf,pts,season,season_end,random_indices_col
342,Samuel Dalembert,C,32,DAL,80,68,1614,214,377,0.568,...,541,38,41,94,90,210,529,2013-2014,2013,342


In [92]:
# The columns that we'll be using to make predictions
from sklearn.neighbors import KNeighborsRegressor

# Predictor columns used as model inputs.
x_columns = [
    'age', 'g', 'gs', 'mp', 'fg', 'fga', 'fg.',
    'x3p', 'x3pa', 'x3p.', 'x2p', 'x2pa', 'x2p.', 'efg.',
    'ft', 'fta', 'ft.', 'orb', 'drb', 'trb',
    'ast', 'stl', 'blk', 'tov', 'pf',
]
# Target column; a one-element list, so train[y_column] is a one-column frame.
y_column = ["pts"]

# k-nearest-neighbours regressor with k = 5, fitted on the training split.
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(train[x_columns], train[y_column])

# Predictions for the rows of the test split.
predictions = knn.predict(test[x_columns])

In [76]:
# Note: KNN cannot calculate distances on rows that contain NA values, so NAs must be filled (or dropped) before fitting.