In [None]:
"""
Problem Set:
In this assignment, students will be using the K-nearest neighbors algorithm to predict
how many points NBA players scored in the 2013-2014 season.

A look at the data

Before we dive into the algorithm, let’s take a look at our data. Each row in the data
contains information on how a player performed in the 2013-2014 NBA season.

Download 'nba_2013.csv' file from this link:
https://www.dropbox.com/s/b3nv38jjo5dxcl6/nba_2013.csv?dl=0

Here are some selected columns from the data:

player - name of the player
pos - the position of the player
g - number of games the player was in
gs - number of games the player started
pts - total points the player scored

There are many more columns in the data, mostly containing information about average
player game performance over the course of the season. See this site for an explanation
of the rest of them.

We can read our dataset in and figure out which columns are present:

import pandas
with open("nba_2013.csv", 'r') as csvfile:
    nba = pandas.read_csv(csvfile)


"""

In [1]:
import numpy as np
import pandas as pd
# Load the 2013-2014 NBA season statistics into a DataFrame.
# read_csv accepts a path and opens/closes the file itself, so no explicit
# file handle is needed; it also decodes as UTF-8 by default rather than
# with the platform's locale encoding.
# Expects 'nba_2013.csv' to be in the notebook's working directory.
nba = pd.read_csv('nba_2013.csv')

In [2]:
# Preview the first five rows of the freshly loaded table.
nba.head(n=5)

Unnamed: 0,player,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,...,drb,trb,ast,stl,blk,tov,pf,pts,season,season_end
0,Quincy Acy,SF,23,TOT,63,0,847,66,141,0.468,...,144,216,28,23,26,30,122,171,2013-2014,2013
1,Steven Adams,C,20,OKC,81,20,1197,93,185,0.503,...,190,332,43,40,57,71,203,265,2013-2014,2013
2,Jeff Adrien,PF,27,TOT,53,12,961,143,275,0.52,...,204,306,38,24,36,39,108,362,2013-2014,2013
3,Arron Afflalo,SG,28,ORL,73,73,2552,464,1011,0.459,...,230,262,248,35,3,146,136,1330,2013-2014,2013
4,Alexis Ajinca,C,25,NOP,56,30,951,136,249,0.546,...,183,277,40,23,46,63,187,328,2013-2014,2013


In [4]:
# Dimensions of the table as (rows, columns).
nba.shape

(481, 31)

In [21]:
# Column labels of the table as a NumPy array.
nba.columns.values

array(['player', 'pos', 'age', 'bref_team_id', 'g', 'gs', 'mp', 'fg',
       'fga', 'fg.', 'x3p', 'x3pa', 'x3p.', 'x2p', 'x2pa', 'x2p.', 'efg.',
       'ft', 'fta', 'ft.', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk',
       'tov', 'pf', 'pts', 'season', 'season_end'], dtype=object)

In [3]:
# Count the missing values in each column (summing the null mask down the rows).
nba.isnull().sum(axis=0)

player           0
pos              0
age              0
bref_team_id     0
g                0
gs               0
mp               0
fg               0
fga              0
fg.              2
x3p              0
x3pa             0
x3p.            67
x2p              0
x2pa             0
x2p.             3
efg.             2
ft               0
fta              0
ft.             20
orb              0
drb              0
trb              0
ast              0
stl              0
blk              0
tov              0
pf               0
pts              0
season           0
season_end       0
dtype: int64

In [23]:
# Fill every missing value with 0. Per the null counts, only the percentage
# columns (fg., x3p., x2p., efg., ft.) have gaps.
# NOTE(review): presumably these are players with zero attempts - confirm
# that 0 is the intended stand-in for an undefined percentage.
nba = nba.fillna(value=0)

In [24]:
# Re-count missing values per column; every count should now be 0.
nba.isnull().sum(axis=0)

player          0
pos             0
age             0
bref_team_id    0
g               0
gs              0
mp              0
fg              0
fga             0
fg.             0
x3p             0
x3pa            0
x3p.            0
x2p             0
x2pa            0
x2p.            0
efg.            0
ft              0
fta             0
ft.             0
orb             0
drb             0
trb             0
ast             0
stl             0
blk             0
tov             0
pf              0
pts             0
season          0
season_end      0
dtype: int64

In [25]:
# Coerce text columns that actually hold numbers to a numeric dtype, then
# drop any row that still has a missing value.
#
# DataFrame.convert_objects was deprecated and then removed in pandas 1.0,
# so the conversion is done with pd.to_numeric instead.
def _to_numeric_if_possible(column):
    """Return `column` converted to numbers when it contains any.

    Parameters
    ----------
    column : pandas.Series
        A single column of the table.

    Returns
    -------
    pandas.Series
        The column unchanged if it is already numeric or holds no parseable
        numbers; otherwise the numeric version, with unparseable entries
        replaced by NaN.
    """
    if pd.api.types.is_numeric_dtype(column):
        return column
    converted = pd.to_numeric(column, errors='coerce')
    # Pure-text columns (player, pos, bref_team_id, season) are left alone:
    # coercing them would turn every entry into NaN and the dropna() below
    # would then discard the whole table.
    return column if converted.isnull().all() else converted


nba = nba.apply(_to_numeric_if_possible).dropna()

For all other conversions use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  


In [27]:
# Show the column labels that remain after cleaning.
column_labels = nba.columns.values
print('nba.columns.values:', column_labels)

nba.columns.values: ['player' 'pos' 'age' 'bref_team_id' 'g' 'gs' 'mp' 'fg' 'fga' 'fg.' 'x3p'
 'x3pa' 'x3p.' 'x2p' 'x2pa' 'x2p.' 'efg.' 'ft' 'fta' 'ft.' 'orb' 'drb'
 'trb' 'ast' 'stl' 'blk' 'tov' 'pf' 'pts' 'season' 'season_end']


In [29]:
# Preview the first five rows of the cleaned table.
nba.head(n=5)

Unnamed: 0,player,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,...,drb,trb,ast,stl,blk,tov,pf,pts,season,season_end
0,Quincy Acy,SF,23,TOT,63,0,847,66,141,0.468,...,144,216,28,23,26,30,122,171,2013-2014,2013
1,Steven Adams,C,20,OKC,81,20,1197,93,185,0.503,...,190,332,43,40,57,71,203,265,2013-2014,2013
2,Jeff Adrien,PF,27,TOT,53,12,961,143,275,0.52,...,204,306,38,24,36,39,108,362,2013-2014,2013
3,Arron Afflalo,SG,28,ORL,73,73,2552,464,1011,0.459,...,230,262,248,35,3,146,136,1330,2013-2014,2013
4,Alexis Ajinca,C,25,NOP,56,30,951,136,249,0.546,...,183,277,40,23,46,63,187,328,2013-2014,2013


In [30]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import accuracy_score

In [40]:
# Predictors: age plus the 24 counting/percentage statistics from 'g' through
# 'pf'. Identifier columns, season labels and the target are left out.
feature_columns = [
    'age', 'g', 'gs', 'mp',
    'fg', 'fga', 'fg.',
    'x3p', 'x3pa', 'x3p.',
    'x2p', 'x2pa', 'x2p.',
    'efg.',
    'ft', 'fta', 'ft.',
    'orb', 'drb', 'trb',
    'ast', 'stl', 'blk', 'tov', 'pf',
]
X = nba[feature_columns]

# Target: total points each player scored over the season.
y = nba['pts']

print(X.head())


   age   g  gs    mp   fg   fga    fg.  x3p  x3pa      x3p. ...   fta    ft.  \
0   23  63   0   847   66   141  0.468    4    15  0.266667 ...    53  0.660   
1   20  81  20  1197   93   185  0.503    0     0  0.000000 ...   136  0.581   
2   27  53  12   961  143   275  0.520    0     0  0.000000 ...   119  0.639   
3   28  73  73  2552  464  1011  0.459  128   300  0.426667 ...   336  0.815   
4   25  56  30   951  136   249  0.546    0     1  0.000000 ...    67  0.836   

   orb  drb  trb  ast  stl  blk  tov   pf  
0   72  144  216   28   23   26   30  122  
1  142  190  332   43   40   57   71  203  
2  102  204  306   38   24   36   39  108  
3   32  230  262  248   35    3  146  136  
4   94  183  277   40   23   46   63  187  

[5 rows x 25 columns]


In [41]:
# Hold out 20% of the players for evaluation; the fixed random_state keeps
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=46,
)

In [61]:
from sklearn.neighbors import KNeighborsRegressor

# Fit a 7-nearest-neighbours regressor on the training players, then predict
# season points for the held-out players. fit() returns the estimator itself,
# so `knn` is the fitted model.
# NOTE(review): the features are not rescaled before fitting, so
# large-magnitude columns such as 'mp' weigh far more in the neighbour
# distance than the percentage columns - confirm whether that is intended.
knn = KNeighborsRegressor(n_neighbors=7).fit(X_train, y_train)
y_predict = knn.predict(X_test)

# Peek at the first five predictions.
print(y_predict[:5])

[ 349.28571429 1003.         1020.71428571  555.28571429  979.42857143]
