# Linear Regression on NBA dataset

This notebook uses linear regression to predict an NBA player's weight from their height, using a dataset of NBA players. The model is fitted first with scikit-learn's `LinearRegression`, then with the class defined in `src/linear_regression.py`, and the two results are compared.


In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression as LinearRegressionSklearn
import sys
import time
sys.path.insert(0, '../src')
from linear_regression import LinearRegression as LinearRegressionNumpy


# Load the NBA player table from the CSV file shipped with the repository.
# NOTE(review): the frame displayed in the next cell includes an "Unnamed: 0"
# column, which suggests the CSV stores a saved index; passing index_col=0
# would likely drop it -- confirm before changing.
nba_df = pd.read_csv('../data/csv/nba.csv')


In [44]:
# Preview the loaded table; the bare expression is rendered as the cell output.
nba_df

Unnamed: 0,#,Player,Pos,HT,WT,Age,Current Team,YOS,Pre-Draft Team,Draft Status,Nationality
0,5,Precious Achiuwa,SF,203.2,110.2,25,New York Knicks,4,Memphis,2020 Rnd 1 Pick 20,Nigeria
1,12,Steven Adams,C,210.8,120.2,31,Houston Rockets,11,Pittsburgh,2013 Rnd 1 Pick 12,New Zealand
2,13,Bam Adebayo,C,205.7,115.7,27,Miami Heat,7,Kentucky,2017 Rnd 1 Pick 14,United States
3,30,Ochai Agbaji,SF,195.6,97.5,24,Toronto Raptors,2,Kansas,2022 Rnd 1 Pick 14,United States
4,7,Santi Aldama,C,213.4,97.5,24,Memphis Grizzlies,3,Loyola (MD),2021 Rnd 1 Pick 30,Spain
...,...,...,...,...,...,...,...,...,...,...,...
521,28,Guerschon Yabusele,PF,203.2,120.2,29,Philadelphia Sixers,2,Rouen Metropole Basket (France),2016 Rnd 1 Pick 16,France
522,11,Jahmir Young,PG,185.4,83.9,24,Chicago Bulls,0,Maryland,"2024 NBA Draft, Undrafted",United States
523,11,Trae Young,PG,185.4,74.4,26,Atlanta Hawks,6,Oklahoma,2018 Rnd 1 Pick 5,United States
524,40,Cody Zeller,F,210.8,108.9,32,Houston Rockets,11,Indiana,2013 Rnd 1 Pick 4,United States


## Fit the model on a random sample of players (`NB_PLAYERS_TRAIN` = 500) to compare values

In [37]:
# Number of players drawn at random to fit both models.
NB_PLAYERS_TRAIN = 500

# A fixed random_state keeps the drawn subset identical from run to run.
random_players = nba_df.sample(n=NB_PLAYERS_TRAIN, random_state=42)

# Column vectors of shape (n, 1), each divided by 100
# (presumably cm -> m for HT; the resulting unit for WT should be confirmed).
heights_train = random_players['HT'].to_numpy().reshape(-1, 1) / 100
weights_train = random_players['WT'].to_numpy().reshape(-1, 1) / 100

print(heights_train, weights_train)

[[1.93 ]
 [2.032]
 [1.956]
 [2.007]
 [2.108]
 [1.88 ]
 [2.134]
 [1.956]
 [1.956]
 [2.083]
 [2.057]
 [1.93 ]
 [1.981]
 [2.007]
 [2.083]
 [2.108]
 [2.057]
 [2.057]
 [1.854]
 [1.93 ]
 [1.93 ]
 [1.981]
 [2.134]
 [2.057]
 [1.88 ]
 [2.134]
 [2.134]
 [1.956]
 [1.905]
 [1.981]
 [2.032]
 [2.007]
 [1.905]
 [1.956]
 [1.93 ]
 [2.083]
 [2.057]
 [2.083]
 [1.829]
 [1.981]
 [1.981]
 [2.032]
 [2.159]
 [2.134]
 [2.032]
 [1.93 ]
 [1.854]
 [2.032]
 [2.057]
 [1.981]
 [1.854]
 [1.905]
 [2.108]
 [2.134]
 [1.956]
 [1.956]
 [1.93 ]
 [1.93 ]
 [1.93 ]
 [2.007]
 [1.854]
 [1.981]
 [1.88 ]
 [1.981]
 [1.93 ]
 [1.905]
 [2.057]
 [2.032]
 [2.083]
 [1.981]
 [2.007]
 [2.007]
 [1.905]
 [2.007]
 [2.007]
 [1.981]
 [2.032]
 [2.108]
 [1.88 ]
 [2.032]
 [2.083]
 [1.956]
 [1.905]
 [1.981]
 [2.032]
 [1.829]
 [1.981]
 [1.93 ]
 [2.057]
 [2.134]
 [2.032]
 [1.956]
 [1.88 ]
 [1.981]
 [2.108]
 [1.93 ]
 [2.032]
 [2.184]
 [2.108]
 [1.956]
 [2.007]
 [2.007]
 [1.88 ]
 [1.88 ]
 [2.032]
 [1.905]
 [2.134]
 [2.083]
 [1.981]
 [1.956]
 [1.981]
 

### With sklearn

In [38]:
# Reference fit with scikit-learn; fit() returns the estimator itself,
# so construction and fitting can be chained.
lr_sklearn = LinearRegressionSklearn(fit_intercept=True).fit(
    heights_train, weights_train
)

# Keep the fitted parameters and the score on the training data for the
# comparison cells further down.
coef_sklearn = lr_sklearn.coef_
intercept_sklearn = lr_sklearn.intercept_
score_sklearn = lr_sklearn.score(heights_train, weights_train)

### With numpy

In [39]:
# Same fit using the project's own implementation from src/linear_regression.py.
lr_numpy = LinearRegressionNumpy(fit_intercept=True)
lr_numpy.fit(heights_train, weights_train)

# `weights` is unpacked into exactly two values: the first is treated as the
# slope and the second as the intercept.
(coef_numpy, intercept_numpy) = lr_numpy.weights

# Score on the same data that was used for fitting.
score_numpy = lr_numpy.r2_score(heights_train, weights_train)

### Compare differences

In [40]:
# Slope from the project implementation next to the slope from scikit-learn.
print(coef_numpy, coef_sklearn)

[0.91241609] [[0.91241609]]


In [41]:
# Intercept from the project implementation next to the one from scikit-learn.
print(intercept_numpy,intercept_sklearn)

[-0.84542756] [-0.84542756]


In [42]:
# Training-set score from the project implementation next to scikit-learn's.
print(score_numpy,score_sklearn)

0.4898287811742944 0.4898287811742944
