In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

pd.set_option('display.float_format', '{:.2f}'.format)

from sklearn.model_selection import train_test_split as tts   # el alias es cosa
from sklearn.ensemble import RandomForestRegressor as rfr
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.preprocessing import StandardScaler
pd.options.mode.copy_on_write = True

In [3]:
df = pd.read_csv("../NBA_csv/StatsAlltrain2.csv")

df['PER'] = (df['PTS'] + (df['AST'] + df['TRB'] + df['STL'] + df['BLK'] - df['TOV']) / df['G']) / (df['MP'] / 48)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2237 entries, 0 to 2236
Data columns (total 31 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Player   2237 non-null   object 
 1   Salario  2237 non-null   int64  
 2   Pos      2237 non-null   object 
 3   Age      2237 non-null   int64  
 4   Tm       2237 non-null   object 
 5   G        2237 non-null   int64  
 6   GS       2237 non-null   int64  
 7   MP       2237 non-null   int64  
 8   FG       2237 non-null   int64  
 9   FGA      2237 non-null   int64  
 10  FG%      2237 non-null   float64
 11  3P       2237 non-null   int64  
 12  3PA      2237 non-null   int64  
 13  3P%      2237 non-null   float64
 14  2P       2237 non-null   int64  
 15  2PA      2237 non-null   int64  
 16  2P%      2237 non-null   float64
 17  eFG%     2237 non-null   float64
 18  FT       2237 non-null   int64  
 19  FTA      2237 non-null   int64  
 20  FT%      2237 non-null   float64
 21  ORB      2237 

In [5]:
stats = df.drop(columns = ["Tm","FG","FGA","FG%","3PA","3P","3P%","2P","2PA","2P%","eFG%","FT","FTA","FT%","ORB","DRB","BLK","PF"])
stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2237 entries, 0 to 2236
Data columns (total 13 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Player   2237 non-null   object 
 1   Salario  2237 non-null   int64  
 2   Pos      2237 non-null   object 
 3   Age      2237 non-null   int64  
 4   G        2237 non-null   int64  
 5   GS       2237 non-null   int64  
 6   MP       2237 non-null   int64  
 7   TRB      2237 non-null   int64  
 8   AST      2237 non-null   int64  
 9   STL      2237 non-null   int64  
 10  TOV      2237 non-null   int64  
 11  PTS      2237 non-null   int64  
 12  PER      2237 non-null   float64
dtypes: float64(1), int64(10), object(2)
memory usage: 227.3+ KB


In [6]:
stats = stats.sort_values(by=["Player", "Age"], ascending = True)

In [7]:
stats['Temporada'] = stats.groupby('Player').cumcount() + 1

stats.sort_values(by=["Player", "Age"], ascending = True)

Unnamed: 0,Player,Salario,Pos,Age,G,GS,MP,TRB,AST,STL,TOV,PTS,PER,Temporada
1965,A.J. Green,1801769,ES,24,42,0,401,45,26,3,7,182,21.98,1
1738,AJ Griffin,3536160,AL,19,72,12,1401,153,73,42,42,639,22.01,1
272,Aaron Gordon,19863636,AP,24,62,62,2017,475,228,51,100,894,21.54,1
676,Aaron Gordon,18136364,AP,25,50,50,1384,284,161,33,97,618,21.72,2
913,Aaron Gordon,16409091,AP,26,75,75,2376,439,188,44,133,1126,22.9,3
1612,Aaron Gordon,19690909,AP,27,68,68,2055,446,203,54,98,1109,26.13,4
1961,Aaron Gordon,21266182,AP,28,58,58,1822,375,188,50,85,800,21.33,5
310,Aaron Holiday,2239200,BA,23,66,33,1617,156,225,55,88,627,18.78,1
765,Aaron Holiday,2345640,BA,24,66,8,1176,89,123,46,66,475,19.51,2
914,Aaron Holiday,3980551,BA,25,63,15,1021,122,153,42,67,400,19.0,3


In [8]:
stats.shape

(2237, 14)

In [9]:
stats = stats[stats["Temporada"] <=5]

In [10]:
stats_t = stats.pivot_table(index='Player', columns='Temporada', values=['Salario', 'Pos', 'Age', 'G', 'GS', 'MP', 'TRB', 'AST', 'STL', 'TOV', 'PTS', 'PER'], aggfunc='first')

stats_t.columns = ['{}_{}'.format(col[0], col[1]) for col in stats_t.columns]

In [11]:
stats_t.reset_index(inplace=True)

In [12]:
stats_t

Unnamed: 0,Player,AST_1,AST_2,AST_3,AST_4,AST_5,Age_1,Age_2,Age_3,Age_4,Age_5,G_1,G_2,G_3,G_4,G_5,GS_1,GS_2,GS_3,GS_4,GS_5,MP_1,MP_2,MP_3,MP_4,MP_5,PER_1,PER_2,PER_3,PER_4,PER_5,PTS_1,PTS_2,PTS_3,PTS_4,PTS_5,Pos_1,Pos_2,Pos_3,Pos_4,Pos_5,STL_1,STL_2,STL_3,STL_4,STL_5,Salario_1,Salario_2,Salario_3,Salario_4,Salario_5,TOV_1,TOV_2,TOV_3,TOV_4,TOV_5,TRB_1,TRB_2,TRB_3,TRB_4,TRB_5
0,A.J. Green,26.0,,,,,24.0,,,,,42.0,,,,,0.0,,,,,401.0,,,,,21.98,,,,,182.0,,,,,ES,,,,,3.0,,,,,1801769.0,,,,,7.0,,,,,45.0,,,,
1,AJ Griffin,73.0,,,,,19.0,,,,,72.0,,,,,12.0,,,,,1401.0,,,,,22.01,,,,,639.0,,,,,AL,,,,,42.0,,,,,3536160.0,,,,,42.0,,,,,153.0,,,,
2,Aaron Gordon,228.0,161.0,188.0,203.0,188.0,24.0,25.0,26.0,27.0,28.0,62.0,50.0,75.0,68.0,58.0,62.0,50.0,75.0,68.0,58.0,2017.0,1384.0,2376.0,2055.0,1822.0,21.54,21.72,22.9,26.13,21.33,894.0,618.0,1126.0,1109.0,800.0,AP,AP,AP,AP,AP,51.0,33.0,44.0,54.0,50.0,19863636.0,18136364.0,16409091.0,19690909.0,21266182.0,100.0,97.0,133.0,98.0,85.0,475.0,284.0,439.0,446.0,375.0
3,Aaron Holiday,225.0,123.0,153.0,89.0,108.0,23.0,24.0,25.0,26.0,27.0,66.0,66.0,63.0,63.0,60.0,33.0,8.0,15.0,6.0,1.0,1617.0,1176.0,1021.0,845.0,1001.0,18.78,19.51,19.0,14.19,19.44,627.0,475.0,400.0,247.0,402.0,BA,BA,BA,BA,BA,55.0,46.0,42.0,37.0,32.0,2239200.0,2345640.0,3980551.0,1836090.0,2346614.0,88.0,66.0,67.0,36.0,45.0,156.0,89.0,122.0,74.0,100.0
4,Aaron Nesmith,23.0,22.0,98.0,80.0,,21.0,22.0,23.0,24.0,,46.0,52.0,73.0,55.0,,1.0,3.0,60.0,30.0,,669.0,574.0,1816.0,1481.0,,15.88,16.56,19.65,21.94,,218.0,196.0,738.0,671.0,,AL,AL,AL,AL,,15.0,20.0,55.0,55.0,,3458400.0,3631200.0,3804360.0,5634257.0,,23.0,31.0,75.0,52.0,,127.0,89.0,277.0,207.0,
5,Aaron Wiggins,68.0,80.0,52.0,,,23.0,24.0,25.0,,,50.0,70.0,60.0,,,35.0,14.0,0.0,,,1209.0,1297.0,878.0,,,16.7,17.88,20.58,,,416.0,479.0,373.0,,,ES,ES,ES,,,30.0,39.0,34.0,,,1000000.0,1563518.0,1836096.0,,,54.0,55.0,36.0,,,178.0,210.0,141.0,,
6,Abdel Nader,38.0,19.0,7.0,,,26.0,27.0,28.0,,,55.0,24.0,14.0,,,6.0,0.0,0.0,,,867.0,355.0,145.0,,,19.24,22.09,12.08,,,345.0,160.0,34.0,,,AL,AL,AL,,,23.0,10.0,8.0,,,1618520.0,1752950.0,2000000.0,,,43.0,19.0,11.0,,,100.0,62.0,27.0,,
7,Admiral Schofield,15.0,25.0,,,,22.0,24.0,,,,33.0,38.0,,,,2.0,1.0,,,,368.0,469.0,,,,13.18,15.22,,,,99.0,146.0,,,,AP,AL,,,,8.0,4.0,,,,1000000.0,300000.0,,,,7.0,21.0,,,,47.0,89.0,,,
8,Al Horford,270.0,94.0,232.0,189.0,140.0,33.0,34.0,35.0,36.0,37.0,67.0,28.0,69.0,63.0,53.0,61.0,28.0,69.0,63.0,24.0,2025.0,782.0,2005.0,1922.0,1413.0,19.18,25.1,17.07,15.63,14.41,798.0,398.0,701.0,616.0,414.0,P,P,P,P,P,52.0,25.0,49.0,30.0,29.0,28000000.0,27500000.0,27000000.0,26500000.0,10000000.0,80.0,29.0,65.0,37.0,37.0,456.0,188.0,530.0,390.0,352.0
9,Al-Farouq Aminu,21.0,31.0,,,,29.0,30.0,,,,18.0,23.0,,,,2.0,14.0,,,,380.0,434.0,,,,10.67,11.96,,,,78.0,102.0,,,,AP,AP,,,,18.0,19.0,,,,9258000.0,9720900.0,,,,17.0,28.0,,,,87.0,110.0,,,


In [13]:
stats_t["Age_2"] = stats_t["Age_1"].transform(lambda x: x.ffill()+1)

stats_t["Age_3"] = stats_t["Age_2"].transform(lambda x: x.ffill()+1)

stats_t["Age_4"] = stats_t["Age_2"].transform(lambda x: x.ffill()+1)

stats_t["Age_5"] = stats_t["Age_2"].transform(lambda x: x.ffill()+1)


stats_t["AST_2"] = stats_t["AST_1"].transform(lambda x: x.ffill()+1)

stats_t["AST_3"] = stats_t["AST_1"].transform(lambda x: x.ffill())

stats_t["AST_4"] = stats_t["AST_1"].transform(lambda x: x.ffill())

stats_t["AST_5"] = stats_t["AST_1"].transform(lambda x: x.ffill())


stats_t["G_2"] = stats_t["G_1"].transform(lambda x: x.ffill())

stats_t["G_3"] = stats_t["G_1"].transform(lambda x: x.ffill())

stats_t["G_4"] = stats_t["G_1"].transform(lambda x: x.ffill())

stats_t["G_5"] = stats_t["G_1"].transform(lambda x: x.ffill())

stats_t["GS_2"] = stats_t["GS_1"].transform(lambda x: x.ffill())

stats_t["GS_3"] = stats_t["GS_1"].transform(lambda x: x.ffill())

stats_t["GS_4"] = stats_t["GS_1"].transform(lambda x: x.ffill())

stats_t["GS_5"] = stats_t["GS_1"].transform(lambda x: x.ffill())

stats_t["MP_2"] = stats_t["MP_1"].transform(lambda x: x.ffill())

stats_t["MP_3"] = stats_t["MP_1"].transform(lambda x: x.ffill())

stats_t["MP_4"] = stats_t["MP_1"].transform(lambda x: x.ffill())

stats_t["MP_5"] = stats_t["MP_1"].transform(lambda x: x.ffill())

stats_t["PER_2"] = stats_t["PER_1"].transform(lambda x: x.ffill())

stats_t["PER_3"] = stats_t["PER_1"].transform(lambda x: x.ffill())

stats_t["PER_4"] = stats_t["PER_1"].transform(lambda x: x.ffill())

stats_t["PER_5"] = stats_t["PER_1"].transform(lambda x: x.ffill())

stats_t["PTS_2"] = stats_t["PTS_1"].transform(lambda x: x.ffill())

stats_t["PTS_3"] = stats_t["PTS_1"].transform(lambda x: x.ffill())

stats_t["PTS_4"] = stats_t["PTS_1"].transform(lambda x: x.ffill())

stats_t["PTS_5"] = stats_t["PTS_1"].transform(lambda x: x.ffill())

stats_t["STL_2"] = stats_t["STL_1"].transform(lambda x: x.ffill())

stats_t["STL_3"] = stats_t["STL_1"].transform(lambda x: x.ffill())

stats_t["STL_4"] = stats_t["STL_1"].transform(lambda x: x.ffill())

stats_t["STL_5"] = stats_t["STL_1"].transform(lambda x: x.ffill())

stats_t["Salario_2"] = stats_t["Salario_1"].transform(lambda x: x.ffill())

stats_t["Salario_3"] = stats_t["Salario_1"].transform(lambda x: x.ffill())

stats_t["Salario_4"] = stats_t["Salario_1"].transform(lambda x: x.ffill())

stats_t["Salario_5"] = stats_t["Salario_1"].transform(lambda x: x.ffill())

stats_t["TOV_2"] = stats_t["TOV_1"].transform(lambda x: x.ffill())

stats_t["TOV_3"] = stats_t["TOV_1"].transform(lambda x: x.ffill())

stats_t["TOV_4"] = stats_t["TOV_1"].transform(lambda x: x.ffill())

stats_t["TOV_5"] = stats_t["TOV_1"].transform(lambda x: x.ffill())

stats_t["TRB_2"] = stats_t["TRB_1"].transform(lambda x: x.ffill())

stats_t["TRB_3"] = stats_t["TRB_1"].transform(lambda x: x.ffill())

stats_t["TRB_4"] = stats_t["TRB_1"].transform(lambda x: x.ffill())

stats_t["TRB_5"] = stats_t["TRB_1"].transform(lambda x: x.ffill())

stats_t["PER_2"] = stats_t["PER_1"].transform(lambda x: x.ffill())

stats_t["PER_3"] = stats_t["PER_1"].transform(lambda x: x.ffill())

stats_t["PER_4"] = stats_t["PER_1"].transform(lambda x: x.ffill())

stats_t["PER_5"] = stats_t["PER_1"].transform(lambda x: x.ffill())


In [14]:
df_numeric = stats_t.select_dtypes(include=np.number)

In [15]:
X = df_numeric.drop(["Salario_5","Salario_4","Salario_3","Salario_2","Salario_1"], axis = 1)
 
y = df_numeric.Salario_5

In [16]:
X_train, X_test, y_train, y_test = tts(X, y,
                                       train_size=0.8,
                                       test_size=0.2,
                                       random_state=42
                                      )

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((618, 50), (155, 50), (618,), (155,))

In [17]:
from lazypredict.Supervised import LazyRegressor

#Split the data into training and testing sets (if not already split)

# Initialize Lazy Regressor
reg = LazyRegressor(verbose=0, ignore_warnings=True, custom_metric=None)
# Fit and score all models
models, predictions = reg.fit(X_train, X_test, y_train, y_test)

print(models)

100%|██████████████████████████████████████████████████████████████████████████████████| 42/42 [00:10<00:00,  3.96it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000267 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6395
[LightGBM] [Info] Number of data points in the train set: 618, number of used features: 50
[LightGBM] [Info] Start training from score 5653667.071197
                               Adjusted R-Squared  R-Squared       RMSE  \
Model                                                                     
PoissonRegressor                             0.40       0.60 4522638.38   
GammaRegressor                               0.33       0.55 4783542.48   
Lasso                                        0.27       0.51 4991304.41   
Ridge                                        0.27       0.51 4991852.22   
TransformedTargetRegressor                   0.27       0.51 4992417.62   
LinearRegression                             0.27       0.51 4992417.62   
RidgeCV                                      0.27     




In [18]:
from sklearn.ensemble import RandomForestRegressor as rfr
rfr = rfr(random_state=42)

rfr.fit(X_train, y_train)

y_pred = rfr.predict(X_test)

In [19]:
rfr

In [20]:
pd.set_option('display.float_format', '{:.2f}'.format)
y_pred = pd.DataFrame(y_pred)

rmse = mean_squared_error(y_test, y_pred, squared=False)


a = pd.DataFrame(y_test)

a = a.reset_index(drop = True)

y_pred["real"]= a

y_pred["error"]= y_pred[0] - y_pred["real"]

y_pred

Unnamed: 0,0,real,error
0,8809070.3,19000000.0,-10190929.7
1,1079338.1,934168.0,145170.1
2,12305423.86,25595700.0,-13290276.14
3,10601723.08,1620564.0,8981159.08
4,3254748.74,5170564.0,-1915815.26
5,16069153.42,13437500.0,2631653.42
6,2166763.38,1737145.0,429618.38
7,1475446.83,1445697.0,29749.83
8,255874.72,898310.0,-642435.28
9,1954791.51,707770.0,1247021.51


In [21]:
rmse

5533988.979504351

In [None]:
import joblib
joblib.dump(rfr, 'modelo_random_forest.joblib')