# Dependencies

In [26]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics
from sklearn.metrics import classification_report

# Cleaning up the training data

In [27]:
def clean_data(df, scaler=None):
    """Drop the 'Pos' column and min-max scale every remaining column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'Pos' column. All other columns are handed to the
        scaler, so they must be numeric. The input frame is not modified.
    scaler : object with a ``transform`` method, optional
        An already-fitted scaler. When given, it is only applied to ``df``
        and is never refitted, so several frames can share one scale.
        When omitted, a fresh ``MinMaxScaler`` (default settings) is fitted
        on ``df`` itself, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        Scaled values carrying the same column labels and the same row
        index as ``df`` (without 'Pos').

    Raises
    ------
    KeyError
        If ``df`` has no 'Pos' column.
    """
    # Dropping columns that are not needed
    features = df.drop(columns=['Pos'])

    # Scaling the data
    if scaler is None:
        scaled_values = MinMaxScaler().fit_transform(features)
    else:
        scaled_values = scaler.transform(features)

    # Keep the caller's row labels: rebuilding the frame without `index=`
    # renumbers the rows from 0, which breaks alignment with any Series
    # taken from the original frame.
    return pd.DataFrame(scaled_values, columns=features.columns, index=features.index)

In [28]:
# Read the training table, discard incomplete rows and the player-name
# column, then pass what is left to clean_data for scaling.
df_train = (
    pd.read_csv('Data/Training/overall_training_data.csv', index_col=0)
    .dropna(axis=0)
    .drop(columns=['Player'])
    .pipe(clean_data)
)
df_train

Unnamed: 0,Age,G,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,...,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,All Star
0,0.208333,0.817073,0.411652,0.305413,0.355333,0.181,0.054941,0.069,0.205,0.241503,...,0.300221,0.251462,0.184615,0.190,0.298650,0.199762,0.407859,0.241965,0.147826,0.0
1,0.041667,0.353659,0.079065,0.310541,0.326000,0.291,0.050066,0.042,0.087,0.114490,...,0.408389,0.192982,0.138462,0.125,0.301297,0.203336,0.456640,0.255871,0.191304,0.0
2,0.083333,0.841463,0.566581,0.317949,0.366000,0.005,0.096381,0.122,0.193,0.282648,...,0.273731,0.304094,0.415385,0.310,0.311358,0.202145,0.424119,0.247528,0.200000,0.0
3,0.375000,0.195122,0.068502,0.318519,0.329333,0.000,0.162010,0.119,0.296,0.366726,...,0.273731,0.204678,0.107692,0.125,0.305004,0.200556,0.413279,0.244129,0.173913,0.0
4,0.416667,0.939024,0.800576,0.298575,0.355333,0.377,0.042003,0.011,0.097,0.094812,...,0.377483,0.286550,0.230769,0.235,0.295208,0.202542,0.406504,0.243820,0.147826,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3823,0.083333,0.719512,0.678297,0.373789,0.396667,0.455,0.084005,0.016,0.115,0.116279,...,0.728477,0.502924,0.169231,0.400,0.317183,0.232724,0.390244,0.263597,0.443478,1.0
3824,0.333333,0.695122,0.428937,0.344729,0.384000,0.157,0.070129,0.126,0.212,0.298748,...,0.417219,0.327485,0.276923,0.285,0.316124,0.208896,0.410569,0.249691,0.217391,0.0
3825,0.458333,0.012195,0.000960,0.365242,0.166667,0.000,0.000000,0.809,0.269,0.964222,...,0.911700,0.192982,0.076923,0.105,0.262113,0.206910,0.121951,0.182324,0.173913,0.0
3826,0.166667,0.256098,0.070423,0.331054,0.398000,0.000,0.049503,0.090,0.244,0.296959,...,0.344371,0.210526,0.107692,0.130,0.310034,0.201350,0.401084,0.241656,0.165217,0.0


# Training the model

In [29]:
# Every column except the 'All Star' label is used as a feature
training_cols = [col for col in df_train.columns if col != 'All Star']
y_train = df_train['All Star']
X_train = df_train[training_cols]

# fit() returns the estimator itself, so construction and fitting can chain
LR = LogisticRegression().fit(X_train, y_train)

# Predictions for the same rows the model was fitted on
y_train_prediction = LR.predict(X_train)

In [30]:
# Fraction of training rows where the fitted model's prediction equals the
# true label (computed with sklearn.metrics.accuracy_score)
print(
    metrics.accuracy_score(
        y_true=y_train,
        y_pred=y_train_prediction,
    )
)

0.975705329153605


# Running Model on Test Data

In [31]:
# Load the test table and keep only complete rows
df_test = pd.read_csv('Data/Testing/Test_Data.csv', index_col=0).dropna(axis=0)

# Set the names aside for the results table; they are not model features
player_list = df_test['Player']

# NOTE(review): called like this, clean_data fits a new MinMaxScaler on the
# frame it is given, so these features are scaled by the test set's own
# min/max rather than by the values seen during training -- confirm this is
# intended.
df_test = clean_data(df_test.drop(columns=['Player']))

# Predict using the LR Model
y_prediction_test = LR.predict(df_test)

In [32]:
# Creating dataframe: one row per test player, predictions attached by position
result = pd.DataFrame(player_list)
result['Predicted All Star'] = y_prediction_test.tolist()

# Actual NBA all stars
# NOTE(review): hand-maintained roster -- confirm it against the official
# All-Star selections for the test season (including injury replacements)
# and check that the spellings match the 'Player' column exactly.
actual_all_stars = [
    'Bradley Beal', 'Kyrie Irving', 'Giannis Antetokounmpo', 'Joel Embiid',
    'Jaylen Brown', 'James Harden', 'Zach LaVine', 'Ben Simmons',
    'Julius Randle', 'Jayson Tatum', 'Nikola Vučević', 'Stephen Curry',
    'Luka Dončić', 'LeBron James', 'Kawhi Leonard', 'Nikola Jokić',
    'Anthony Davis', 'Damian Lillard', 'Donovan Mitchell', 'Chris Paul',
    'Paul George', 'Zion Williamson', 'Rudy Gobert', 'Kevin Durant',
    'Domantas Sabonis', 'Jimmy Butler',
]

# Label every row in one vectorised pass instead of looping with iterrows():
# 1.0 when the player's name is in the roster, 0.0 otherwise. A name matches
# every row that carries it, so a player who appears on several rows is
# flagged on each of them.
result['Actual All Star'] = result['Player'].isin(actual_all_stars).astype(float)
result

Unnamed: 0,Player,Predicted All Star,Actual All Star
0,Precious Achiuwa,0.0,0.0
1,Jaylen Adams,0.0,0.0
2,Steven Adams,0.0,0.0
3,Bam Adebayo,1.0,0.0
4,LaMarcus Aldridge,0.0,0.0
...,...,...,...
540,Delon Wright,0.0,0.0
541,Thaddeus Young,0.0,0.0
542,Trae Young,1.0,0.0
543,Cody Zeller,0.0,0.0


In [33]:
# Per-class precision / recall / F1 of the predicted labels against the
# actual labels stored in `result`
print(
    classification_report(
        y_true=result['Actual All Star'],
        y_pred=result['Predicted All Star'],
    )
)

              precision    recall  f1-score   support

         0.0       0.99      0.98      0.99       515
         1.0       0.71      0.89      0.79        28

    accuracy                           0.98       543
   macro avg       0.85      0.94      0.89       543
weighted avg       0.98      0.98      0.98       543

