In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
player_17_18_df = pd.read_csv("player_17_18_stats.csv")
player_18_19_df = pd.read_csv("player_18_19_stats.csv")

In [3]:
del player_17_18_df['Unnamed: 0']
del player_18_19_df['Unnamed: 0']

In [4]:
player_17_18_df = player_17_18_df.dropna(how="any",axis=1)

In [5]:
player_18_19_df.all_star_bool = np.where(player_18_19_df.all_star_bool.isnull(), 0, 1)

In [6]:
cols = [c for c in player_17_18_df.columns if c not in ['Rk','Player','all_star_bool']]
correlations = player_17_18_df[cols].corr().abs().unstack().sort_values(kind="quicksort").reset_index()
correlations = correlations[correlations['level_0'] != correlations['level_1']]
correlations.columns = ['feature1', 'feature2', 'correlation']

In [7]:
correlations = correlations.drop_duplicates('correlation')

In [8]:
remove_cols = \
list(set(correlations[correlations.correlation > 0.7].feature1.tolist()))

In [9]:
correlations2 = player_17_18_df[cols].drop(labels=remove_cols,axis=1).\
corr().abs().unstack().sort_values(kind="quicksort").reset_index()
correlations2 = correlations2[correlations2['level_0'] != correlations2['level_1']]
correlations2.columns = ['feature1', 'feature2', 'correlation']

In [10]:
len(list(set(correlations2.feature1.tolist() + correlations2.feature2.tolist())))

10

In [11]:
new_cols = list(set(correlations2.feature1.tolist() + correlations2.feature2.tolist()))

In [12]:
X = player_17_18_df[new_cols].values
y = player_17_18_df.all_star_bool.values

In [13]:
X_test = player_18_19_df[new_cols].values
y_test = player_18_19_df.all_star_bool

In [14]:
gnb = GaussianNB()
gnb.fit(X, y)

y_pred = gnb.predict(X_test)

# Summary of the predictions made by the classifier
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# Accuracy score
print('accuracy is',accuracy_score(y_pred,y_test))

             precision    recall  f1-score   support

          0       0.99      0.98      0.99       682
          1       0.61      0.85      0.71        26

avg / total       0.98      0.97      0.98       708

[[668  14]
 [  4  22]]
accuracy is 0.9745762711864406


## Naive Bayes when you don't remove any correlated features

In [15]:
X2 = player_17_18_df[cols].values
y2 = player_17_18_df.all_star_bool.values

In [16]:
X_test2 = player_18_19_df[cols].values
y_test2 = player_18_19_df.all_star_bool

In [17]:
gnb2 = GaussianNB()
gnb2.fit(X2, y2)

y_pred2 = gnb2.predict(X_test2)

# Summary of the predictions made by the classifier
print(classification_report(y_test2, y_pred2))
print(confusion_matrix(y_test2, y_pred2))

# Accuracy score
print('accuracy is',accuracy_score(y_pred2,y_test2))

             precision    recall  f1-score   support

          0       1.00      0.93      0.96       682
          1       0.33      0.92      0.48        26

avg / total       0.97      0.93      0.94       708

[[633  49]
 [  2  24]]
accuracy is 0.9279661016949152


Which Feature is most important? Features that have high variability for each class are important because higher sigma maximizes the conditional probability of x_i given y

In [42]:
# Top 10 important features for model with all the features
player_17_18_df[new_cols].columns[gnb2.sigma_[1,:].argsort()][-10:]

Index(['2P', 'FTA', 'AST', '3PA', 'DRB', 'TRB', 'FGA', '2PA', 'MP', 'PTS'], dtype='object')

Points, Minutes Played, 2 Point Goals Attempted, Field Goals Attempted, Total Rebounds, Defensive Rebounds...

In [47]:
# Top 10 important features for model with uncorrelated features
player_17_18_df[cols].columns[gnb.sigma_[1,:].argsort()]

Index(['FG', 'MP', 'G', 'FGA', '3PA', '2P', '2PA', 'GS', 'Age', '3P'], dtype='object')

3 Points, Age, Games Started, 2 Point Goals Attempted, 2 Point Goals, 3 Point Goals Attempted...

## Other Classifiers (v.s. Naive Bayes)

#### Logistic Regression (without any hyperparameter Tuning)

In [18]:
X3 = player_17_18_df[cols].values
y3 = player_17_18_df.all_star_bool.values

X_test3 = player_18_19_df[cols].values
y_test3 = player_18_19_df.all_star_bool

In [19]:
from sklearn.linear_model import LogisticRegression
lgr = LogisticRegression()
lgr.fit(X3, y3)

y_pred3 = lgr.predict(X_test3)

In [20]:
# Summary of the predictions made by the classifier
print(classification_report(y_test3, y_pred3))
print(confusion_matrix(y_test3, y_pred3))

# Accuracy score
print('accuracy is',accuracy_score(y_pred3,y_test3))

             precision    recall  f1-score   support

          0       0.99      0.98      0.99       682
          1       0.64      0.81      0.71        26

avg / total       0.98      0.98      0.98       708

[[670  12]
 [  5  21]]
accuracy is 0.9759887005649718


#### Decision Tree Classifier (without any hyperparameter tuning)

In [21]:
X4 = player_17_18_df[cols].values
y4 = player_17_18_df.all_star_bool.values

X_test4 = player_18_19_df[cols].values
y_test4 = player_18_19_df.all_star_bool

In [24]:
from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier()
dtc.fit(X4, y4)
y_pred4 = dtc.predict(X_test4)

In [25]:
# Summary of the predictions made by the classifier
print(classification_report(y_test4, y_pred4))
print(confusion_matrix(y_test4, y_pred4))

# Accuracy score
print('accuracy is',accuracy_score(y_pred4,y_test4))

             precision    recall  f1-score   support

          0       0.99      0.99      0.99       682
          1       0.74      0.65      0.69        26

avg / total       0.98      0.98      0.98       708

[[676   6]
 [  9  17]]
accuracy is 0.9788135593220338
