In [None]:
import numpy as np 
import pandas as pd 

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

In [None]:
# Read the raw shot log once and keep it untouched in `nba`;
# all cleaning below is done on the working copy `df`.
nba = pd.read_csv('/kaggle/input/nba-shot-logs/shot_logs.csv')
df = nba.copy()
df.head()

In [None]:
# Column labels exactly as read from the CSV.
df.columns

In [None]:
# Lower-case every column label so later cells can use attribute access
# such as df.pts_type and df.touch_time.
df.columns = df.columns.str.lower()
df.columns

In [None]:
# Number of rows for each distinct pts_type value.
df.pts_type.value_counts()

In [None]:
# (rows, columns) of the working frame.
df.shape

In [None]:
# Per-column dtype and non-null count.
df.info()

In [None]:
# Missing values per column.
df.isnull().sum()

In [None]:
# Correlate only numeric/boolean columns: the frame still holds text columns
# (e.g. game_clock, location) at this point, and pandas >= 2.0 raises on them
# instead of silently dropping them as older versions did.
sns.heatmap(df.select_dtypes(include=['number', 'bool']).corr(), cmap='coolwarm')

In [None]:
# Inspecting touch_time: the smallest recorded value in the column.
df['touch_time'].min()

Touch time is the amount of time the player has possession of the ball before taking the shot, so it cannot be negative. The negative minimum seen above is therefore an invalid value (a data error) rather than a genuine observation.

In [None]:
# Pass the series as `x=`: seaborn >= 0.12 treats the first positional argument
# as `data`, which would draw a vertical box instead of the horizontal one.
sns.boxplot(x=df['touch_time'])

In [None]:
# How many rows have a negative touch_time.
(df['touch_time'] < 0).sum()

In [None]:
# Show the first few offending rows with every column visible; the
# option_context lifts the column-display limit inside this block only.
with pd.option_context('display.max_columns', None):
    display(df.loc[df['touch_time'] < 0, :].head())

In [None]:
# Tukey fences for touch_time: anything beyond 1.5 * IQR from the quartiles.
Q1, Q3 = df['touch_time'].quantile([0.25, 0.75])
IQR = Q3 - Q1

Lower_Whisker = Q1 - 1.5 * IQR
Upper_Whisker = Q3 + 1.5 * IQR

# Rows lying strictly outside either fence.
below_fence = df['touch_time'].lt(Lower_Whisker)
above_fence = df['touch_time'].gt(Upper_Whisker)
outliers = df[below_fence | above_fence]

In [None]:
# Drop only the impossible negative touch times. This rebinds `outliers`, so
# any earlier selection stored under that name is discarded and large positive
# values are kept.
outliers = df[(df.touch_time < 0)]
df = df.drop(outliers.index)
# Pass the series as `x=` so the box stays horizontal on every seaborn version
# (seaborn >= 0.12 treats a positional argument as `data`).
sns.boxplot(x=df['touch_time'])

In [None]:
# Convert game clock from 'MM:SS' text to seconds.
# The dtype guard makes the cell safe to re-run: once the column is numeric
# the conversion is skipped instead of crashing on `.split` of an int.
if not pd.api.types.is_numeric_dtype(df['game_clock']):
    # Only the first two ':'-separated fields are used (minutes, seconds).
    clock_parts = df['game_clock'].str.split(':', expand=True)[[0, 1]].astype(int)
    df['game_clock'] = 60 * clock_parts[0] + clock_parts[1]

In [None]:
# Converting type of shot (2 or 3 points) to a 0/1 flag -- DISABLED.
# NOTE(review): the line below is stale: it uses the upper-case 'PTS_TYPE' label,
# but the columns were lower-cased earlier in the notebook. Delete or update it.
#df['PTS_TYPE'] = (df['PTS_TYPE'] == 3) * 1

In [None]:
# Encode location as a 0/1 flag: 1 where the value is 'H', 0 otherwise.
# The dtype guard makes the cell safe to re-run: without it a second run
# compares the already-encoded integers with 'H' and silently sets every row to 0.
if not pd.api.types.is_numeric_dtype(df['location']):
    df['location'] = (df['location'] == 'H') * 1

In [None]:
# Renaming columns -- DISABLED.
# NOTE(review): stale block: the keys are upper-case labels, but the columns
# were lower-cased earlier in the notebook, so enabling it as-is would rename nothing.
#df = df.rename(columns = {
    #'FGM':'hit',
    #'PTS_TYPE':'3pts_shot',
    #'LOCATION':'home_game'})

In [None]:
# Mean shot_dist and pts per player, sorted by mean pts (highest first).
# Several columns must be selected with a list (double brackets); the bare
# tuple form ['shot_dist','pts'] was removed in pandas 2.0.
df.groupby(['player_name'], as_index=False)[['shot_dist', 'pts']].mean().sort_values('pts', ascending=False)

In [None]:
# Number of logged rows (non-null pts_type) per player, most first.
df.groupby(['player_name'])['pts_type'].count().sort_values(ascending=False)

In [None]:
# Split the log by shot value so each kind can be counted separately.
three_pointer = df[df.pts_type == 3]
two_pointer = df[df.pts_type == 2]

In [None]:
# Two-point attempts per player, most first.
two_pointer.groupby(['player_name'], as_index=False)['pts_type'].count().sort_values('pts_type', ascending=False)

In [None]:
# Three-point attempts per player, most first.
three_pointer.groupby(['player_name'], as_index=False)['pts_type'].count().sort_values('pts_type', ascending=False)

In [None]:
# Restrict the log to a single player for a closer look.
lam = df[df.player_name == 'lamarcus aldridge']
lam.head()

In [None]:
# Row counts by pts_type (rows) and shot_result (columns); missing combinations become 0.
lam.groupby(['shot_result', 'pts_type']).size().unstack(level=0, fill_value=0)

In [None]:
# Same table as the previous cell, kept in a variable for plotting.
lam_1 = lam.groupby(['shot_result', 'pts_type']).size().unstack(level=0, fill_value=0)

In [None]:
# Side-by-side bars (stacked=False) of the counts in lam_1.
lam_1.plot(kind='bar',stacked=False,title='Bar Chart showing points scored')

In [None]:
# Row counts by shot_result (rows) and number of dribbles (columns).
lam.groupby(['shot_result', 'dribbles']).size().unstack(fill_value=0)

In [None]:
# Row counts per touch_time value, ordered by how many were 'missed'.
lam.groupby(['touch_time','shot_result']).size().unstack(fill_value=0).sort_values('missed',ascending=False)

In [None]:
# Mean shot distance for each shot_result value, shown as a bar chart.
lam_2= lam.groupby(['shot_result'])['shot_dist'].mean()
lam_2.plot(kind='bar',stacked=False,title='Bar Chart showing mean for made and missed shot distances')

## Modelling Field Goals Made (fgm)

### Data Preparation

In [None]:
# Work on a separate copy so the cleaned exploration frame `df` stays intact.
df1 = df.copy()
df1.head()

In [None]:
# Drop extraneous columns before modelling.
extraneous_columns = [
    'game_id',
    'matchup',
    'w',
    'final_margin',
    'closest_defender_player_id',
    'shot_result',
    'player_id',
    'closest_defender',
    'player_name',
    'pts',
]
df1 = df1.drop(columns=extraneous_columns)

df1.head()

In [None]:
# Shape after removing the extraneous columns.
df1.shape

In [None]:
# Remaining missing values per column.
df1.isnull().sum()

In [None]:
# DISABLED. NOTE(review): stale: 'w' is among the columns dropped from df1 above.
#df['game_won'] = np.where(df['w'] == 'W', 1, 0)
#df1['w'] = np.where(df1['w'] == 'W', 1, 0)

In [None]:
# DISABLED. NOTE(review): stale: 'shot_result' is among the columns dropped from df1 above.
#df1.shot_result = df1.shot_result.map({"made":1,"missed":0})

In [None]:
# axis=1 removes every COLUMN that contains at least one missing value
# (rows are left untouched); done in place on df1.
df1.dropna(axis=1, inplace =True)

In [None]:
# Confirm which columns survived and that none has nulls left.
df1.info()

In [None]:
# Features are every remaining column except the target; the target is fgm.
X  = df1.drop('fgm', axis=1)
y = df1['fgm']

In [None]:
# Sanity check: X and y must have the same number of rows.
print(X.shape, y.shape)

In [None]:
# Splitting to training and test set: 70% train / 30% test, fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### Logistic Regression

In [None]:
# Logistic Regression baseline with scikit-learn's default settings.

logistic = LogisticRegression()

logistic.fit(X_train, y_train)

prediction_lr = logistic.predict(X_test)

# Accuracy on the held-out split, then on the training split for comparison.
score_logistic = logistic.score(X_test, y_test)
print('test accuracy:')
print(score_logistic)
print('train accuracy:')
print(logistic.score(X_train, y_train))

print(classification_report(y_test,prediction_lr))
# metrics.plot_roc_curve was removed in scikit-learn 1.2; RocCurveDisplay
# (available since 1.0) draws the same ROC curve from a fitted estimator.
metrics.RocCurveDisplay.from_estimator(logistic, X_test, y_test)

### Gradient Boosting 

In [None]:
# Gradient Boosting modelling
# max_features='sqrt' is what 'auto' meant for classifiers; the 'auto' alias
# was removed in scikit-learn 1.3, so the explicit name keeps the cell runnable.
gb = GradientBoostingClassifier(max_features='sqrt',
                                n_estimators=10, 
                                random_state=42,
                                max_depth=5,
                                min_samples_leaf=100,
                                learning_rate = 0.08) 

gb.fit(X_train, y_train)

y_pred = gb.predict(X_test)

# Accuracy on the held-out split, then on the training split for comparison.
score_gb = gb.score(X_test, y_test)
print('test accuracy:')
print(score_gb)
print('train accuracy:')
print(gb.score(X_train, y_train))

# Classification report as a DataFrame (one row per class / average).
report = classification_report(y_test, y_pred, output_dict=True)
df_classification_report = pd.DataFrame(report).transpose()
print(df_classification_report)
# metrics.plot_roc_curve was removed in scikit-learn 1.2; RocCurveDisplay
# (available since 1.0) draws the same ROC curve from a fitted estimator.
metrics.RocCurveDisplay.from_estimator(gb, X_test, y_test)

### Random Forest 

In [None]:
# Random Forest modelling
# max_features='sqrt' is what 'auto' meant for classifiers; the 'auto' alias
# was removed in scikit-learn 1.3, so the explicit name keeps the cell runnable.
rfcla = RandomForestClassifier(max_features='sqrt',
                               n_estimators=10, 
                               n_jobs=-1, random_state=42,
                               criterion='entropy',
                               max_depth=5,
                               min_samples_leaf=10) 

rfcla.fit(X_train, y_train)

y_pred = rfcla.predict(X_test)

# Accuracy on the held-out split, then on the training split for comparison.
score_rfcla = rfcla.score(X_test, y_test)
print('test accuracy:')
print(score_rfcla)
print('train accuracy:')
print(rfcla.score(X_train, y_train))

# Classification report as a DataFrame (one row per class / average).
report = classification_report(y_test, y_pred, output_dict=True)
df_classification_report = pd.DataFrame(report).transpose()
print(df_classification_report)
# metrics.plot_roc_curve was removed in scikit-learn 1.2; RocCurveDisplay
# (available since 1.0) draws the same ROC curve from a fitted estimator.
metrics.RocCurveDisplay.from_estimator(rfcla, X_test, y_test)

# Outlier Detector

In [None]:
pip install alibi-detect

In [None]:
from alibi_detect.od import IForest
from alibi_detect.utils.data import create_outlier_batch
from alibi_detect.utils.fetching import fetch_detector
from alibi_detect.utils.saving import save_detector, load_detector
from alibi_detect.utils.visualize import plot_instance_score, plot_roc

Assume that a model is trained on normal instances of the dataset (not outliers) and standardization is applied:

In [None]:
# Draw the fitting batch for the detector: perc_outlier=0 requests a batch
# with no outlier-labelled rows.
# NOTE(review): the label passed in is y (= fgm), so "outlier" here appears to
# mean fgm == 1 -- confirm this is the intended definition.
# NOTE: this rebinds X_train / y_train from the earlier train_test_split.
np.random.seed(0)
normal_batch = create_outlier_batch(X, y, n_samples=50000, perc_outlier=0)
X_train = normal_batch.data.astype('float')
y_train = normal_batch.target
print(X_train.shape, y_train.shape)
print(f'{100 * y_train.mean()}% outliers')

In [None]:
# Per-feature mean and standard deviation of the fitting batch; the same
# statistics are reused to scale every later batch.
mean = X_train.mean(axis=0)
stdev = X_train.std(axis=0)

Apply standardization:

In [None]:
# Standardise the fitting batch with the statistics computed above.
# NOTE(review): the cell overwrites its own input, so running it twice scales
# the data twice; re-run the batch-creation cell first if it must be repeated.
X_train = (X_train - mean) / stdev

### Train Outlier Detector (Isolation Forest)

In [None]:
# NOTE(review): this flag is never read in the visible notebook; the detector
# below is always built and fitted from scratch. TODO confirm it can be removed.
load_outlier_detector = True

In [None]:
# Isolation Forest detector with 100 trees; threshold=None because the
# threshold is inferred later with od.infer_threshold.
od = IForest(threshold=None, n_estimators=100)

# train
od.fit(X_train)

Let’s assume we have some data which we know contains around 20% outliers.

In [None]:
# Batch used to calibrate the threshold: 1000 rows, perc_outlier percent of
# them requested as outlier-labelled, scaled with the fitting-batch statistics.
np.random.seed(0)
perc_outlier = 20
threshold_batch = create_outlier_batch(X, y, n_samples=1000, perc_outlier=perc_outlier)
X_threshold = threshold_batch.data.astype('float')
y_threshold = threshold_batch.target
X_threshold = (X_threshold - mean) / stdev
print(f'{100 * y_threshold.mean()}% outliers')

In [None]:
# Calibrate the detector's threshold on the calibration batch, using the
# (100 - perc_outlier) percentile, then report the value it settled on.
od.infer_threshold(X_threshold, threshold_perc=100 - perc_outlier)
print(f'New threshold: {od.threshold}')

### Detect Outliers

We now generate a batch of data with 10% outliers and detect the outliers in the batch.

In [None]:
# Evaluation batch: 1000 rows with 10% requested as outlier-labelled, drawn
# with a different seed and scaled with the fitting-batch statistics.
np.random.seed(1)
outlier_batch = create_outlier_batch(X, y, n_samples=1000, perc_outlier=10)
X_outlier = outlier_batch.data.astype('float')
y_outlier = outlier_batch.target
X_outlier = (X_outlier - mean) / stdev
print(X_outlier.shape, y_outlier.shape)
print(f'{100 * y_outlier.mean()}% outliers')

### Predict Outlier

In [None]:
# Score the evaluation batch. return_instance_score=True requests the
# per-instance scores that later cells read from od_preds['data']['instance_score'].
od_preds = od.predict(X_outlier, return_instance_score=True)

### Display Results

In [None]:
from sklearn.metrics import confusion_matrix, f1_score

# Compare the detector's flags with the batch labels.
labels = outlier_batch.target_names
y_pred = od_preds['data']['is_outlier']

f1 = f1_score(y_outlier, y_pred)
print(f'F1 score: {f1:.4f}')

# Confusion matrix with the batch's class names on both axes.
cm = confusion_matrix(y_outlier, y_pred)
df_cm = pd.DataFrame(cm, index=labels, columns=labels)
sns.heatmap(df_cm, annot=True, cbar=True, linewidths=.5)
plt.show()

In [None]:
# Per-instance scores against the detector threshold, grouped by the batch labels.
plot_instance_score(od_preds, y_outlier, labels, od.threshold)

In [None]:
# ROC curve built from the instance scores; 'IF' is the legend key for this detector.
roc_data = {'IF': {'scores': od_preds['data']['instance_score'], 'labels': y_outlier}}
plot_roc(roc_data)

# Variational Autoencoder (VAE)

Assume that a model is trained on normal instances of the dataset (not outliers) and standardization is applied:

In [None]:
# Draw the fitting batch for the VAE detector: perc_outlier=0 requests a
# batch with no outlier-labelled rows.
# NOTE: this rebinds X_train / y_train used by earlier cells.
np.random.seed(0)
normal_batch = create_outlier_batch(X, y, n_samples=100000, perc_outlier=0)
X_train = normal_batch.data.astype('float')
y_train = normal_batch.target
print(X_train.shape, y_train.shape)
print(f'{100 * y_train.mean()}% outliers')

Apply standardization:

In [None]:
# Per-feature mean and standard deviation of the new fitting batch; these
# replace the statistics computed for the Isolation Forest section.
mean, stdev = X_train.mean(axis=0), X_train.std(axis=0)

In [None]:
# Standardise the fitting batch.
# NOTE(review): the cell overwrites its own input, so running it twice scales
# the data twice; re-run the batch-creation cell first if it must be repeated.
X_train = (X_train - mean) / stdev

In [None]:
# NOTE(review): this flag is never read in the visible notebook; the detector
# below is always built and fitted from scratch. TODO confirm it can be removed.
load_outlier_detector = True

In [None]:
# TensorFlow / alibi-detect pieces needed for the VAE detector.
import tensorflow as tf
# Clear Keras' global session state before building new models in this section.
tf.keras.backend.clear_session()
from tensorflow.keras.layers import Dense, InputLayer
from alibi_detect.models.losses import elbo
from alibi_detect.od import OutlierVAE
from alibi_detect.utils.visualize import plot_instance_score, plot_feature_outlier_tabular, plot_roc

In [None]:
# define model, initialize and train  outlier detector

n_features = X_train.shape[1]
latent_dim = 2

encoder_net = tf.keras.Sequential(
      [
          InputLayer(input_shape=(n_features,)),
          Dense(20, activation=tf.nn.relu),
          Dense(15, activation=tf.nn.relu),
          Dense(7, activation=tf.nn.relu)
      ])

decoder_net = tf.keras.Sequential(
      [
          InputLayer(input_shape=(latent_dim,)),
          Dense(7, activation=tf.nn.relu),
          Dense(15, activation=tf.nn.relu),
          Dense(20, activation=tf.nn.relu),
          Dense(n_features, activation=None)
      ])
# initialize outlier detector
od = OutlierVAE(threshold=None,  # threshold for outlier score
                score_type='mse',  # use MSE of reconstruction error for outlier detection
                encoder_net=encoder_net,  # can also pass VAE model instead
                decoder_net=decoder_net,  # of separate encoder and decoder
                latent_dim=latent_dim,
                samples=5)
    # train
od.fit(X_train,
       loss_fn=elbo,
       cov_elbo=dict(sim=.01),
       epochs=20,
       verbose=True)

In [None]:
# Batch used to calibrate the VAE threshold: 1000 rows, perc_outlier percent of
# them requested as outlier-labelled, scaled with the fitting-batch statistics.
np.random.seed(0)
perc_outlier = 10
threshold_batch = create_outlier_batch(X, y, n_samples=1000, perc_outlier=perc_outlier)
X_threshold = threshold_batch.data.astype('float')
y_threshold = threshold_batch.target
X_threshold = (X_threshold - mean) / stdev
print(f'{100 * y_threshold.mean()}% outliers')

In [None]:
# Calibrate the detector's threshold on the calibration batch, using the
# (100 - perc_outlier) percentile, then report the value it settled on.
od.infer_threshold(X_threshold, threshold_perc=100 - perc_outlier)
print(f'New threshold: {od.threshold}')

### Detect Outliers 

We now generate a batch of data with 10% outliers and detect the outliers in the batch.

In [None]:
# Evaluation batch: 1000 rows with 10% requested as outlier-labelled, drawn
# with a different seed and scaled with the fitting-batch statistics.
np.random.seed(1)
outlier_batch = create_outlier_batch(X, y, n_samples=1000, perc_outlier=10)
X_outlier = outlier_batch.data.astype('float')
y_outlier = outlier_batch.target
X_outlier = (X_outlier - mean) / stdev
print(X_outlier.shape, y_outlier.shape)
print(f'{100 * y_outlier.mean()}% outliers')

### Predict Outliers

In [None]:
# Score the evaluation batch with the VAE detector, requesting both the
# feature-level and instance-level scores, then list the returned keys.
od_preds = od.predict(X_outlier,
                      outlier_type='instance',    # use 'feature' or 'instance' level
                      return_feature_score=True,  # scores used to determine outliers
                      return_instance_score=True)
print(list(od_preds['data'].keys()))

### Display Results

F1 Score and Confusion Matrix

In [None]:
# Compare the detector's flags with the batch labels.
labels = outlier_batch.target_names
y_pred = od_preds['data']['is_outlier']

f1 = f1_score(y_outlier, y_pred)
print(f'F1 score: {f1:.4f}')

# Confusion matrix with the batch's class names on both axes.
cm = confusion_matrix(y_outlier, y_pred)
df_cm = pd.DataFrame(cm, index=labels, columns=labels)
sns.heatmap(df_cm, annot=True, cbar=True, linewidths=.5)
plt.show()

In [None]:
# Per-instance scores against the detector threshold, grouped by the batch labels.
plot_instance_score(od_preds, y_outlier, labels, od.threshold)

In [None]:
# ROC curve built from the instance scores; 'VAE' is the legend key for this detector.
roc_data = {'VAE': {'scores': od_preds['data']['instance_score'], 'labels': y_outlier}}
plot_roc(roc_data)

In [None]:
# Pass the scaled evaluation batch through the fitted VAE and keep the result as a NumPy array.
X_recon = od.vae(X_outlier).numpy()  # reconstructed instances by the VAE

In [None]:
# Per-feature view of the predictions: original (scaled) values alongside
# the VAE reconstructions, labelled with the feature column names from X.
plot_feature_outlier_tabular(od_preds,
                             X_outlier,
                             X_recon=X_recon,
                             threshold=od.threshold,
                             instance_ids=None,  # pass a list with indices of instances to display
                             max_instances=5,  # max nb of instances to display
                             top_n=5,  # only show top_n features ordered by outlier score
                             outliers_only=False,  # only show outlier predictions
                             feature_names=X.columns,  # add feature names
                             figsize=(20, 30))