In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torchvision import datasets, transforms
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit

#### Assessing the data

In [2]:
data = pd.read_csv('./data/clean/open_shot.csv')

In [3]:
pd.set_option('display.max_columns', None)

In [4]:
data.head()

Unnamed: 0,game_id,matchup,location,win,final_margin,shot_number,period,game_clock,shot_clock,dribbles,touch_time,shot_dist,pts_type,shot_result,closest_defender,closest_defender_id,close_def_dist,fgm,pts,player_name,player_id,defender_wingspan,close_def_dist_in,open_shot
0,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,24,1,1,1:09,10.8,2,1.9,7.7,2,made,"Anderson, Alan",101187,1.3,1,2,brian roberts,203148,81.5,15.6,0
1,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,24,2,1,0:14,3.4,0,0.8,28.2,3,missed,"Bogdanovic, Bojan",202711,6.1,0,0,brian roberts,203148,82.76,73.2,1
2,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,24,3,1,0:00,,3,2.7,10.1,2,missed,"Bogdanovic, Bojan",202711,0.9,0,0,brian roberts,203148,82.76,10.8,0
3,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,24,4,2,11:47,10.3,2,1.9,17.2,2,missed,"Brown, Markel",203900,3.4,0,0,brian roberts,203148,80.75,40.8,1
4,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,24,5,2,10:34,10.9,2,2.7,3.7,2,missed,"Young, Thaddeus",201152,1.1,0,0,brian roberts,203148,83.5,13.2,0


In [5]:
data.shape

(128069, 24)

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128069 entries, 0 to 128068
Data columns (total 24 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   game_id              128069 non-null  int64  
 1   matchup              128069 non-null  object 
 2   location             128069 non-null  object 
 3   win                  128069 non-null  object 
 4   final_margin         128069 non-null  int64  
 5   shot_number          128069 non-null  int64  
 6   period               128069 non-null  int64  
 7   game_clock           128069 non-null  object 
 8   shot_clock           122502 non-null  float64
 9   dribbles             128069 non-null  int64  
 10  touch_time           128069 non-null  float64
 11  shot_dist            128069 non-null  float64
 12  pts_type             128069 non-null  int64  
 13  shot_result          128069 non-null  object 
 14  closest_defender     128069 non-null  object 
 15  closest_defender_

In [7]:
data['location'].value_counts()

location
A    64135
H    63934
Name: count, dtype: int64

In [8]:
data['win'].value_counts()

win
W    64595
L    63474
Name: count, dtype: int64

In [9]:
len(data['player_name'].value_counts())

281

#### Feature engineering

In [10]:
# Dropping unwanted columns 
data.drop(columns=['game_id', 'matchup', 'location', 'win', 'closest_defender', 'closest_defender_id', 'player_name', 'player_id'], inplace=True)

In [11]:
# To split the game clock in minutes and seconds
data[['minutes', 'seconds']] = data['game_clock'].str.split(':', expand=True)

# Removing the game_clock column
data.drop('game_clock', axis=1, inplace=True)

# Changing the data type of the column 
data['minutes'] = data['minutes'].astype(int)
data['seconds'] = data['seconds'].astype(int)

In [12]:
# Final data set
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128069 entries, 0 to 128068
Data columns (total 17 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   final_margin       128069 non-null  int64  
 1   shot_number        128069 non-null  int64  
 2   period             128069 non-null  int64  
 3   shot_clock         122502 non-null  float64
 4   dribbles           128069 non-null  int64  
 5   touch_time         128069 non-null  float64
 6   shot_dist          128069 non-null  float64
 7   pts_type           128069 non-null  int64  
 8   shot_result        128069 non-null  object 
 9   close_def_dist     128069 non-null  float64
 10  fgm                128069 non-null  int64  
 11  pts                128069 non-null  int64  
 12  defender_wingspan  127495 non-null  float64
 13  close_def_dist_in  128069 non-null  float64
 14  open_shot          128069 non-null  int64  
 15  minutes            128069 non-null  int64  
 16  se

#### Data Preprocessing

In [13]:
# Imputing the data
imputer_median = SimpleImputer(strategy='median')
columns_to_impute = ['shot_clock', 'defender_wingspan']
data[columns_to_impute] = imputer_median.fit_transform(data[columns_to_impute])

In [14]:
# Normalizing the data
numerical_columns = data.select_dtypes(include=['number']).columns.tolist()
scaler = StandardScaler()
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

In [15]:
# One hot encoding the dataframe
def encodeShotResult(shortResult):
    if(shortResult == 'made'):
        return 1
    else:
        return 0
data['shot_result_encoded'] = data['shot_result'].apply(encodeShotResult)

# Dropping the shot_result column
data.drop(columns=['shot_result'], inplace=True)

In [16]:
data.head()

Unnamed: 0,final_margin,shot_number,period,shot_clock,dribbles,touch_time,shot_dist,pts_type,close_def_dist,fgm,pts,defender_wingspan,close_def_dist_in,open_shot,minutes,seconds,shot_result_encoded
0,1.797845,-1.168389,-1.289067,-0.292136,-0.006715,-0.284492,-0.660541,-0.599944,-1.024154,1.100776,0.886569,-0.373512,-1.024154,-1.080561,-1.269292,-1.145761,1
1,1.797845,-0.956221,-1.289067,-1.604967,-0.5818,-0.645898,1.645698,1.666821,0.717225,-0.90845,-0.881819,-0.064315,0.717225,0.925445,-1.55947,-0.858488,0
2,1.797845,-0.744052,-1.289067,-0.026022,0.280827,-0.021652,-0.390542,-0.599944,-1.169269,-0.90845,-0.881819,-0.064315,-1.169269,-1.080561,-1.55947,-1.662853,0
3,1.797845,-0.531884,-0.411809,-0.380841,-0.006715,-0.284492,0.408204,-0.599944,-0.262301,-0.90845,-0.881819,-0.557558,-0.262301,0.925445,1.63248,1.037515,0
4,1.797845,-0.319716,-0.411809,-0.274395,-0.006715,-0.021652,-1.110539,-0.599944,-1.096712,-0.90845,-0.881819,0.117276,-1.096712,-1.080561,1.342303,0.290605,0


In [17]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128069 entries, 0 to 128068
Data columns (total 17 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   final_margin         128069 non-null  float64
 1   shot_number          128069 non-null  float64
 2   period               128069 non-null  float64
 3   shot_clock           128069 non-null  float64
 4   dribbles             128069 non-null  float64
 5   touch_time           128069 non-null  float64
 6   shot_dist            128069 non-null  float64
 7   pts_type             128069 non-null  float64
 8   close_def_dist       128069 non-null  float64
 9   fgm                  128069 non-null  float64
 10  pts                  128069 non-null  float64
 11  defender_wingspan    128069 non-null  float64
 12  close_def_dist_in    128069 non-null  float64
 13  open_shot            128069 non-null  float64
 14  minutes              128069 non-null  float64
 15  seconds          

#### Defining the model

In [18]:
# Define a simple neural network for classification
class SimpleClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleClassifier, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.sigmoid(x)
        return x

# Set random seed for reproducibility
torch.manual_seed(42)

<torch._C.Generator at 0x117333190>

In [19]:
# Define hyperparameters
input_size = 16  # For example, if working with 28x28 images
hidden_size = 128
output_size = 1  # For binary classification

learning_rate = 0.001
batch_size = 64
epochs = 10

#### Loading the data 

In [20]:
# Splitting the data into train and test.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

for train_index, test_index in split.split(data, data['shot_result_encoded']): 
    strat_train_set = data.iloc[train_index]
    strat_test_set = data.iloc[test_index]

In [21]:
# Defining the train and test dataset
train_data = strat_train_set.drop(columns=['shot_result_encoded'])
train_labels = strat_train_set[['shot_result_encoded']]

test_data = strat_test_set.drop(columns=['shot_result_encoded'])
test_labels = strat_test_set[['shot_result_encoded']]

In [22]:
# Converting to Tensors
X_train_tensor = torch.tensor(train_data.to_numpy(), dtype=torch.float32)
y_train_tensor = torch.tensor(train_labels.to_numpy(), dtype=torch.float32)

X_test_tensor = torch.tensor(test_data.to_numpy(), dtype=torch.float32)
y_test_tensor = torch.tensor(test_labels.to_numpy(), dtype=torch.float32)

In [23]:
# Load dataset (MNIST as an example)
transform = transforms.Compose([transforms.ToTensor()])

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True)

#### Initilizing the model

In [24]:
# Initialize model, loss function, and optimizer
model = SimpleClassifier(input_size, hidden_size, output_size)
criterion = nn.BCELoss()  # Binary Cross Entropy Loss for binary classification
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

#### Training the model

In [25]:
# Training loop
for epoch in range(epochs):
    for inputs, labels in train_loader:
        inputs = inputs.view(-1, input_size) 
        outputs = model(inputs)
        loss = criterion(outputs, labels.float().view(-1, 1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item()}')

Epoch 1/10, Loss: 0.00019442851771600544
Epoch 2/10, Loss: 4.083503517904319e-05
Epoch 3/10, Loss: 1.3453008250507992e-05
Epoch 4/10, Loss: 4.570167220663279e-06
Epoch 5/10, Loss: 1.8273751720698783e-06
Epoch 6/10, Loss: 8.605536550021498e-07
Epoch 7/10, Loss: 3.0327927902362717e-07
Epoch 8/10, Loss: 1.2784542491317552e-07
Epoch 9/10, Loss: 5.144890735664376e-08
Epoch 10/10, Loss: 2.0436184300365312e-08


#### Evaluating the model on the test data

In [26]:
# Load the pre-trained model

# model.load_state_dict(torch.load('path_to_your_model.pth'))
model.eval()

# Test the model
correct = 0
total = 0

with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        print(outputs)
        predicted = outputs.view(-1)
        total += labels.size(0)
        labels_flatten = labels.view(-1)
        correct += (predicted == labels_flatten).sum().item()

accuracy = correct / total
print(f'Test Accuracy: {accuracy}')

tensor([[2.5012e-08],
        [2.3088e-08],
        [2.2858e-08],
        [3.0809e-08],
        [1.0000e+00],
        [1.0000e+00],
        [2.1907e-08],
        [1.7342e-08],
        [1.0000e+00],
        [1.7663e-08],
        [1.0000e+00],
        [1.4321e-08],
        [2.2043e-08],
        [2.2519e-08],
        [3.3259e-08],
        [1.0000e+00],
        [1.0000e+00],
        [1.0000e+00],
        [1.0000e+00],
        [1.6652e-08],
        [1.0000e+00],
        [1.0000e+00],
        [1.0000e+00],
        [2.1413e-08],
        [1.0000e+00],
        [1.0000e+00],
        [1.0000e+00],
        [1.0000e+00],
        [1.0000e+00],
        [2.2110e-08],
        [1.0000e+00],
        [1.0000e+00],
        [1.9516e-08],
        [1.0000e+00],
        [1.4277e-08],
        [1.0000e+00],
        [1.9519e-08],
        [1.0000e+00],
        [1.3335e-08],
        [1.9015e-08],
        [1.0000e+00],
        [1.9919e-08],
        [2.2726e-08],
        [1.2122e-08],
        [1.0000e+00],
        [3