# Basic Data PreProcessing

In [1]:
import pandas as pd

# Load the datasets
nba_data = pd.read_csv('nba_data.csv')
player_mvp_stats = pd.read_csv('player_mvp_stats.csv')


In [2]:
team_mapping = {
    'LAL': 'LAL', 'PHO': 'PHO', 'DAL': 'DAL', 'MIA': 'MIA', 'CLE': 'CLE', 'WSB': 'WSB', 'CHI': 'CHI', 'GSW': 'GSW',
    'IND': 'IND', 'WAS': 'WAS', 'MIN': 'MIN', 'BOS': 'BOS', 'HOU': 'HOU', 'DEN': 'DEN', 'ORL': 'ORL', 'NOH': 'NOP',
    'TOR': 'TOR', 'SAC': 'SAC', 'CHO': 'CHO', 'POR': 'POR', 'DET': 'DET', 'PHI': 'PHI', 'UTA': 'UTA', 'MIL': 'MIL',
    'VAN': 'MEM', 'SEA': 'OKC', 'NJN': 'BRK', 'NOK': 'NOP', 'LAC': 'LAC', 'OKC': 'OKC', 'ATL': 'ATL', 'CHA': 'CHO',
    'MEM': 'MEM', 'NYK': 'NYK', 'NOP': 'NOP', 'BRK': 'BRK', 'SAS': 'SAS', 'CHH': 'CHO'
}


In [3]:
player_mvp_stats['Tm'] = player_mvp_stats['Tm'].map(team_mapping)


In [4]:
combined_data = pd.merge(nba_data, player_mvp_stats, left_on='team', right_on='Tm', how='inner')


In [5]:
combined_data.head()


Unnamed: 0,Unnamed: 0_x,mp,mp.1,fg,fga,fg%,3p,3pa,3p%,ft,...,Pts Max,Share,Team,W,L,W/L%,GB,PS/G,PA/G,SRS
0,0,240.0,240.0,39.0,81.0,0.481,6.0,20.0,0.3,14.0,...,0.0,0.0,Atlanta Hawks,37,45,0.451,15.0,98.2,100.0,-2.23
1,0,240.0,240.0,39.0,81.0,0.481,6.0,20.0,0.3,14.0,...,0.0,0.0,Atlanta Hawks,37,45,0.451,15.0,98.2,100.0,-2.23
2,0,240.0,240.0,39.0,81.0,0.481,6.0,20.0,0.3,14.0,...,0.0,0.0,Atlanta Hawks,37,45,0.451,15.0,98.2,100.0,-2.23
3,0,240.0,240.0,39.0,81.0,0.481,6.0,20.0,0.3,14.0,...,0.0,0.0,Atlanta Hawks,37,45,0.451,15.0,98.2,100.0,-2.23
4,0,240.0,240.0,39.0,81.0,0.481,6.0,20.0,0.3,14.0,...,0.0,0.0,Atlanta Hawks,37,45,0.451,15.0,98.2,100.0,-2.23


In [6]:
# Select and display only the 'Player' and 'Tm' columns
player_and_team_data = combined_data[['Player', 'Tm']]
print(player_and_team_data)


                    Player   Tm
0                 Acie Law  ATL
1               Al Horford  ATL
2        Jeremy Richardson  ATL
3              Joe Johnson  ATL
4           Josh Childress  ATL
...                    ...  ...
8287936    Randolph Morris  NYK
8287937    Renaldo Balkman  NYK
8287938    Stephon Marbury  NYK
8287939    Wilson Chandler  NYK
8287940      Zach Randolph  NYK

[8287941 rows x 2 columns]


In [7]:
# Select and display only the 'team' and 'team_opp' columns
team_and_team_opp_data = combined_data[['team', 'team_opp']]
print(team_and_team_opp_data)


        team team_opp
0        ATL      DAL
1        ATL      DAL
2        ATL      DAL
3        ATL      DAL
4        ATL      DAL
...      ...      ...
8287936  NYK      MIL
8287937  NYK      MIL
8287938  NYK      MIL
8287939  NYK      MIL
8287940  NYK      MIL

[8287941 rows x 2 columns]


# Adding Weights

In [8]:
# Get all player names

unique_players = combined_data['Player'].unique()

pd.set_option('display.max_rows', None)

print(unique_players)

pd.reset_option('display.max_rows')


['Acie Law' 'Al Horford' 'Jeremy Richardson' ... 'Stuart Gray'
 'Maurice Ndour' 'Mindaugas Kuzminskas']


In [9]:
# Sample mapping of player names to their corresponding weights
player_weights = {
    'Nikola Jokic': 0.98,
    'Joel Embiid': 0.97,
    'Stephen Curry': 0.97,
    'Luka Doncic': 0.96,
    'Giannis Antetokounmpo': 0.96,
    'LeBron James': 0.96,
    'Kevin Durant': 0.96,
    'Jayson Tatum': 0.95,
    'Devin Booker': 0.94,
    'Jimmy Butler': 0.94,
    'Donovan Mitchell': 0.93,
    'Damian Lillard': 0.93,
    'Anthony Davis': 0.93,
    'Shai Gilgeous-Alexander': 0.93,
    'Kawhi Leonard': 0.92,
    'Ja Morant': 0.92,
    'Anthony Edwards': 0.90,
    'Tyrese Haliburton': 0.90,
    'Zion Williamson': 0.89,
    'James Harden': 0.89,
    'Kyrie Irving': 0.89,
    'Paul George': 0.89,
    'Jaylen Brown': 0.89,
    'Trae Young': 0.88,
    'Jamal Murray': 0.88,
    'Jalen Brunson': 0.88,
    'De’Aaron Fox': 0.88,
    'Bam Adebayo': 0.87,
    'Tyrese Maxey': 0.87,
    'Kristaps Porzingis': 0.87,
    'Victor Wembanyama': 0.86,
    'Scottie Barnes': 0.86,
    'Bradley Beal': 0.86,
    'Dejounte Murray': 0.86,
    'Jaren Jackson Jr.': 0.86,
    'Domantas Sabonis': 0.86,
    'Lauri Markkanen': 0.86,
    'Jrue Holiday': 0.86,
    'Mikal Bridges': 0.86,
    'Evan Mobley': 0.85,
    'Cade Cunningham': 0.85,
    'LaMelo Ball': 0.85,
    'Darius Garland': 0.85,
    'Karl-Anthony Towns': 0.85,
    'DeMar DeRozan': 0.85,
    'Pascal Siakam': 0.85,
    'Tyler Herro': 0.85,
    'C.J. McCollum': 0.85,
    'Khris Middleton': 0.85,
    'Desmond Bane': 0.85,
    'Paolo Banchero': 0.84,
    'Rudy Gobert': 0.84,
    'Zach LaVine': 0.84,
    'Brandon Ingram': 0.84,
    'Aaron Gordon': 0.84,
    'Nicolas Claxton': 0.84,
    'Kyle Kuzma': 0.84,
    'Alperen Sengun': 0.84,
    'Klay Thompson': 0.83,
    'Chet Holmgren': 0.83,
    'Draymond Green': 0.83,
    'R.J. Barrett': 0.83,
    'Jarrett Allen': 0.83,
    'Michael Porter Jr.': 0.83,
    'Julius Randle': 0.83,
    'Anfernee Simons': 0.83,
    'Jalen Duren': 0.83,
    'Derrick White': 0.83,
    'Malcolm Brogdon': 0.83,
    'Myles Turner': 0.83,
    'Chris Paul': 0.82,
    'Jalen Green': 0.82,
    'Shaedon Sharpe': 0.82,
    'Ausar Thompson': 0.82,
    'Deandre Ayton': 0.82,
    'Franz Wagner': 0.82,
    'OG Anunoby': 0.82,
    'Fred VanVleet': 0.82,
    'Jonas Valanciunas': 0.82,
    'Nikola Vucevic': 0.82,
    'Tobias Harris': 0.82,
    'Jalen Williams': 0.82,
    'Marcus Smart': 0.82,
    'Malik Monk': 0.82,
    'Brook Lopez': 0.82,
    'Bobby Portis': 0.82,
    'Mitchell Robinson': 0.82,
    'Cam Thomas': 0.82,
    'Devin Vassell': 0.82,
    'Bojan Bogdanovic': 0.82,
    'Austin Reaves': 0.82,
    'Josh Giddey': 0.81,
    'Lonzo Ball': 0.81,
    'Jordan Poole': 0.81,
    'Russell Westbrook': 0.81,
    'Robert Williams III': 0.81,
    'Clint Capela': 0.81,
    'John Collins': 0.81,
    'Jerami Grant': 0.81,
    'Onyeka Okongwu': 0.81
}

# Add player weights to combined_data
combined_data['player_weight'] = combined_data['Player'].map(player_weights)


In [10]:
default_weight = 0.5

combined_data['player_weight'] = combined_data['Player'].map(player_weights).fillna(default_weight)

print(combined_data[['Player', 'player_weight']])


                    Player  player_weight
0                 Acie Law            0.5
1               Al Horford            0.5
2        Jeremy Richardson            0.5
3              Joe Johnson            0.5
4           Josh Childress            0.5
...                    ...            ...
8287936    Randolph Morris            0.5
8287937    Renaldo Balkman            0.5
8287938    Stephon Marbury            0.5
8287939    Wilson Chandler            0.5
8287940      Zach Randolph            0.5

[8287941 rows x 2 columns]


In [11]:
lebron_weight = combined_data.loc[combined_data['Player'] == 'LeBron James', 'player_weight'].values[0]
steph_weight = combined_data.loc[combined_data['Player'] == 'Stephen Curry', 'player_weight'].values[0]

print(lebron_weight)
print(steph_weight)


0.96
0.97


In [12]:
team_weights = {
  'pts': 0.833,
  'fg%_max': 1.0,
  '3p%_max': 0.667,
  'ft%_max': 0.417,
  'trb_max': 1.0,
  'ast_max': 0.917,
  'stl_max': 0.75,
  'blk_max': 0.75
}


for col, weight in team_weights.items():
  combined_data[col] = combined_data[col] * weight

print(combined_data.head())


   Unnamed: 0_x     mp   mp.1    fg   fga    fg%   3p   3pa  3p%    ft  ...  \
0             0  240.0  240.0  39.0  81.0  0.481  6.0  20.0  0.3  14.0  ...   
1             0  240.0  240.0  39.0  81.0  0.481  6.0  20.0  0.3  14.0  ...   
2             0  240.0  240.0  39.0  81.0  0.481  6.0  20.0  0.3  14.0  ...   
3             0  240.0  240.0  39.0  81.0  0.481  6.0  20.0  0.3  14.0  ...   
4             0  240.0  240.0  39.0  81.0  0.481  6.0  20.0  0.3  14.0  ...   

   Share           Team   W   L   W/L%    GB  PS/G   PA/G   SRS  player_weight  
0    0.0  Atlanta Hawks  37  45  0.451  15.0  98.2  100.0 -2.23            0.5  
1    0.0  Atlanta Hawks  37  45  0.451  15.0  98.2  100.0 -2.23            0.5  
2    0.0  Atlanta Hawks  37  45  0.451  15.0  98.2  100.0 -2.23            0.5  
3    0.0  Atlanta Hawks  37  45  0.451  15.0  98.2  100.0 -2.23            0.5  
4    0.0  Atlanta Hawks  37  45  0.451  15.0  98.2  100.0 -2.23            0.5  

[5 rows x 194 columns]


In [14]:
print(combined_data['won'])


0           True
1           True
2           True
3           True
4           True
           ...  
8287936    False
8287937    False
8287938    False
8287939    False
8287940    False
Name: won, Length: 8287941, dtype: bool


In [15]:
combined_data['won'] = combined_data['won'].astype(int)


In [16]:
print(combined_data['won'])


0          1
1          1
2          1
3          1
4          1
          ..
8287936    0
8287937    0
8287938    0
8287939    0
8287940    0
Name: won, Length: 8287941, dtype: int64


In [19]:
print(combined_data['won'])


0          1
1          1
2          1
3          1
4          1
          ..
8287936    0
8287937    0
8287938    0
8287939    0
8287940    0
Name: won, Length: 8287941, dtype: int64


# Neural Network Feedforward Functional API

In [20]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Extract features and target variable
X = combined_data[['pts', 'fg%_max', '3p%_max', 'ft%_max', 'trb_max', 'ast_max', 'stl_max', 'blk_max', 'player_weight']]
y = combined_data['won']  # Use the 'won' column as the target variable

# Split the data into training and testing sets (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Data preprocessing
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # Scale the features
X_test_scaled = scaler.transform(X_test)

# Define the model architecture using the Functional API
num_features = X_train_scaled.shape[1]

input_layer = tf.keras.layers.Input(shape=(num_features,))
hidden1 = tf.keras.layers.Dense(128, activation='relu')(input_layer)
dropout1 = tf.keras.layers.Dropout(0.3)(hidden1)
hidden2 = tf.keras.layers.Dense(64, activation='relu')(dropout1)
dropout2 = tf.keras.layers.Dropout(0.3)(hidden2)
hidden3 = tf.keras.layers.Dense(32, activation='relu')(dropout2)
player_weight_input = tf.keras.layers.Input(shape=(1,))
merged = tf.keras.layers.Concatenate()([hidden3, player_weight_input])
output_layer = tf.keras.layers.Dense(1, activation='sigmoid')(merged)

model = tf.keras.models.Model(inputs=[input_layer, player_weight_input], outputs=output_layer)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# Train the model
model.fit([X_train_scaled, X_train['player_weight']], y_train,
          epochs=50, batch_size=32,
          validation_data=([X_test_scaled, X_test['player_weight']], y_test))

# Prediction
# For new predictions, you would preprocess the new data (let's call it new_data) in the same way as X_train
# Example:
# new_data_scaled = scaler.transform(new_data_features)
# predictions = model.predict([new_data_scaled, new_data_player_weight])


2023-11-19 19:17:41.231724: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 9)]                  0         []                            
                                                                                                  
 dense (Dense)               (None, 128)                  1280      ['input_1[0][0]']             
                                                                                                  
 dropout (Dropout)           (None, 128)                  0         ['dense[0][0]']               
                                                                                                  
 dense_1 (Dense)             (None, 64)                   8256      ['dropout[0][0]']             
                                                                                              

<keras.src.callbacks.History at 0x135d41950>

In [28]:
import numpy as np


In [34]:
def get_team_data(team_name):
    # Filter the DataFrame for rows matching the team name
    team_data_filtered = combined_data[(combined_data['team'] == team_name) | (combined_data['team_opp'] == team_name)]

    # Check if the filtered DataFrame is empty
    if team_data_filtered.empty:
        raise ValueError(f"No data found for team: {team_name}")

    # Select only the numeric columns relevant for the prediction
    numeric_columns = ['pts', 'fg%_max', '3p%_max', 'ft%_max', 'trb_max', 'ast_max', 'stl_max', 'blk_max']
    team_data_numeric = team_data_filtered[numeric_columns]

    # Calculate the mean of the numeric columns
    team_data_mean = team_data_numeric.mean()
    average_player_weight = combined_data[combined_data['Tm'] == team_name]['player_weight'].mean()
    return team_data_mean, average_player_weight

def predict_game_outcome(team1_name, team2_name):
    # Retrieve data for each team
    team1_data, team1_player_weight = get_team_data(team1_name)
    team2_data, team2_player_weight = get_team_data(team2_name)

    # Prepare the data for prediction by extracting the relevant features
    features_team1 = [
        team1_data['pts'], team1_data['fg%_max'], team1_data['3p%_max'],
        team1_data['ft%_max'], team1_data['trb_max'], team1_data['ast_max'],
        team1_data['stl_max'], team1_data['blk_max'], team1_player_weight
    ]

    features_team2 = [
        team2_data['pts'], team2_data['fg%_max'], team2_data['3p%_max'],
        team2_data['ft%_max'], team2_data['trb_max'], team2_data['ast_max'],
        team2_data['stl_max'], team2_data['blk_max'], team2_player_weight
    ]

    # Combine into a single array and reshape
    input_features = np.array([features_team1, features_team2]).reshape(2, -1)

    # Scale the features including player_weight
    input_features_scaled = scaler.transform(input_features)

    # Separate the scaled player_weight from the main features
    main_input_features_scaled = input_features_scaled[:, :-1]
    player_weight_features = input_features_scaled[:, -1].reshape(-1, 1)

    # Predict the outcome using both inputs
    predictions = model.predict([main_input_features_scaled, player_weight_features])

    # Interpret the prediction for team 1
    if predictions[0] > 0.5:
        return f"{team1_name} is predicted to win against {team2_name}"
    else:
        return f"{team2_name} is predicted to win against {team1_name}"

# User interaction and prediction call
team1 = input("Enter team one: ")
team2 = input("Enter team two: ")

prediction = predict_game_outcome(team1, team2)
print(prediction)



ValueError: in user code:

    File "/Users/purthasmacbookpro/miniconda3/envs/test/lib/python3.11/site-packages/keras/src/engine/training.py", line 2416, in predict_function  *
        return step_function(self, iterator)
    File "/Users/purthasmacbookpro/miniconda3/envs/test/lib/python3.11/site-packages/keras/src/engine/training.py", line 2401, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/purthasmacbookpro/miniconda3/envs/test/lib/python3.11/site-packages/keras/src/engine/training.py", line 2389, in run_step  **
        outputs = model.predict_step(data)
    File "/Users/purthasmacbookpro/miniconda3/envs/test/lib/python3.11/site-packages/keras/src/engine/training.py", line 2357, in predict_step
        return self(x, training=False)
    File "/Users/purthasmacbookpro/miniconda3/envs/test/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/Users/purthasmacbookpro/miniconda3/envs/test/lib/python3.11/site-packages/keras/src/engine/input_spec.py", line 298, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "model" is incompatible with the layer: expected shape=(None, 9), found shape=(None, 8)


In [35]:
combined_data.head()

Unnamed: 0,Unnamed: 0_x,mp,mp.1,fg,fga,fg%,3p,3pa,3p%,ft,...,Share,Team,W,L,W/L%,GB,PS/G,PA/G,SRS,player_weight
0,0,240.0,240.0,39.0,81.0,0.481,6.0,20.0,0.3,14.0,...,0.0,Atlanta Hawks,37,45,0.451,15.0,98.2,100.0,-2.23,0.5
1,0,240.0,240.0,39.0,81.0,0.481,6.0,20.0,0.3,14.0,...,0.0,Atlanta Hawks,37,45,0.451,15.0,98.2,100.0,-2.23,0.5
2,0,240.0,240.0,39.0,81.0,0.481,6.0,20.0,0.3,14.0,...,0.0,Atlanta Hawks,37,45,0.451,15.0,98.2,100.0,-2.23,0.5
3,0,240.0,240.0,39.0,81.0,0.481,6.0,20.0,0.3,14.0,...,0.0,Atlanta Hawks,37,45,0.451,15.0,98.2,100.0,-2.23,0.5
4,0,240.0,240.0,39.0,81.0,0.481,6.0,20.0,0.3,14.0,...,0.0,Atlanta Hawks,37,45,0.451,15.0,98.2,100.0,-2.23,0.5
