This notebook was compiled and combined from successful cells in previous drafts of this notebook/model.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

df = pd.read_csv('kobedata.csv')  # Load the dataset

# Step 1: Ensure 'game_date' is in datetime format and sort
# df['game_date'] = pd.to_datetime(df['game_date'])
df = df.sort_values(by=['game_date','game_event_id'])

# Step 2: Split the DataFrame
known_flag_df = df[df['shot_made_flag'].notna()]
unknown_flag_df = df[df['shot_made_flag'].isna()]

# Step 3: Calculate cumulative shot percentage for known data
known_flag_df['cumulative_percentage'] = known_flag_df.groupby('action_type')['shot_made_flag'].expanding().mean().reset_index(level=0, drop=True)

# Step 4: Iterate over the unknown entries to fill in cumulative percentage
for index, row in unknown_flag_df.iterrows():
    train_data = known_flag_df[known_flag_df['game_date'] < row['game_date']]
    
    if not train_data.empty:
        cumulative_percentage = train_data['shot_made_flag'].expanding().mean().iloc[-1]
        unknown_flag_df.at[index, 'cumulative_percentage'] = cumulative_percentage

# Combine DataFrames back
final_df = pd.concat([known_flag_df, unknown_flag_df], ignore_index=True)

# Step 5: Prepare features and target variable for training
X = final_df[['cumulative_percentage', 'shot_distance']]  # Add more features as needed
y = final_df['shot_made_flag'].dropna()

# Ensure X has corresponding y values
X = X.loc[y.index]

# Step 6: Train a model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Step 7: Predict missing values
unknown_features = unknown_flag_df[['cumulative_percentage', 'shot_distance']]  # Use the same features
predictions = model.predict(unknown_features)

# Assign predictions to the unknown_flag_df
unknown_flag_df['shot_made_flag'] = predictions


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  known_flag_df['cumulative_percentage'] = known_flag_df.groupby('action_type')['shot_made_flag'].expanding().mean().reset_index(level=0, drop=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unknown_flag_df.at[index, 'cumulative_percentage'] = cumulative_percentage
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view

In [5]:
unknown_flag_df[['shot_id','shot_made_flag']]

Unnamed: 0,shot_id,shot_made_flag
22906,22907,1.0
22908,22909,0.0
22925,22926,0.0
22926,22927,0.0
22929,22930,0.0
...,...,...
22876,22877,0.0
22886,22887,0.0
22891,22892,1.0
22898,22899,0.0


Here we are reassigning probability values since the competition wants to calculate log loss. Subsequently, we will export our submission file.

In [18]:
# Step 7: Predict probabilities for missing values
unknown_features = unknown_flag_df[['cumulative_percentage', 'shot_distance']]  # Use the same features
probabilities = model.predict_proba(unknown_features)

# Assign the probability of making the shot (class 1) to the unknown_flag_df
unknown_flag_df['shot_made_probability'] = probabilities[:, 1]  # Get probabilities for class 1



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unknown_flag_df['shot_made_probability'] = probabilities[:, 1]  # Get probabilities for class 1


In [7]:
unknown_flag_df[['shot_id','shot_made_flag','shot_made_probability']]

Unnamed: 0,shot_id,shot_made_flag,shot_made_probability
22906,22907,1.0,0.610000
22908,22909,0.0,0.333898
22925,22926,0.0,0.280000
22926,22927,0.0,0.020000
22929,22930,0.0,0.020000
...,...,...,...
22876,22877,0.0,0.060000
22886,22887,0.0,0.070000
22891,22892,1.0,0.773333
22898,22899,0.0,0.060000


In [8]:
unknown_flag_df.sort_values(by='shot_id')

Unnamed: 0,action_type,combined_shot_type,game_event_id,game_id,lat,loc_x,loc_y,lon,minutes_remaining,period,...,shot_zone_basic,shot_zone_range,team_id,team_name,game_date,matchup,opponent,shot_id,cumulative_percentage,shot_made_probability
0,Jump Shot,Jump Shot,10,20000012,33.9723,167,72,-118.1028,10,1,...,Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,1,0.448318,0.65
7,Jump Shot,Jump Shot,254,20000012,34.0163,1,28,-118.2688,8,3,...,Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,8,0.448318,0.84
16,Driving Layup Shot,Layup,100,20000019,34.0443,0,0,-118.2698,0,1,...,Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,2000-11-01,LAL vs. UTA,UTA,17,0.448307,0.38
19,Driving Layup Shot,Layup,249,20000019,34.0443,0,0,-118.2698,10,3,...,Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,2000-11-01,LAL vs. UTA,UTA,20,0.448307,0.38
32,Jump Shot,Jump Shot,4,20000047,33.9683,163,76,-118.1068,11,1,...,Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-11-04,LAL @ VAN,VAN,33,0.447847,0.07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30668,Jump Shot,Jump Shot,386,49900087,33.8223,-23,222,-118.2928,7,4,...,Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-06-16,LAL @ IND,IND,30669,0.450634,0.09
30680,Tip Shot,Tip Shot,213,49900088,34.0443,0,0,-118.2698,0,2,...,Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,2000-06-19,LAL vs. IND,IND,30681,0.449646,0.71
30682,Running Jump Shot,Jump Shot,226,49900088,33.9963,-68,48,-118.3378,11,3,...,In The Paint (Non-RA),8-16 ft.,1610612747,Los Angeles Lakers,2000-06-19,LAL vs. IND,IND,30683,0.449646,0.14
30686,Running Jump Shot,Jump Shot,268,49900088,33.9513,16,93,-118.2538,5,3,...,In The Paint (Non-RA),8-16 ft.,1610612747,Los Angeles Lakers,2000-06-19,LAL vs. IND,IND,30687,0.449646,0.11


In [10]:
submission=pd.DataFrame()

In [11]:
submission['shot_id']=unknown_flag_df['shot_id']

In [12]:
submission['shot_made_flag']=unknown_flag_df['shot_made_probability']

In [13]:
submission.head()

Unnamed: 0,shot_id,shot_made_flag
22906,22907,0.61
22908,22909,0.333898
22925,22926,0.28
22926,22927,0.02
22929,22930,0.02


In [15]:
submission.sort_values(by='shot_id')

Unnamed: 0,shot_id,shot_made_flag
0,1,0.65
7,8,0.84
16,17,0.38
19,20,0.38
32,33,0.07
...,...,...
30668,30669,0.09
30680,30681,0.71
30682,30683,0.14
30686,30687,0.11


In [17]:
submission.to_csv('kobeSubmission.csv', index=False)