In [1]:
import pandas as pd
import numpy as np


In [2]:
df = pd.read_excel("/Users/jacob/Downloads/Dataset.xlsx")

In [3]:
df['Home'] = df['Match Up'].apply(lambda x: 0 if '@' in x else (1 if 'vs.' in x else None))
df['Game Date'] = pd.to_datetime(df['Game Date'], format='%m/%d/%Y')
df['W/L'] = df['W/L'].apply(lambda x: 1 if x == 'W' else 0)

In [4]:
def get_opponent_row(df, current_row):
    # Extract game details
    match_up = current_row['Match Up']
    game_date = current_row['Game Date']
    current_team = current_row['Team']

    # Identify the opposing team
    team_A, team_B = match_up[:3], match_up[-3:]
    opponent_team = team_B if current_team == team_A else team_A

    # Find the row corresponding to the opponent
    opponent_row = df[
        (df['Game Date'] == game_date) & (df['Team'] == opponent_team)
    ]

    return opponent_row

def specific_win_rate(df):
    for i in range(len(df)):
        row = df.iloc[i,:]
        team_A = row["Match Up"][:3]
        team_B = row["Match Up"][-3:]
        other_games = df[df['Match Up'].str.contains(team_A) & df['Match Up'].str.contains(team_B) & (df["Team"] == row["Team"])]
        date = row["Game Date"]
        prior_games = other_games[other_games["Game Date"] < date]
        if(len(prior_games) == 0):
            prior_rate = 50
        else: 
            prior_wins = (prior_games["W/L"]).sum()
            prior_rate = (prior_wins/len(prior_games)) * 100
        df.loc[i,"Specific Win Percent"] = prior_rate

specific_win_rate(df)

def general_win_percent(df):
    for i in range(len(df)):
        row = df.iloc[i,:]
        team_A = row["Match Up"][:3]
        team_B = row["Match Up"][-3:]
        team_games = df[df["Team"] == row["Team"]]
        date = row["Game Date"]
        prior_games = team_games[team_games["Game Date"] < date]
        if(len(prior_games) == 0):
            prior_rate = 50
        else:
            prior_wins = (prior_games["W/L"]).sum()
            prior_rate = (prior_wins/len(prior_games)) * 100
        df.loc[i,"General Win Percent"] = prior_rate

general_win_percent(df)

def PTS_Var(df):
    for i in range(len(df)):
        row = df.iloc[i,:]
        team = row["Team"]
        team_games = df[df["Team"] == row["Team"]]
        date = row["Game Date"]
        prior_games = team_games[team_games["Game Date"] < date]
        if(len(prior_games) == 0):
            PTS_Var = 0
        else:
            PTS_Var = prior_games["PTS"].var()
        df.loc[i,"PTS Var"] = PTS_Var

PTS_Var(df)

'''def PTS_Var_Diff(df):
    df["PTS Var Diff"] = np.nan  

    for i in range(len(df)): 
            row = df.iloc[i,:]
            my_pts_var = row["PTS Var"]
            opponent = get_opponent_row(df, row)
            opp_pts_var = opponent["PTS Var"]
            diff = my_pts_var - opp_pts_var
            df.loc[i,"PTS Var Diff"] = diff.iloc[0]


PTS_Var_Diff(df)'''

'def PTS_Var_Diff(df):\n    df["PTS Var Diff"] = np.nan  \n\n    for i in range(len(df)): \n            row = df.iloc[i,:]\n            my_pts_var = row["PTS Var"]\n            opponent = get_opponent_row(df, row)\n            opp_pts_var = opponent["PTS Var"]\n            diff = my_pts_var - opp_pts_var\n            df.loc[i,"PTS Var Diff"] = diff.iloc[0]\n\n\nPTS_Var_Diff(df)'

In [5]:
def last_5_win_percent(df):
    for i in range(len(df)):
        row = df.iloc[i, :]
        team_A = row["Match Up"][:3]
        team_B = row["Match Up"][-3:]
        team_games = df[df["Team"] == row["Team"]]
        date = row["Game Date"]
        prior_games = team_games[team_games["Game Date"] < date]
        
        if len(prior_games) == 0:
            # Default to 50% if no prior games exist
            prior_rate = 50
        elif len(prior_games) < 5:
            # Calculate win percentage for all prior games
            prior_wins = (prior_games["W/L"]).sum()
            prior_rate = (prior_wins / len(prior_games)) * 100
        else:
            # Sort and calculate win percentage for the last 5 prior games
            sorted_df = prior_games.sort_values(by='Game Date')
            last5 = sorted_df.tail(5)
            prior_wins = (last5["W/L"]).sum()
            prior_rate = (prior_wins / len(last5)) * 100
        
        # Assign the calculated value
        df.at[i, "Last 5 Win Percent"] = prior_rate

last_5_win_percent(df)

def LFWPD(df):
     for i in range(len(df)):
        row = df.iloc[i,:]
        my_lfwp = row["Last 5 Win Percent"]
        opponent = get_opponent_row(df, row)
        opp_lfwp = opponent["Last 5 Win Percent"]
        diff = my_lfwp - opp_lfwp
        df.loc[i,"LFWP Diff"] = diff.iloc[0]

LFWPD(df)

def calculate_avg_pts(df):
    for i in range(len(df)):
        row = df.iloc[i, :]
        team_games = df[df["Team"] == row["Team"]]
        date = row["Game Date"]

        # Get prior games for the current team
        prior_games = team_games[team_games["Game Date"] < date]
        if len(prior_games) == 0:
            avg_pts = 0  # Default if no prior games
        else:
            avg_pts = prior_games["PTS"].mean()  # Calculate average points

        # Add the calculated average to a new column
        df.loc[i, "Avg PTS"] = avg_pts

calculate_avg_pts(df)

def calculate_avg_plus_minus(df):
    df["Avg +/-"] = None  # Initialize the new column

    for i in range(len(df)):
        row = df.iloc[i]
        current_team = row["Team"]
        game_date = row["Game Date"]

        # Filter for prior games of the same team
        prior_games = df[(df["Team"] == current_team) & (df["Game Date"] < game_date)]

        if len(prior_games) == 0:
            avg_plus_minus = 0  # Default if no prior games
        else:
            avg_plus_minus = prior_games["+/-"].mean()

        # Assign the calculated average
        df.loc[i, "Avg +/-"] = avg_plus_minus

calculate_avg_plus_minus(df)

def calculate_avg_points_conceded(df):
    df["Avg Points Conceded"] = 0  # Initialize the new column with 0

    for i in range(len(df)):
        row = df.iloc[i]
        current_team = row["Team"]
        game_date = row["Game Date"]

        # Find prior games where the current team played
        prior_games = df[(df["Team"] != current_team) & (df["Match Up"].str.contains(current_team)) & (df["Game Date"] < game_date)]

        if len(prior_games) == 0:
            avg_points_conceded = 0  # Default if no prior games
        else:
            # Use the PTS column from the opponent in those games
            avg_points_conceded = prior_games["PTS"].mean()

        # Assign the result
        df.loc[i, "Avg Points Conceded"] = avg_points_conceded

calculate_avg_points_conceded(df)

def weighted_win_rate(df, alpha = .6):
    for i in range(len(df)):
        row = df.iloc[i,:]
        team = row["Team"]
        team_games = df[df["Team"] == row["Team"]]
        date = row["Game Date"]
        prior_games = team_games[team_games["Game Date"] < date]
        if len(prior_games) == 0:
            weighted_win_rate = 50
        else:
            weights = []
            for j in range(len(prior_games)):   
                mult = len(prior_games) - j
                weights.append(alpha ** mult)
            weights = np.array(weights)
            weights = weights/sum(weights)
            weighted_win_rate = sum(weights * prior_games["W/L"])
        df.loc[i,"Weighted General Win Percent"] = weighted_win_rate

weighted_win_rate(df, .2)


In [6]:
'''def calculate_win_rate_difference(df):
    df["Win Rate Difference"] = None  # Initialize the new column

    for i in range(len(df)):
        row = df.iloc[i]
        team_A = row["Match Up"][:3]
        team_B = row["Match Up"][-3:]
        current_team = row["Team"]
        opponent_team = team_B if team_A == current_team else team_A

        # Find the opponent's General Win Percent
        opponent_row = df[
            (df["Team"] == opponent_team) &
            (df["Game Date"] == row["Game Date"])
        ]

        if not opponent_row.empty:
            opponent_rate = opponent_row["General Win Percent"].values[0]
        else:
            opponent_rate = 50  # Default if no opponent info

        # Calculate the difference
        current_team_rate = row["General Win Percent"]
        df.loc[i, "Win Rate Difference"] = current_team_rate - opponent_rate

calculate_win_rate_difference(df)'''


'def calculate_win_rate_difference(df):\n    df["Win Rate Difference"] = None  # Initialize the new column\n\n    for i in range(len(df)):\n        row = df.iloc[i]\n        team_A = row["Match Up"][:3]\n        team_B = row["Match Up"][-3:]\n        current_team = row["Team"]\n        opponent_team = team_B if team_A == current_team else team_A\n\n        # Find the opponent\'s General Win Percent\n        opponent_row = df[\n            (df["Team"] == opponent_team) &\n            (df["Game Date"] == row["Game Date"])\n        ]\n\n        if not opponent_row.empty:\n            opponent_rate = opponent_row["General Win Percent"].values[0]\n        else:\n            opponent_rate = 50  # Default if no opponent info\n\n        # Calculate the difference\n        current_team_rate = row["General Win Percent"]\n        df.loc[i, "Win Rate Difference"] = current_team_rate - opponent_rate\n\ncalculate_win_rate_difference(df)'

In [7]:
'''def calculate_pts_diff(df):
    for i in range(len(df)):
        row = df.iloc[i]
        team_A = row["Match Up"][:3]
        team_B = row["Match Up"][-3:]
        current_team = row["Team"]
        opponent_team = team_B if team_A == current_team else team_A

        # Get the opponent's Avg PTS
        opponent_row = df[
            (df["Team"] == opponent_team) &
            (df["Game Date"] == row["Game Date"])
        ]

        if not opponent_row.empty:
            opponent_avg_pts = opponent_row["Avg PTS"].values[0]
        else:
            opponent_avg_pts = 0  # Default if no data for opponent

        # Calculate the PTS difference
        pts_diff = row["Avg PTS"] - opponent_avg_pts
        df.loc[i, "Avg PTS Difference"] = pts_diff

# Apply the function
calculate_pts_diff(df)
'''

'def calculate_pts_diff(df):\n    for i in range(len(df)):\n        row = df.iloc[i]\n        team_A = row["Match Up"][:3]\n        team_B = row["Match Up"][-3:]\n        current_team = row["Team"]\n        opponent_team = team_B if team_A == current_team else team_A\n\n        # Get the opponent\'s Avg PTS\n        opponent_row = df[\n            (df["Team"] == opponent_team) &\n            (df["Game Date"] == row["Game Date"])\n        ]\n\n        if not opponent_row.empty:\n            opponent_avg_pts = opponent_row["Avg PTS"].values[0]\n        else:\n            opponent_avg_pts = 0  # Default if no data for opponent\n\n        # Calculate the PTS difference\n        pts_diff = row["Avg PTS"] - opponent_avg_pts\n        df.loc[i, "Avg PTS Difference"] = pts_diff\n\n# Apply the function\ncalculate_pts_diff(df)\n'

In [8]:
'''def calculate_plus_minus_difference(df):
    df["+/- Difference"] = None  # Initialize the new column

    for i in range(len(df)):
        row = df.iloc[i]
        team_A = row["Match Up"][:3]
        team_B = row["Match Up"][-3:]
        current_team = row["Team"]
        opponent_team = team_B if team_A == current_team else team_A

        # Find the Avg +/- of the opponent
        opponent_avg_plus_minus = df.loc[
            (df["Team"] == opponent_team) & (df["Game Date"] == row["Game Date"]),
            "Avg +/-"
        ]
        
        if not opponent_avg_plus_minus.empty:
            opponent_avg_plus_minus = opponent_avg_plus_minus.values[0]
        else:
            opponent_avg_plus_minus = 0  # Default if no opponent info

        # Calculate the difference
        df.loc[i, "+/- Difference"] = row["Avg +/-"] - opponent_avg_plus_minus

calculate_plus_minus_difference(df)'''


'def calculate_plus_minus_difference(df):\n    df["+/- Difference"] = None  # Initialize the new column\n\n    for i in range(len(df)):\n        row = df.iloc[i]\n        team_A = row["Match Up"][:3]\n        team_B = row["Match Up"][-3:]\n        current_team = row["Team"]\n        opponent_team = team_B if team_A == current_team else team_A\n\n        # Find the Avg +/- of the opponent\n        opponent_avg_plus_minus = df.loc[\n            (df["Team"] == opponent_team) & (df["Game Date"] == row["Game Date"]),\n            "Avg +/-"\n        ]\n        \n        if not opponent_avg_plus_minus.empty:\n            opponent_avg_plus_minus = opponent_avg_plus_minus.values[0]\n        else:\n            opponent_avg_plus_minus = 0  # Default if no opponent info\n\n        # Calculate the difference\n        df.loc[i, "+/- Difference"] = row["Avg +/-"] - opponent_avg_plus_minus\n\ncalculate_plus_minus_difference(df)'

In [9]:
df.columns

Index(['Team', 'Match Up', 'Game Date', 'W/L', 'MIN', 'PTS', 'FGM', 'FGA',
       'FG%', '3PM', '3PA', '3P%', 'FTM', 'FTA', 'FT%', 'OREB', 'DREB', 'REB',
       'AST', 'STL', 'BLK', 'TOV', 'PF', '+/-', 'Home', 'Specific Win Percent',
       'General Win Percent', 'PTS Var', 'Last 5 Win Percent', 'LFWP Diff',
       'Avg PTS', 'Avg +/-', 'Avg Points Conceded',
       'Weighted General Win Percent'],
      dtype='object')

In [10]:
def opp_feature_calc(df, feature):
    for i in range(len(df)):
        row = df.iloc[i,:]
        my_val = row[feature]
        opponent = get_opponent_row(df, row)
        opp_val = opponent[feature].iloc[0]
        opp_name = "Opp " + feature
        df.loc[i,opp_name] = opp_val
        diff_name = "Diff " + feature
        diff = my_val - opp_val
        df.loc[i, diff_name] = diff


analysis_features = ['General Win Percent', 'Avg PTS', 'Avg +/-', 'Avg Points Conceded', 'Weighted General Win Percent']

for feature in analysis_features:
    opp_feature_calc(df, feature)



"""def PTS_Var_Diff(df):
    df["PTS Var Diff"] = np.nan  

    for i in range(len(df)): 
            row = df.iloc[i,:]
            my_pts_var = row["PTS Var"]
            opponent = get_opponent_row(df, row)
            opp_pts_var = opponent["PTS Var"]
            diff = my_pts_var - opp_pts_var
            df.loc[i,"PTS Var Diff"] = diff.iloc[0]


PTS_Var_Diff(df)"""

'def PTS_Var_Diff(df):\n    df["PTS Var Diff"] = np.nan  \n\n    for i in range(len(df)): \n            row = df.iloc[i,:]\n            my_pts_var = row["PTS Var"]\n            opponent = get_opponent_row(df, row)\n            opp_pts_var = opponent["PTS Var"]\n            diff = my_pts_var - opp_pts_var\n            df.loc[i,"PTS Var Diff"] = diff.iloc[0]\n\n\nPTS_Var_Diff(df)'

In [11]:
df.columns

Index(['Team', 'Match Up', 'Game Date', 'W/L', 'MIN', 'PTS', 'FGM', 'FGA',
       'FG%', '3PM', '3PA', '3P%', 'FTM', 'FTA', 'FT%', 'OREB', 'DREB', 'REB',
       'AST', 'STL', 'BLK', 'TOV', 'PF', '+/-', 'Home', 'Specific Win Percent',
       'General Win Percent', 'PTS Var', 'Last 5 Win Percent', 'LFWP Diff',
       'Avg PTS', 'Avg +/-', 'Avg Points Conceded',
       'Weighted General Win Percent', 'Opp General Win Percent',
       'Diff General Win Percent', 'Opp Avg PTS', 'Diff Avg PTS',
       'Opp Avg +/-', 'Diff Avg +/-', 'Opp Avg Points Conceded',
       'Diff Avg Points Conceded', 'Opp Weighted General Win Percent',
       'Diff Weighted General Win Percent'],
      dtype='object')

In [12]:
df.fillna(0, inplace=True)
df.replace('-', 0, inplace=True)
df

df = df.sort_values(by="Game Date")

y = df["W/L"]
X = df.loc[:, ["Home", 'Diff Weighted General Win Percent', 'Avg PTS', "Opp Avg PTS", "Avg Points Conceded", "Diff Avg Points Conceded", "Diff General Win Percent", "PTS Var"]]

X = X.iloc[30:]
y = y.iloc[30:]

X_train = X.iloc[:1944]
X_test = X.iloc[1944:]

y_train = y.iloc[:1944]
y_test = y.iloc[1944:]





In [13]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split

In [14]:
def has_non_numeric(df):
    non_numeric_cols = []
    for col in df.columns:
        try:
            pd.to_numeric(df[col])
        except (ValueError, TypeError):
            non_numeric_cols.append(col)
    return non_numeric_cols

has_non_numeric(df)

['Team', 'Match Up']

In [15]:
y.sum()

1215

In [16]:
df.columns

Index(['Team', 'Match Up', 'Game Date', 'W/L', 'MIN', 'PTS', 'FGM', 'FGA',
       'FG%', '3PM', '3PA', '3P%', 'FTM', 'FTA', 'FT%', 'OREB', 'DREB', 'REB',
       'AST', 'STL', 'BLK', 'TOV', 'PF', '+/-', 'Home', 'Specific Win Percent',
       'General Win Percent', 'PTS Var', 'Last 5 Win Percent', 'LFWP Diff',
       'Avg PTS', 'Avg +/-', 'Avg Points Conceded',
       'Weighted General Win Percent', 'Opp General Win Percent',
       'Diff General Win Percent', 'Opp Avg PTS', 'Diff Avg PTS',
       'Opp Avg +/-', 'Diff Avg +/-', 'Opp Avg Points Conceded',
       'Diff Avg Points Conceded', 'Opp Weighted General Win Percent',
       'Diff Weighted General Win Percent'],
      dtype='object')

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from sklearn.preprocessing import StandardScaler


# Initialize the logistic regression model
logistic_model = LogisticRegression()

# Fit the model to the training data
logistic_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = logistic_model.predict(X_test)

# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred) * 100
print(f"Accuracy: {accuracy:.2f}%")

Accuracy: 67.49%


In [18]:
import pandas as pd
import numpy as np

# Correlation of features with the target variable
correlations = pd.DataFrame({
    'Feature': X.columns,
    'Correlation with Target': [np.corrcoef(X[col], y)[0, 1] for col in X.columns]
}).sort_values(by='Correlation with Target', ascending=False)

print(correlations)

                             Feature  Correlation with Target
6           Diff General Win Percent                 0.324472
1  Diff Weighted General Win Percent                 0.148317
2                            Avg PTS                 0.120870
0                               Home                 0.086420
7                            PTS Var                 0.043648
3                        Opp Avg PTS                -0.104708
4                Avg Points Conceded                -0.138681
5           Diff Avg Points Conceded                -0.222922


In [19]:
df.columns


Index(['Team', 'Match Up', 'Game Date', 'W/L', 'MIN', 'PTS', 'FGM', 'FGA',
       'FG%', '3PM', '3PA', '3P%', 'FTM', 'FTA', 'FT%', 'OREB', 'DREB', 'REB',
       'AST', 'STL', 'BLK', 'TOV', 'PF', '+/-', 'Home', 'Specific Win Percent',
       'General Win Percent', 'PTS Var', 'Last 5 Win Percent', 'LFWP Diff',
       'Avg PTS', 'Avg +/-', 'Avg Points Conceded',
       'Weighted General Win Percent', 'Opp General Win Percent',
       'Diff General Win Percent', 'Opp Avg PTS', 'Diff Avg PTS',
       'Opp Avg +/-', 'Diff Avg +/-', 'Opp Avg Points Conceded',
       'Diff Avg Points Conceded', 'Opp Weighted General Win Percent',
       'Diff Weighted General Win Percent'],
      dtype='object')

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Initialize the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=1)

# Fit the model using the training data
rf_model.fit(X_train, y_train)

# Predict on the test set
y_pred = rf_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred) * 100
print(f"Random Forest Accuracy: {accuracy:.2f}%")



Random Forest Accuracy: 65.02%


In [21]:
X

Unnamed: 0,Home,Diff Weighted General Win Percent,Avg PTS,Opp Avg PTS,Avg Points Conceded,Diff Avg Points Conceded,Diff General Win Percent,PTS Var
29,1,-1.000000,107.000000,108.000000,119.000000,15.000000,-100.000000,0.000000
28,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
44,0,1.000000,119.000000,104.000000,107.000000,-4.000000,100.000000,0.000000
53,0,1.000000,123.000000,114.000000,111.000000,-19.000000,100.000000,0.000000
52,1,1.000000,126.000000,113.000000,119.000000,5.000000,100.000000,0.000000
...,...,...,...,...,...,...,...,...
2432,0,0.185293,119.358025,110.432099,116.395062,7.703704,3.703704,168.857716
2431,0,-0.037323,112.185185,112.703704,113.629630,5.555556,-12.345679,124.277778
2430,0,-0.832295,110.666667,114.691358,113.333333,1.469136,-17.283951,184.600000
2443,1,-0.197120,105.753086,114.716049,112.654321,3.074074,-35.802469,98.238272


In [22]:
from xgboost import XGBClassifier

# Initialize XGBoost model
xgb_model = XGBClassifier(n_estimators=1000, max_depth=1000, random_state=42)

# Fit the model
xgb_model.fit(X_train, y_train)

# Predict and evaluate
y_pred_xgb = xgb_model.predict(X_test)
xgb_accuracy = accuracy_score(y_test, y_pred_xgb) * 100
print(f"XGBoost Accuracy: {xgb_accuracy:.2f}%")


XGBoost Accuracy: 60.29%


In [23]:
from sklearn.linear_model import Ridge
from sklearn.metrics import accuracy_score

# Train Ridge Regression
ridge_model = Ridge(alpha=1.0)  # Alpha controls the strength of regularization
ridge_model.fit(X_train, y_train)

# Predict and evaluate
y_pred_ridge = ridge_model.predict(X_test)
accuracy_ridge = accuracy_score(y_test, (y_pred_ridge > 0.5).astype(int)) * 100
print(f"Ridge Accuracy: {accuracy_ridge:.2f}%")


Ridge Accuracy: 67.49%


In [24]:
from sklearn.linear_model import Lasso
from sklearn.metrics import accuracy_score

# Train Lasso Regression
lasso_model = Lasso(alpha=.05)  # Alpha controls regularization strength
lasso_model.fit(X_train, y_train)

# Predict and evaluate
y_pred_lasso = lasso_model.predict(X_test)
accuracy_lasso = accuracy_score(y_test, (y_pred_lasso > 0.5).astype(int)) * 100
print(f"Lasso Accuracy: {accuracy_lasso:.2f}%")
lasso_model.coef_


Lasso Accuracy: 69.34%


array([ 0.        , -0.        ,  0.00413479, -0.00360392, -0.        ,
       -0.00770973,  0.00381313,  0.0002786 ])

In [25]:
from sklearn.linear_model import Lasso
import pandas as pd

# Initialize and train the Lasso model
lasso_model = Lasso(alpha=0.1, random_state=42)  # Adjust alpha for stronger/looser regularization
lasso_model.fit(X_train, y_train)

# Extract feature importance (coefficients)
feature_importance = pd.Series(lasso_model.coef_, index=X.columns)

# Filter out features with zero coefficients (not useful)
useful_features = feature_importance[feature_importance != 0].sort_values(ascending=False)

print("Useful features identified by Lasso:")
print(useful_features)


Useful features identified by Lasso:
Diff General Win Percent    0.004853
PTS Var                     0.000265
Diff Avg Points Conceded   -0.003746
dtype: float64


In [26]:
for i in range(100)

SyntaxError: invalid syntax (3346967130.py, line 1)