In [2]:
import pandas as pd
import numpy as np

In [3]:
!python3 -m pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Looking in links: /usr/share/pip-wheels


In [4]:
df = pd.read_csv("teams_with_form.csv")

In [5]:
df["Date"] = pd.to_datetime(df["Date"], infer_datetime_format=True)

  df["Date"] = pd.to_datetime(df["Date"], infer_datetime_format=True)


In [6]:
# df["Home/Away_code"] = df["Home/Away"].astype("category").cat.codes
# df["Opp_Code"] = df["Opp"].astype("category").cat.codes
# df["Day_Code"] = df["Date"].dt.dayofweek

In [7]:
df = df.sort_values(by=['Date'])

In [8]:
df = df.reset_index(drop=True)
df.index = range(len(df))

In [8]:
def add_target(bean):
    bean["Target"] = bean["W/L"].shift(-1)
    return bean

df = df.groupby("Team", group_keys=False).apply(add_target)

In [9]:
pd.set_option('display.max_rows', None)
df[["Team", "Date", "W/L", "Target"]].sort_values(by=['Team','Date'])

Unnamed: 0,Team,Date,W/L,Target
0,ATL,2023-10-25,0,0.0
27,ATL,2023-10-27,0,0.0
57,ATL,2023-10-29,0,0.0
68,ATL,2023-10-30,0,0.0
99,ATL,2023-11-01,0,0.0
143,ATL,2023-11-04,0,0.0
183,ATL,2023-11-06,0,0.0
212,ATL,2023-11-09,0,0.0
238,ATL,2023-11-11,0,0.0
279,ATL,2023-11-14,0,0.0


In [8]:
df.loc[pd.isnull(df['Target']), "Target"] = 2

In [9]:
df["Target"] = np.where(df["Target"] == "W", 1, np.where(df["Target"] == "L", 0, df["Target"]))

In [10]:
df["W/L"] = (df["W/L"] == "W").astype("int")

In [11]:
df["Target"].value_counts()

Target
0.0    1206
1.0    1202
2.0      30
Name: count, dtype: int64

In [12]:
df.shape

(2438, 49)

In [13]:
def add_target(bean):
    bean["ELO_next"] = bean["ELO"].shift(+1)
    return bean

def add_form(bean):
    bean["Form"] = bean["ELO"] - bean["ELO_next"]
    return bean

df_new = df.groupby(by = "Team", group_keys = False).apply(add_target)
df_new = df_new.groupby(by = "Team", group_keys = False).apply(add_form)

df_new["Form"] = df_new["Form"].round(2)

df = df_new.copy()

In [14]:
df.sort_index(inplace=True)

In [15]:
df.loc[pd.isnull(df['Form']), "Form"] = 0

In [16]:
del df["ELO_next"]

In [17]:
df["Home/Away"] = np.where(df["Home/Away"] == "Home", 1, np.where(df["Home/Away"] == "Away", 0, df["Home/Away"])) #home is 1 away is 0

In [18]:
def extract_values(tuple_str):
    values = tuple_str.strip('()').split(', ')
    return [int(value) for value in values]

df['Wins'], df['Losses'] = zip(*df['Record'].apply(extract_values))

In [19]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import RidgeClassifier

rr = RidgeClassifier(alpha = 1)
split = TimeSeriesSplit(n_splits = 3)
sfs = SequentialFeatureSelector(rr, n_features_to_select = 10, direction = "forward", cv = split)

In [20]:
removed = ["Team", "Opp", "NumWL", "Date", "Target", "Record"]

In [21]:
selected = df.columns[~df.columns.isin(removed)]

In [22]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df[selected] = scaler.fit_transform(df[selected])

In [23]:
selected.dropna()
df.dropna(inplace=True)

In [24]:
df['Target'] = df['Target'].astype(int)

In [25]:
sfs.fit(df[selected], df["Target"])

In [26]:
predictors = list(selected[sfs.get_support()])

In [33]:
predictors.remove('Unnamed: 0')

ValueError: list.remove(x): x not in list

In [34]:
predictors.remove('Unnamed: 0.1')

In [35]:
predictors

['W/L', '3P%', 'OREB', 'REB', '+/-', 'ELO', 'Day_Code', 'Wins']

In [36]:
def backtest(data, model, predictors, start = 3, step = 1):
    all_predictions = []

    days = sorted(data["Date"].unique())

    for i in range(start, len(days), step):
        day = days[i]

        train = data[data["Date"] < day]
        test = data[data["Date"] == day]

        model.fit(train[predictors], train["Target"])

        pred = model.predict(test[predictors])
        pred = pd.Series(pred, index = test.index)

        combined = pd.concat([test["Target"], pred], axis = 1)
        combined.columns = ["Actual", "Prediction"]

        all_predictions.append(combined)

    return pd.concat(all_predictions)

In [37]:
predictions = backtest(df, rr, predictors)

In [38]:
predictions

Unnamed: 0,Actual,Prediction
32,0,0
33,0,0
34,1,0
35,0,0
36,0,0
...,...,...
2433,2,1
2434,2,1
2435,2,1
2436,2,0


In [41]:
from sklearn.metrics import accuracy_score

predictions = predictions[predictions["Actual"] != 2]
accuracy_score(predictions["Actual"], predictions["Prediction"])

0.6075789473684211

In [42]:
df.groupby("Home/Away").apply(lambda x: x[x["W/L"] == 1].shape[0] / x.shape[0])

Home/Away
0.0    0.0
1.0    0.0
dtype: float64

In [43]:
df_rolling = df[list(selected) + ["W/L", "Team", "Date"]]

In [44]:
def find_avg(bean):
    rolling = bean[list(selected) + ["W/L"]].rolling(10).mean()
    return rolling

df_rolling = df_rolling.groupby(by = ["Team"], group_keys = False).apply(find_avg)

In [45]:
rolling_cols = [f"{col}_10" for col in df_rolling.columns]
df_rolling.columns = rolling_cols

df = pd.concat([df, df_rolling], axis = 1)

In [46]:
#last_column = df.columns[-1]
#df = df.drop(last_column, axis=1)

In [47]:
df = df.dropna()

In [48]:
def shift_col(bean, col_name):
    next_col = bean[col_name].shift(-1)
    return next_col

def add_col(df, col_name):
    return df.groupby(by = "Team", group_keys=False).apply(lambda x: shift_col(x, col_name))

df["Home_next"] = add_col(df, "Home/Away")
df["Opp_next"] = add_col(df, "Opp")
df["Date_next"] = add_col(df, "Date")

In [49]:
df = df.copy()

In [50]:
#df = df.reset_index(drop=True)
#df.index = range(len(df))

In [51]:
full = df.merge(df[rolling_cols + ["Opp_next", "Date_next", "Team"]],
         left_on=["Team", "Date_next"],
         right_on=["Opp_next", "Date_next"]
        )


In [52]:
removed = list(full.columns[full.dtypes == "object"]) + removed
removed.append("Date_next")

In [53]:
selected = full.columns[~full.columns.isin(removed)]

In [54]:
sfs.fit(full[selected], full["Target"])

In [56]:
predictors = list(selected[sfs.get_support()])

IndexError: boolean index did not match indexed array along dimension 0; dimension is 154 but corresponding boolean dimension is 406

In [57]:
predictors

['W/L', '3P%', 'OREB', 'REB', '+/-', 'ELO', 'Day_Code', 'Wins']

In [None]:
predictions = backtest(full, rr, predictors)

In [None]:
accuracy_score(predictions["Actual"], predictions["Prediction"])

In [None]:
predictions.sort_index(inplace=True)

In [None]:
predictions

In [70]:
df.tail()

Unnamed: 0.1,Unnamed: 0,Team,Opp,Home/Away,Date,W/L,MIN,PTS,FGM,FGA,...,PIE_10,Elo_10,Form_10,Wins_10,Losses_10,W/L_10,W/L_10.1,Home_next,Opp_next,Date_next
1361,0.985348,MIL,ORL,0.0,2024-04-14,0.0,0.0,0.178571,0.102564,0.153846,...,0.532137,0.617211,0.476751,0.722222,0.397059,0.5,0.5,,,NaT
1362,0.984615,HOU,LAC,0.0,2024-04-14,1.0,0.0,0.511905,0.615385,0.692308,...,0.526365,0.58805,0.54009,0.560317,0.527941,0.6,0.6,,,NaT
1363,0.983883,SAS,DET,1.0,2024-04-14,1.0,0.0,0.595238,0.589744,0.461538,...,0.542434,0.344824,0.537779,0.296825,0.839706,0.6,0.6,,,NaT
1364,0.991209,LAL,NOP,0.0,2024-04-14,1.0,0.0,0.607143,0.564103,0.384615,...,0.575975,0.67296,0.522169,0.669841,0.476471,0.6,0.6,,,NaT
1365,1.0,WAS,BOS,0.0,2024-04-14,0.0,0.0,0.583333,0.589744,0.692308,...,0.507956,0.16883,0.508006,0.206349,0.866176,0.4,0.4,,,NaT
