In [17]:
import os
import sys

import pandas as pd
import pickle
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier


# Make the directory one level above the current working directory importable.
# The path is appended only once, so re-running this cell does not grow sys.path.
parent_of_cwd = os.path.join(os.getcwd(), os.pardir)
project_root = os.path.abspath(parent_of_cwd)
already_importable = project_root in sys.path
if not already_importable:
    sys.path.append(project_root)

from utils.models import save_model
from utils.formats import encode_column, get_win_pct_from_record

# --- Load data ---------------------------------------------------------------
df = pd.read_csv("../data/team_season_data.csv")

# Regression target: fraction of games won, computed from the win/loss counts.
df['WinPCT'] = df["WINS"] / (df["WINS"] + df['LOSSES'])

# Columns that are passed through get_win_pct_from_record below
# (presumably "W-L" record strings such as "40-24" -- confirm against utils.formats).
record_cols = ['ConferenceRecord', 'HOME', 'ROAD', 'OT', 'ThreePTSOrLess', 'TenPTSOrMore',
       'AheadAtHalf', 'BehindAtHalf', 'TiedAtHalf', 'AheadAtThird',
       'BehindAtThird', 'TiedAtThird', 'Score100PTS', 'OppScore100PTS',
       'OppOver500', 'LeadInFGPCT', 'LeadInReb', 'FewerTurnovers']

# Full feature set, built from record_cols so the two lists cannot drift apart.
x_columns = ['Conference', *record_cols, 'DiffPointsPG']

y_column = 'WinPCT'

# --- Build features ----------------------------------------------------------
# Take an explicit copy: the column assignment below must modify an independent
# frame, not a slice of df (avoids SettingWithCopyWarning).
x = df[x_columns].copy()
y = df[y_column]

x = encode_column(col="Conference", x=x)

# Element-wise conversion, one column at a time. Series.map is used instead of
# DataFrame.applymap, which is deprecated since pandas 2.1.
x[record_cols] = x[record_cols].apply(lambda col: col.map(get_win_pct_from_record))

print(x.isna().sum())  # Check for missing values

# Drop rows with any missing feature, then keep the matching target rows.
# x.index holds index *labels*, so the selection must be label-based (.loc);
# positional .iloc only gives the same rows while the index is a default RangeIndex.
x = x.dropna()
y = y.loc[x.index]
assert len(x) == len(y), "features and target are misaligned after dropna"

# --- Fit and persist ---------------------------------------------------------
# WinPCT is a continuous target, so only the regression model is trained here;
# the classifier variants that used to be commented out were removed.
model_lin = LinearRegression().fit(x, y)

path = "../models/"
save_model(path + "team_model_lin.pkl", model_lin)

ConferenceRecord     0
HOME                 0
ROAD                 0
OT                   5
ThreePTSOrLess       0
TenPTSOrMore         0
AheadAtHalf          0
BehindAtHalf         5
TiedAtHalf          40
AheadAtThird         0
BehindAtThird        2
TiedAtThird         44
Score100PTS          0
OppScore100PTS       0
OppOver500          28
LeadInFGPCT          0
LeadInReb            0
FewerTurnovers       0
DiffPointsPG         0
Conference_East      0
Conference_West      0
dtype: int64


In [None]:
# Spot check: display what the record helper returns for a sample record string.
get_win_pct_from_record("40-24")

In [13]:
# Display the target column (y_column == 'WinPCT') to eyeball its range.
df[y_column]

0      0.682927
1      0.707317
2      0.682927
3      0.634146
4      0.609756
         ...   
711    0.256098
712    0.182927
713    0.268293
714    0.256098
715    0.170732
Name: WinPCT, Length: 716, dtype: float64

In [18]:
# List every column in the loaded frame (useful when choosing feature columns).
df.columns

Index(['Unnamed: 0', 'LeagueID', 'SeasonID', 'TeamID', 'TeamCity', 'TeamName',
       'Conference', 'ConferenceRecord', 'PlayoffRank', 'ClinchIndicator',
       'Division', 'DivisionRecord', 'DivisionRank', 'WINS', 'LOSSES',
       'WinPCT', 'LeagueRank', 'Record', 'HOME', 'ROAD', 'L10', 'Last10Home',
       'Last10Road', 'OT', 'ThreePTSOrLess', 'TenPTSOrMore', 'LongHomeStreak',
       'strLongHomeStreak', 'LongRoadStreak', 'strLongRoadStreak',
       'LongWinStreak', 'LongLossStreak', 'CurrentHomeStreak',
       'strCurrentHomeStreak', 'CurrentRoadStreak', 'strCurrentRoadStreak',
       'CurrentStreak', 'strCurrentStreak', 'ConferenceGamesBack',
       'DivisionGamesBack', 'ClinchedConferenceTitle', 'ClinchedDivisionTitle',
       'ClinchedPlayoffBirth', 'EliminatedConference', 'EliminatedDivision',
       'AheadAtHalf', 'BehindAtHalf', 'TiedAtHalf', 'AheadAtThird',
       'BehindAtThird', 'TiedAtThird', 'Score100PTS', 'OppScore100PTS',
       'OppOver500', 'LeadInFGPCT', 'LeadInReb',