#### This file trains a logistic regression model using the data provided by Kaggle and Ken Pomeroy's web-scraped data

Import logistic regression from sklearn

In [None]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.utils import shuffle

# Logistic regression whose regularisation strength C is chosen by 10-fold
# cross-validation over a grid of 8 candidate values, scored by negative
# log-loss; n_jobs=4 allows up to four parallel workers during the search.
clf = LogisticRegressionCV(
    Cs=8,
    cv=10,
    scoring="neg_log_loss",
    n_jobs=4,
)

Import the data set from a CSV file so pandas can manipulate it, then train the logistic regression model

In [None]:
import pandas as pd

# Folder holding every CSV this notebook reads and writes. The trailing slash
# matters: other cells build paths by plain string concatenation.
kaggleData = "input/"
# Training table; later cells read its AdjEM/AdjO/AdjD/AdjT feature columns
# and its "Result" label column.
df_predict = pd.read_csv(f"{kaggleData}Predictions.csv")

In [None]:
# Pick the four feature columns and the label, shuffle both with the same
# permutation so rows stay aligned, then fit the cross-validated classifier.
# No random_state is passed, so the row order differs from run to run.
X_train, y_train = shuffle(
    df_predict[["AdjEM", "AdjO", "AdjD", "AdjT"]],
    df_predict["Result"],
)
clf.fit(X_train, y_train)

In [None]:
# 2018 per-team ratings, looked up by TeamID when the matchup features are
# computed in a later cell.
df_2018 = pd.read_csv(kaggleData + "2018.csv")
# Matchups to predict. The four feature columns start at the placeholder
# value -1 and are overwritten in a later cell.
df_test = pd.read_csv(kaggleData + "SampleSubmissionStage2.csv").assign(
    AdjEM=-1,
    AdjO=-1,
    AdjD=-1,
    AdjT=-1,
)

In [None]:
# Fill the four feature columns of df_test with team1-minus-team2 differences
# of the 2018 ratings. Each submission ID has the form "<year>_<team1>_<team2>".
#
# This replaces a row loop that wrote through chained indexing
# (df_test["AdjEM"].loc[i] = ...). pandas documents that pattern as unreliable:
# it may assign into a temporary copy, and it is a no-op under copy-on-write.
# Whole-column assignment avoids that, removes the two full scans of df_2018
# per row, and no longer depends on df_test having a default 0..n-1 index.
stat_cols = ["AdjEM", "AdjO", "AdjD", "AdjT"]

# Split every ID into its "_"-separated pieces: [year, team1, team2].
id_parts = df_test["ID"].str.split("_")
team1_ids = id_parts.str[1].astype(int).to_numpy()
team2_ids = id_parts.str[2].astype(int).to_numpy()

# One row of ratings per team. For a repeated TeamID keep the first row, which
# is what the previous `.values[0]` lookup selected.
ratings_2018 = df_2018.drop_duplicates(subset="TeamID").set_index("TeamID")[stat_cols]

# .loc with a list of labels raises KeyError if a team has no 2018 ratings, so
# a missing team still fails loudly instead of yielding NaN features.
stat_diffs = (
    ratings_2018.loc[team1_ids].to_numpy() - ratings_2018.loc[team2_ids].to_numpy()
)

# Assign one whole column at a time; this also replaces the integer -1
# placeholders with columns of the ratings' own dtype.
for col_pos, col_name in enumerate(stat_cols):
    df_test[col_name] = stat_diffs[:, col_pos]

In [None]:
# Feature matrix with the columns in the same order used to fit the classifier.
X_test = df_test.loc[:, ["AdjEM", "AdjO", "AdjD", "AdjT"]]
# predict_proba returns one probability column per class, ordered as
# clf.classes_.
predictions = clf.predict_proba(X_test)

In [None]:
# Start from the sample submission so the output keeps its ID column and row
# order; only "Pred" is filled in afterwards.
df_submission = pd.read_csv(
    filepath_or_buffer=kaggleData + "SampleSubmissionStage2.csv",
)

In [None]:
# `predictions` comes from predict_proba and holds one column per class, so it
# cannot be stored in the single "Pred" column as-is. Keep only the
# probability of the second class listed in clf.classes_.
# NOTE(review): assumes "Result" is encoded 0/1 with 1 meaning the first team
# in the ID wins - confirm against how Predictions.csv was built.
df_submission["Pred"] = predictions[:, 1]
# index=False keeps the row index out of the submission file.
df_submission.to_csv(kaggleData + "realsubmission1.csv", index=False)

Let's train another logistic regression model with an additional feature: home-court advantage.

In [None]:
# Independent copy of the training table without the saved-index column
# ("Unnamed: 0") and the "Season" column; df_predict itself is left untouched.
df_advantage = df_predict.drop(columns=["Unnamed: 0", "Season"])

In [None]:
# Encode the game-location letter in "WLoc" as a number: "N" -> 0, "H" -> 1,
# "A" -> -1 (presumably neutral / home / away). Any other value is left as-is.
#
# This replaces a row loop that wrote through chained indexing
# (df_advantage["WLoc"].loc[i] = ...). pandas documents that pattern as
# unreliable: it may assign into a temporary copy, and it is a no-op under
# copy-on-write. A single mapped column assignment avoids that. The loop's
# progress printout is dropped because it has no loop left to report on.
wloc_codes = {"N": 0, "H": 1, "A": -1}
df_advantage["WLoc"] = df_advantage["WLoc"].map(
    lambda loc: wloc_codes.get(loc, loc)
)

In [None]:
# Persist the encoded training table. The index is written too (no
# index=False), so reading the file back yields an extra "Unnamed: 0" column.
df_advantage.to_csv(path_or_buf=kaggleData + "features.csv")

In [None]:
# Reload the encoded table from disk and fit a second classifier that also
# sees the "WLoc" feature. Unlike the first model, the rows are not shuffled
# before fitting.
df_advantage = pd.read_csv(kaggleData + "features.csv")
X_train = df_advantage.loc[:, ["AdjEM", "AdjO", "AdjD", "AdjT", "WLoc"]]
y_train = df_advantage.loc[:, "Result"]
# fit() returns the estimator itself, so construction and fitting can be chained.
clf = LogisticRegressionCV(
    Cs=8,
    cv=10,
    scoring="neg_log_loss",
    n_jobs=2,
).fit(X_train, y_train)

In [None]:
# Recompute the four team1-minus-team2 rating differences in df_test from the
# 2018 ratings. Each submission ID has the form "<year>_<team1>_<team2>".
#
# This replaces a row loop that wrote through chained indexing
# (df_test["AdjEM"].loc[i] = ...). pandas documents that pattern as unreliable:
# it may assign into a temporary copy, and it is a no-op under copy-on-write.
# Whole-column assignment avoids that, removes the two full scans of df_2018
# per row, and no longer depends on df_test having a default 0..n-1 index.
stat_cols = ["AdjEM", "AdjO", "AdjD", "AdjT"]

# Split every ID into its "_"-separated pieces: [year, team1, team2].
id_parts = df_test["ID"].str.split("_")
team1_ids = id_parts.str[1].astype(int).to_numpy()
team2_ids = id_parts.str[2].astype(int).to_numpy()

# One row of ratings per team. For a repeated TeamID keep the first row, which
# is what the previous `.values[0]` lookup selected.
ratings_2018 = df_2018.drop_duplicates(subset="TeamID").set_index("TeamID")[stat_cols]

# .loc with a list of labels raises KeyError if a team has no 2018 ratings, so
# a missing team still fails loudly instead of yielding NaN features.
stat_diffs = (
    ratings_2018.loc[team1_ids].to_numpy() - ratings_2018.loc[team2_ids].to_numpy()
)

# Assign one whole column at a time so each feature column is replaced outright.
for col_pos, col_name in enumerate(stat_cols):
    df_test[col_name] = stat_diffs[:, col_pos]

In [None]:
# Save the test matchups together with their computed feature columns. The
# index is written as well because index=False is not passed.
df_test.to_csv(
    path_or_buf=kaggleData + "SampleSubmissionStage2WithFeatures.csv",
)

In [None]:
# Score every matchup with WLoc = 1, the code the encoding cell assigns to "H".
# A scalar broadcasts to every row, so this no longer breaks when the
# submission file has a row count other than the previously hard-coded 2278.
df_test["WLoc"] = 1

In [None]:
# Predict with the five-feature model and place the result in a fresh copy of
# the sample submission.
X_test = df_test[["AdjEM", "AdjO", "AdjD", "AdjT", "WLoc"]]
predictions = clf.predict_proba(X_test)
df_submit = pd.read_csv(kaggleData + "SampleSubmissionStage2.csv")
# predict_proba returns one column per class, so the full array cannot be
# stored in the single "Pred" column. Keep only the probability of the second
# class listed in clf.classes_.
# NOTE(review): assumes "Result" is encoded 0/1 with 1 meaning the first team
# in the ID wins - confirm against how the training table was built.
df_submit["Pred"] = predictions[:, 1]

In [None]:
# Write the current df_submit frame; index=False keeps the row index out of
# the file.
df_submit.to_csv(
    path_or_buf=kaggleData + "realsubmission2.csv",
    index=False,
)

In [None]:
# Re-score every matchup with WLoc = -1, the code the encoding cell assigns to
# "A". A scalar broadcasts to every row, so this no longer breaks when the
# submission file has a row count other than the previously hard-coded 2278.
df_test["WLoc"] = -1
X_test = df_test[["AdjEM", "AdjO", "AdjD", "AdjT", "WLoc"]]
predictions = clf.predict_proba(X_test)
df_submit = pd.read_csv(kaggleData + "SampleSubmissionStage2.csv")
# predict_proba returns one column per class, so the full array cannot be
# stored in the single "Pred" column. Keep only the probability of the second
# class listed in clf.classes_.
# NOTE(review): assumes "Result" is encoded 0/1 with 1 meaning the first team
# in the ID wins - confirm against how the training table was built.
df_submit["Pred"] = predictions[:, 1]
# NOTE(review): this writes to the same path as the WLoc = 1 submission in the
# earlier cell and therefore overwrites it - confirm that is intended.
df_submit.to_csv(kaggleData + "realsubmission2.csv", index=False)