In [None]:
import numpy as np 
import pandas as pd 
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV
from subprocess import check_output

In [None]:
# Directory holding the input CSV files, relative to the working directory.
data_dir = 'DataFiles/'
seeds_csv = data_dir + 'NCAATourneySeeds.csv'
results_csv = data_dir + 'NCAATourneyCompactResults.csv'
# Seed table and compact game results, read as-is.
df_seeds = pd.read_csv(seeds_csv)
df_tour = pd.read_csv(results_csv)

In [None]:
def seed_to_int(seed):
    """Convert a seed label to its integer seed number.

    The first character of `seed` is skipped and the following two
    characters are parsed as an int; anything after them is ignored
    (e.g. ``'W01'`` -> 1, ``'X16a'`` -> 16).
    """
    return int(seed[1:3])
# Derive the integer seed and discard the string label.
# Guarded on the presence of 'Seed' so that re-running this cell after the
# column has already been dropped is a no-op instead of an AttributeError.
if 'Seed' in df_seeds.columns:
    df_seeds['seed_int'] = df_seeds['Seed'].apply(seed_to_int)
    df_seeds = df_seeds.drop(columns=['Seed'])  # This is the string label
df_seeds.head()

In [None]:
# Remove the DayNum, WLoc and NumOT columns.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: running it a second time no longer raises a KeyError.
df_tour = df_tour.drop(columns=['DayNum', 'WLoc', 'NumOT'], errors='ignore')
df_tour.head()

In [None]:
# Two renamed views of the seed table: one keyed for the winning team and one
# for the losing team of a game.
df_winseeds = df_seeds.rename(columns={'TeamID': 'WTeamID', 'seed_int': 'WSeed'})
df_lossseeds = df_seeds.rename(columns={'TeamID': 'LTeamID', 'seed_int': 'LSeed'})
# Attach the winner's seed with a left join (every game row is kept), then the
# loser's seed with the default inner join (rows without a match are dropped).
df_dummy = df_tour.merge(df_winseeds, how='left', on=['Season', 'WTeamID'])
df_concat = df_dummy.merge(df_lossseeds, on=['Season', 'LTeamID'])
# Both differences are computed as winner minus loser.
df_concat['seedDiff'] = df_concat['WSeed'] - df_concat['LSeed']
df_concat['scoreDiff'] = df_concat['WScore'] - df_concat['LScore']
df_concat.head()

In [None]:
# Keep only the two columns used for modelling: the target (scoreDiff) first,
# then the single feature (seedDiff). The copy keeps df_concat untouched.
df_predictions = df_concat[['scoreDiff', 'seedDiff']].copy()
df_predictions

In [None]:
# Feature matrix: seedDiff reshaped to a single column, (n_rows, 1).
X_train = df_predictions.seedDiff.values.reshape(-1, 1)
# Target vector: scoreDiff, one value per row.
y_train = df_predictions.scoreDiff.values
# Shuffle features and targets together. A fixed random_state makes the row
# order reproducible across kernel restarts.
X_train, y_train = shuffle(X_train, y_train, random_state=42)

In [None]:
# Fit a linear regression of score difference on seed difference.
# fit() returns the estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(X_train, y_train)
model

In [None]:
# Scatter of the training data: feature on x, target on y.
fig, ax = plt.subplots()
ax.scatter(X_train, y_train)
ax.set_xlabel('Team1 seed - Team2 seed')
ax.set_ylabel('Score Diff')
plt.show()

In [None]:
# In-sample predictions: the model is evaluated on the data it was fitted to.
preds = model.predict(X_train)

# x carries the predictions and y the observed values, so the axis labels
# must say so (they were previously swapped).
plt.scatter(preds, y_train)
plt.xlabel('Predicted Score Diff')
plt.ylabel('Score Diff')
plt.show()

In [None]:
# Load the sample submission; its row count is the number of games to predict.
sample_sub_csv = data_dir + 'SampleSubmissionStage1.csv'
df_sample_sub = pd.read_csv(sample_sub_csv)
n_test_games = df_sample_sub.shape[0]

def get_year_t1_t2(ID):
    """Return a tuple with ints `year`, `team1` and `team2`.

    Parameters
    ----------
    ID : str
        Underscore-separated identifier, e.g. ``'2014_1107_1110'``.

    Returns
    -------
    tuple of int
        One int per underscore-separated field, in the order they appear.

    Raises
    ------
    ValueError
        If any field cannot be parsed as an integer.
    """
    # Materialise a real tuple: a bare generator expression would contradict
    # the documented return type and could only be consumed once.
    return tuple(int(x) for x in ID.split('_'))

In [None]:
# Build a (Season, TeamID) -> seed_int lookup once, instead of running two
# full boolean scans of df_seeds for every submission row.
# setdefault keeps the first seed seen for a key, mirroring `.values[0]`.
seed_lookup = {}
for season, team_id, seed in zip(df_seeds.Season, df_seeds.TeamID, df_seeds.seed_int):
    seed_lookup.setdefault((season, team_id), seed)

# One feature per game: seed of team1 minus seed of team2.
X_test = np.zeros(shape=(n_test_games, 1))
# enumerate gives a positional row number, so filling X_test does not depend
# on df_sample_sub having a default 0..n-1 index.
for pos, game_id in enumerate(df_sample_sub.ID):
    year, t1, t2 = get_year_t1_t2(game_id)
    # A team without a seed for that season raises KeyError naming the
    # missing (year, team) pair.
    X_test[pos, 0] = seed_lookup[(year, t1)] - seed_lookup[(year, t2)]

In [None]:
# Predicted score differences for the submission games.
preds = model.predict(X_test)
# Heuristic used by this notebook: rescale by the largest predicted score
# difference and treat the result as a probability. This is only meaningful
# when that maximum is positive; otherwise the division would flip signs or
# produce inf/nan, so fail loudly instead.
max_pred = preds.max()
if not max_pred > 0:
    raise ValueError(
        'Largest predicted score difference is %r; cannot rescale predictions '
        'into probabilities.' % (max_pred,)
    )
preds = preds / max_pred
# Keep every value inside [0.05, 0.95].
clipped_preds = np.clip(preds, 0.05, 0.95)
# Bracket assignment always writes a column; attribute assignment would only
# set a plain Python attribute if 'Pred' were not already a column.
df_sample_sub['Pred'] = clipped_preds
df_sample_sub.head()

In [None]:
# Write the submission frame to disk without the DataFrame index column.
submission_csv = 'scorediff_as_prob.csv'
df_sample_sub.to_csv(submission_csv, index=False)