In [1]:
import json
import pandas as pd
import numpy as np

file = 'stats.json'

In [2]:
with open(file) as f:
    data = json.load(f)

In [3]:
sample = pd.DataFrame(data['North Carolina Central']['Team'], index = [0])

In [4]:
cols = (sample.keys())

In [5]:
teams = list(data.keys())

In [6]:
df_team = pd.DataFrame(columns=cols)
index = 0

for i in teams:
    row = data.get(i)
    df_team = df_team.append(pd.DataFrame(row['Team'],index = [index]))
    index += 1
    
df_op = pd.DataFrame(columns=cols)
index = 0

for i in teams:
    row = data.get(i)
    df_op = df_op.append(pd.DataFrame(row['Opponent'],index = [index]))
    index += 1

In [7]:
df_team['Team'] = pd.DataFrame(teams)
df_op['Team'] = pd.DataFrame(teams)

In [8]:
df_team.to_csv(r'Team_stats.csv')
df_op.to_csv(r'Opp_stats.csv')

In [9]:
games = pd.read_csv('games.csv')
games.head()

Unnamed: 0,team1,team2,round,winner
0,Belmont,Temple,1,1
1,North Dakota State,North Carolina Central,1,1
2,Arizona State,St. John's (NY),1,1
3,Prairie View,Fairleigh Dickinson,1,0
4,Duke,North Dakota State,2,1


In [10]:
cols2 = list(cols)
cols2.append('Team')
full = pd.DataFrame()

team1_team = pd.DataFrame()
team2_team = pd.DataFrame()
team1_opp = pd.DataFrame()
team2_opp = pd.DataFrame()

index = 0
for i in games.T:
    
    team1 = games['team1'][i]
    team2 = games['team2'][i]
    
    indext1 = df_team.index[df_team['Team'] == team1]
    indext2 = df_team.index[df_team['Team'] == team2]
    indexo1 = df_team.index[df_op['Team'] == team1]
    indexo2 = df_team.index[df_op['Team'] == team2]
    team1_team = team1_team.append(df_team.iloc[indext1].add_prefix('t1t_'),ignore_index = True)
    team2_team = team2_team.append(df_team.iloc[indext2].add_prefix('t2t_'),ignore_index=True)
    team1_opp = team1_opp.append(df_op.iloc[indexo1].add_prefix('t1o_'),ignore_index = True)
    team2_opp = team2_opp.append(df_op.iloc[indexo2].add_prefix('t2o_'),ignore_index=True)
    
full = pd.concat([games, team1_team,team1_opp,team2_team,team2_opp], axis = 1)

full = full.drop(columns = ['t1t_Team', 't1o_Team', 't2t_Team', 't2o_Team'])

In [11]:
full['round'] = full['round'].astype('category')
full['winner'] = full['winner'].astype('category')

In [13]:
full.to_csv(r'Full_stats.csv')

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn import metrics

y = full['winner']
X = full.loc[:, 't1t_g':].astype('float')

X_stand = pd.DataFrame(preprocessing.scale(X))
X_norm = pd.DataFrame(preprocessing.normalize(X, axis = 0))

model = LogisticRegression()
model.fit(X, y)

score = model.score(X,y)
print(score)

1.0




In [15]:
predictions = model.predict(X)

In [16]:
cm = metrics.confusion_matrix(y, predictions)
print(cm)

[[29  0]
 [ 0 38]]


In [17]:
intercept = pd.DataFrame(model.intercept_, columns = ['intercept'])

In [18]:
coefs = model.coef_[0]

In [19]:
results = pd.DataFrame(coefs).T

In [20]:
col_rename_dict = {i:j for i,j in zip(results.columns,X.columns)}

In [21]:
results = results.rename(columns = col_rename_dict)

In [22]:
results = pd.concat([intercept, results], axis = 1)

In [23]:
results.to_csv(r'Results.csv')

In [24]:
predictions

array([1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1,
       0], dtype=int64)

In [25]:
model.coef_

array([[-1.32132261e-03,  8.51587478e-03,  1.68377033e-02,
        -1.12488376e-02, -9.49567735e-05,  5.31082695e-02,
        -5.84758276e-02, -8.71159840e-05, -3.62705661e-02,
         4.72269900e-02, -1.35359905e-04,  3.69158584e-02,
        -2.85536101e-02, -7.17038255e-05,  6.60226986e-02,
        -3.88758385e-02,  2.71468600e-02, -5.68999383e-02,
         6.22991092e-02,  1.49425299e-01,  7.10858734e-02,
         6.31000250e-02,  3.10073274e-02, -1.60064700e-02,
        -1.32132261e-03,  8.51587478e-03,  1.83396283e-02,
        -2.28768692e-02, -7.78938602e-05,  2.92893433e-02,
        -1.33009464e-02, -5.78004541e-05, -1.09497150e-02,
        -9.57592279e-03, -7.99963753e-05, -5.07125090e-02,
         7.10297669e-03, -2.58405307e-04,  2.08137107e-02,
        -3.53219535e-02, -1.45082428e-02, -1.54366683e-01,
         1.31797916e-02, -6.14050160e-03, -6.45143252e-03,
        -1.03844717e-01, -2.50739928e-02, -1.49879706e-02,
        -5.65472159e-04,  1.32828672e-03,  8.71508844e-0

In [26]:
for feature in list(cols):
    full['t_'+feature] = (full['t1t_'+feature].astype('float') - full['t2t_'+feature].astype('float'))
    full['o_'+feature] = (full['t2t_'+feature].astype('float') - full['t2o_'+feature].astype('float'))

In [27]:
full.to_csv(r'Full_stats.csv')

In [28]:
y2 = full['winner']
X2 = full.loc[:, 't_fg':].astype('float')

#X_stand = pd.DataFrame(preprocessing.scale(X))
#X_norm = pd.DataFrame(preprocessing.normalize(X, axis = 0))

model2 = LogisticRegression()
model2.fit(X2, y2)

score2 = model2.score(X2,y2)
print(score2)

0.9701492537313433




In [30]:
predictions2 = model2.predict(X2)
cm2 = metrics.confusion_matrix(y2, predictions2)
print(cm2)

[[27  2]
 [ 0 38]]


In [31]:
games['pred2'] = (y2 == predictions2)
games.to_csv(r'games_pred.csv')

In [32]:
coefs2 = model2.coef_[0]
results2 = pd.DataFrame(coefs2).T
intercept2 = pd.DataFrame(model2.intercept_, columns = ['intercept'])
col_rename_dict2 = {i:j for i,j in zip(results2.columns,X2.columns)}
results2 = results2.rename(columns = col_rename_dict2)
results2 = pd.concat([intercept2, results2], axis = 1)
results2.to_csv(r'Results2.csv')