In [1]:
import csv
import math
import random

import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.model_selection import cross_val_score

In [2]:
# Rating assigned to a team the first time it is looked up via get_elo().
base_elo = 1600
# Mapping of team name -> current Elo rating; filled lazily by get_elo() and
# updated game by game while the data set is built.
teams_elo = {} 
# Placeholder; rebound in the main cell to the DataFrame returned by
# initialize_data() (indexed by team name).
teams_stat = {}
# Placeholders for the feature matrix and labels; rebound in the main cell to
# the values returned by build_data_set().
X = []
y = []
# Directory (relative to the notebook) holding the input and output CSV files.
folder = 'data' 

In [3]:
def initialize_data(miscellaneous_stat, opponent_stat, team_stat):
    """Combine the three per-team statistic tables into one frame keyed by team.

    Args:
        miscellaneous_stat: DataFrame with at least 'Rk', 'Arena' and 'Team' columns.
        opponent_stat: DataFrame with at least 'Rk', 'G', 'MP' and 'Team' columns.
        team_stat: DataFrame with at least 'Rk', 'G', 'MP' and 'Team' columns.

    Returns:
        A new DataFrame indexed by 'Team' holding the remaining columns of all
        three inputs, left-joined onto the rows of ``miscellaneous_stat``.
        The inputs themselves are not modified.
    """
    # Drop the columns that are excluded from the feature set.
    misc_trimmed = miscellaneous_stat.drop(columns=['Rk', 'Arena'])
    opponent_trimmed = opponent_stat.drop(columns=['Rk', 'G', 'MP'])
    team_trimmed = team_stat.drop(columns=['Rk', 'G', 'MP'])

    # Left joins keep exactly the teams present in the miscellaneous table.
    merged = (
        misc_trimmed
        .merge(opponent_trimmed, how='left', on='Team')
        .merge(team_trimmed, how='left', on='Team')
    )
    return merged.set_index('Team')

In [4]:
def get_elo(team):
    """Return the current Elo rating of ``team``, registering it if unseen.

    Args:
        team: Key identifying the team in the module-level ``teams_elo`` dict.

    Returns:
        The rating stored in ``teams_elo``. A team that is not yet present is
        inserted with ``base_elo`` and that value is returned.
    """
    # setdefault only handles the "missing key" case; unlike a bare ``except``
    # it does not hide unrelated errors (or KeyboardInterrupt).
    return teams_elo.setdefault(team, base_elo)

In [5]:
def calculate_elo(win_team, lose_team):
    """Compute the post-game Elo ratings for the winner and loser of one game.

    Ratings are read through get_elo(); this function does not write the new
    values back, it only returns them.

    Args:
        win_team: Identifier of the team that won.
        lose_team: Identifier of the team that lost.

    Returns:
        Tuple ``(new_winner_rating, new_loser_rating)``, each rounded to an int.
    """
    winner_rating = get_elo(win_team)
    loser_rating = get_elo(lose_team)

    # Expected scores from the logistic Elo curve (400-point scale).
    winner_expected = 1 / (1 + math.pow(10, (loser_rating - winner_rating) / 400))
    loser_expected = 1 / (1 + math.pow(10, (winner_rating - loser_rating) / 400))

    # The update step shrinks as the winner's pre-game rating grows.
    if winner_rating < 2100:
        k = 32
    elif winner_rating < 2400:
        k = 24
    else:
        k = 16

    # Winner scored 1, loser scored 0.
    new_winner_rating = round(winner_rating + k * (1 - winner_expected))
    new_loser_rating = round(loser_rating - k * loser_expected)
    return new_winner_rating, new_loser_rating

In [6]:
def build_data_set(datas):
    """Build the training matrix and labels from a table of played games.

    Games are processed in row order. For each game the feature vector is the
    concatenation of two per-team vectors (Elo rating followed by that team's
    row of ``teams_stat``); afterwards ``teams_elo`` is updated with the
    outcome, so later rows see the ratings produced by earlier rows.

    Args:
        datas: DataFrame with 'WTeam', 'LTeam' and 'WLoc' columns, one row per
            game. Team names must be present in the index of ``teams_stat``.

    Returns:
        Tuple ``(X, y)``: ``X`` is the feature array with NaNs replaced by
        ``np.nan_to_num``; ``y`` is a list of labels where 0 means the team
        whose features come first won and 1 means the second team won.
    """
    print("正在建立數據集...")
    X = []
    # Labels are kept local so that repeated calls cannot leave X and y with
    # different lengths (the original appended to a module-level list).
    y = []
    for index, row in datas.iterrows():

        win_team = row['WTeam']
        lose_team = row['LTeam']

        win_team_elo = get_elo(win_team)
        lose_team_elo = get_elo(lose_team)

        # Home-court bonus: any WLoc other than 'H' is treated as the loser
        # playing at home.
        if row['WLoc'] == 'H':
            win_team_elo += 100
        else:
            lose_team_elo += 100

        win_team_features = [win_team_elo]
        lose_team_features = [lose_team_elo]

        # Series.items() replaces Series.iteritems(), which was removed in
        # pandas 2.0.
        win_team_features.extend(
            value for _, value in teams_stat.loc[win_team].items())
        lose_team_features.extend(
            value for _, value in teams_stat.loc[lose_team].items())

        # Randomise which team is listed first so that y follows a Bernoulli
        # distribution instead of being constant.
        if random.random() > 0.5:
            X.append(win_team_features + lose_team_features)
            y.append(0)
        else:
            X.append(lose_team_features + win_team_features)
            y.append(1)

        # Record the ratings after this game for use by subsequent rows.
        temp_win_team_elo, temp_lose_team_elo = calculate_elo(win_team, lose_team)
        teams_elo[win_team] = temp_win_team_elo
        teams_elo[lose_team] = temp_lose_team_elo

    return np.nan_to_num(X), y

In [7]:
def predict_winner(visit_team, home_team, model):
    """Return the model's class probabilities for a visitor-vs-home matchup.

    The feature vector lists the visiting team first (Elo, then its row of
    ``teams_stat``) and the home team second (Elo plus a 100-point home bonus,
    then its row of ``teams_stat``).

    Args:
        visit_team: Name of the visiting team; must be in ``teams_stat``'s index.
        home_team: Name of the home team; must be in ``teams_stat``'s index.
        model: Fitted estimator exposing ``predict_proba``.

    Returns:
        The result of ``model.predict_proba`` for the single feature row.
    """
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
    features = [get_elo(visit_team)]
    features.extend(value for _, value in teams_stat.loc[visit_team].items())

    features.append(get_elo(home_team) + 100)
    features.extend(value for _, value in teams_stat.loc[home_team].items())

    # Replace NaNs the same way the training data was cleaned.
    features = np.nan_to_num(features)
    return model.predict_proba([features])

In [8]:
if __name__ == '__main__':
    # Load the three per-team statistic tables for the 17-18 season and merge
    # them into a single frame indexed by team name.
    miscellaneous_stat = pd.read_csv(folder + '/17-18_Miscellaneous_Stat.csv')
    opponent_stat = pd.read_csv(folder + '/17-18_Opponent_Per_Game_Stat.csv')
    team_stat = pd.read_csv(folder + '/17-18_Team_Per_Game_Stat.csv')

    teams_stat = initialize_data(miscellaneous_stat, opponent_stat, team_stat)

    # Make re-running this cell idempotent: start the Elo replay from an empty
    # rating table and fix the seed used for the random feature ordering.
    teams_elo.clear()
    random.seed(42)

    result_data = pd.read_csv(folder + '/17-18_Result.csv')
    X, y = build_data_set(result_data)

    print('正在以{:d} 場比賽案例訓練網路模型...'.format(len(X)))
    model = linear_model.LogisticRegression(solver='liblinear')
    model.fit(X, y)

    print('正在進行十折交叉驗證...')
    print('计算出准确率为：{:.2f}%'.format(cross_val_score(model, X, y, cv = 10, scoring='accuracy', n_jobs=-1).mean()*100))
    
    print('正在預測新賽季比賽結果...')
    schedule = pd.read_csv(folder + '/18-19_Schedule.csv')
    predict_result = []
    for index, row in schedule.iterrows():
        visit_team = row['Vteam']
        home_team = row['Hteam']
        predict = predict_winner(visit_team, home_team, model)
        # Column 0 is the probability of label 0, i.e. the first-listed
        # (visiting) team winning.
        probability = predict[0][0]
        if probability > 0.5:
            predict_result.append([visit_team, home_team, probability, 'V'])
        else:
            predict_result.append([home_team, visit_team, 1 - probability, 'H'])

    # newline='' is required by the csv module so that no blank rows are
    # written on platforms that translate line endings.
    with open(folder + '/18-19_Result_Predict.csv', 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['win_team', 'lose_team', 'probability', 'WLoc'])
        writer.writerows(predict_result)
        print('done.')

正在建立數據集...
正在以104 場比賽案例訓練網路模型...
正在進行十折交叉驗證...
计算出准确率为：56.82%
正在預測新賽季比賽結果...
done.


In [9]:
# Reload the predictions written by the previous cell as a DataFrame
# (the first CSV line is the header) and display them.
predict_csv_path = f'{folder}/18-19_Result_Predict.csv'
predict_result = pd.read_csv(predict_csv_path, header=0)
predict_result

Unnamed: 0,win_team,lose_team,probability,WLoc
0,Boston Celtics,Philadelphia 76ers,0.598707,H
1,Oklahoma City Thunder,Golden State Warriors,0.599915,V
2,Milwaukee Bucks,Charlotte Hornets,0.700497,V
3,Detroit Pistons,Brooklyn Nets,0.692479,H
4,New Orleans Pelicans,Houston Rockets,0.554512,V
5,Indiana Pacers,Memphis Grizzlies,0.799613,H
6,Los Angeles Clippers,Denver Nuggets,0.703627,H
7,Atlanta Hawks,New York Knicks,0.577445,V
8,Orlando Magic,Miami Heat,0.725583,H
9,Phoenix Suns,Dallas Mavericks,0.955962,H


In [10]:
# Compare the predicted winner location ('H'/'V') with the actual one, game by
# game; the two tables are aligned on their row index.
actual_result = pd.read_csv(folder + '/18-19_Result.csv',header=0)
compare = pd.concat([actual_result['WLoc'], predict_result['WLoc']], axis = 1, ignore_index=True)
compare.columns = ['actual_team', 'predict_team']

# Vectorised comparison: assigning to the rows yielded by iterrows() writes to
# copies and is not guaranteed to update the frame.
is_match = compare['predict_team'] == compare['actual_team']
compare['is_accurate'] = np.where(is_match, 'T', 'F')

total_count = len(compare)
true_count = int(is_match.sum())

# Guard against an empty comparison table instead of dividing by zero.
accuracy = true_count / total_count * 100 if total_count else 0.0
print('实际准确率为：{:.2f}%'.format(accuracy))
compare

实际准确率为：67.27%


Unnamed: 0,actual_team,predict_team,is_accurate
0,H,H,T
1,H,V,F
2,V,V,T
3,H,H,T
4,V,V,T
5,H,H,T
6,V,H,F
7,H,V,F
8,H,H,T
9,H,H,T
