In [18]:
import pandas as pd
import numpy as np
from datetime import datetime
from dateutil.parser import parse

In [19]:
def weight_function(statistic, weight):
    s = 0
    if type(statistic) == np.ndarray:
        for i in range(len(statistic)):
            s += statistic[len(statistic) - i - 1] * (weight ** i)
    else:
        for i in range(len(statistic)):
            s += statistic.iloc[len(statistic) - i - 1,] * (weight ** i)
    return s

In [20]:
files = pd.read_csv("all_games_updated.csv")
def get_stats(player, date, number_rows):
    converted_datetime = datetime.strptime(date, '%Y-%m-%d')
    player_rows = files.loc[files['name'] == player]
    selected_rows = []
    if (len(player_rows)) < number_rows:
        for i in range(len(player_rows)):
            selected_rows.append(player_rows.iloc[i])
        return pd.DataFrame(selected_rows)
    index = 0
    for i in range(len(player_rows)):
        curr_date = player_rows.iloc[i]['Date']
        if datetime.strptime(curr_date, '%Y-%m-%d') >= converted_datetime:
            index = i - 1
            break
    if index != 0:
        if index + 1 - number_rows < 0:
            for i in range(index + 1):
                selected_rows.append(player_rows.iloc[i])
        else:
            for i in range(index + 1 - number_rows, index + 1):
                selected_rows.append(player_rows.iloc[i])
#     elif converted_datetime >= players_rows[len(player_rows) - 1]:
    else:
        for i in range(len(player_rows) - number_rows, len(player_rows)):
            selected_rows.append(player_rows.iloc[i])
    return pd.DataFrame(selected_rows)

In [21]:
def time_weighted_average(rows, statistic, weight):
    if rows.empty:
        return 0
    stat = rows[statistic]
    this_num = 1 / weight_function(np.ones(len(stat)), weight)
    return this_num * weight_function(stat, weight)

In [22]:
player_box_scores = pd.read_csv('all_games_updated.csv')

input_statistics = ["name", "team", "date", "location", "opponent", "made_field_goals", "made_two_point_field_goals", "attempted_two_point_field_goals", "attempted_field_goals", "made_three_point_field_goals", "attempted_three_point_field_goals", "attempted_free_throws", "made_free_throws", "offensive_rebounds", "defensive_rebounds", "assists", "blocks", "turnovers", "steals"]

In [23]:
def statline_predictor(player_box_scores, input_statistics, sample_size = 5, weight = .8):
    predicted_statlines = pd.DataFrame(index = player_box_scores.index, columns = input_statistics).fillna(0).T
    for box_index in player_box_scores.index:
        box_score = player_box_scores.loc[box_index]
        player_name = box_score["name"]
        game_date = box_score["Date"]
        last_n_rows = get_stats(player_name, game_date, sample_size)
        weighted_stats = [player_name, box_score["team"], game_date, box_score["location"], box_score["opponent"]]
        for stat in input_statistics[5:]:
            weighted_stats.append(round(time_weighted_average(last_n_rows, stat, weight), 2))
        predicted_statlines[box_index] = weighted_stats
    return predicted_statlines.T

In [106]:
predictions_samplesize5 = statline_predictor(files, input_statistics, sample_size = 5)
predictions_samplesize5.to_csv("predicted_statlines_trailing_5_games.csv")

In [107]:
predictions_samplesize10 = statline_predictor(files, input_statistics, sample_size = 10)
predictions_samplesize10.to_csv("predicted_statlines_trailing_10_games.csv")

In [24]:
def rebound_predictor_5(weighted_stats):
    return 3.495 + .9489 * weighted_stats["offensive_rebounds_y"] + .2874 * weighted_stats["defensive_rebounds_y"]

In [48]:
def freethrow_predictor_5(weighted_stats):
    return 1.373 + .6217 * weighted_stats["attempted_free_throws_y"] + -.0373 * weighted_stats["attempted_field_goals_y"] -.3289 * (weighted_stats["location_y"] == "HOME")

In [26]:
def twopt_predictor_5(weighted_stats):
    return max(0, -5.395 + .6064 * weighted_stats["attempted_two_point_field_goals_y"] - .2542 * weighted_stats["attempted_field_goals_y"] + .071 * weighted_stats["Opponent Defensive Rating"])

In [72]:
def threept_predictor_5(weighted_stats):
    return .766 + .214 * weighted_stats["attempted_three_point_field_goals_y"]

In [28]:
def assist_predictor_5(weighted_stats):
    return 2.06 + .532 * weighted_stats["assists_y"] - .064 * weighted_stats["blocks_y"]

In [71]:
def block_predictor_5(weighted_stats):
    return .329 + .182 * weighted_stats["offensive_rebounds_y"] - .038 * weighted_stats["defensive_rebounds_y"] + .424 * weighted_stats["blocks_y"]

In [39]:
def steal_predictor_5(weighted_stats):
    return max(0, -.936 + .339 * weighted_stats["steals_y"] + .129 * weighted_stats["Opponent Turnover %"])

In [31]:
def turnover_predictor_5(weighted_stats):
    return max(0, -2.589 + .18 * weighted_stats["turnovers_y"] + .121 * weighted_stats["assists_y"] + .032 * weighted_stats["Opponent Defensive Rating"])

In [32]:
weighted_preds = pd.read_csv("true_and_predicted_statlines_5.csv")

In [73]:
output_statistics = ["name", "team", "date", "location", "opponent", "made_two_point_field_goals", "made_three_point_field_goals", "made_free_throws", "rebounds", "assists", "blocks", "steals", "turnovers"]

def statline_output(weighted_predicted_lines):
    output_statlines = pd.DataFrame(index = weighted_predicted_lines.index, columns = output_statistics).fillna(0).T
    for box_index in weighted_predicted_lines.index:
        weighted_line = weighted_predicted_lines.loc[box_index]
        output_stats = [weighted_line["name_x"], weighted_line["team_x"], weighted_line["Date"], weighted_line["location_x"], weighted_line["opponent_x"]]
        output_stats.append(twopt_predictor_5(weighted_line))
        output_stats.append(threept_predictor_5(weighted_line))
        output_stats.append(freethrow_predictor_5(weighted_line))
        output_stats.append(rebound_predictor_5(weighted_line))
        output_stats.append(assist_predictor_5(weighted_line))
        output_stats.append(block_predictor_5(weighted_line))
        output_stats.append(steal_predictor_5(weighted_line))
        output_stats.append(block_predictor_5(weighted_line))
        output_statlines[box_index] = output_stats
    return output_statlines.T

In [74]:
output_picks = statline_output(weighted_preds)
output_picks.iloc[5010:5020,]

Unnamed: 0,name,team,date,location,opponent,made_two_point_field_goals,made_three_point_field_goals,made_free_throws,rebounds,assists,blocks,steals,turnovers
5010,Deng Adel,CLEVELAND CAVALIERS,2019-02-02,HOME,DALLAS MAVERICKS,2.47777,0.97144,0.981063,3.99235,2.32232,0.52084,0.76671,0.52084
5011,Treveon Graham,BROOKLYN NETS,2019-02-02,AWAY,ORLANDO MAGIC,2.5771,1.24536,2.16754,4.83448,2.94076,0.32576,0.74148,0.32576
5012,Corey Brewer,PHILADELPHIA 76ERS,2019-02-02,AWAY,SACRAMENTO KINGS,3.89274,1.28174,2.56444,4.42751,2.7256,0.4243,1.25262,0.4243
5013,Jawun Evans,PHOENIX SUNS,2019-02-02,HOME,ATLANTA HAWKS,3.03706,0.766,1.00755,3.68468,2.37388,0.30392,0.99684,0.30392
5014,Quincy Pondexter,SAN ANTONIO SPURS,2019-02-02,HOME,NEW ORLEANS PELICANS,2.78847,0.8302,1.1907,3.77816,2.2196,0.33768,0.6894,0.33768
5015,Quinn Cook,GOLDEN STATE WARRIORS,2019-02-02,HOME,LOS ANGELES LAKERS,2.54784,1.35664,1.03553,4.15096,2.89524,0.31606,0.87396,0.31606
5016,Willy Hernangomez,CHARLOTTE HORNETS,2019-02-02,HOME,CHICAGO BULLS,3.89,0.80666,1.92555,5.7228,2.35936,0.6363,0.7023,0.6363
5017,Tobias Harris,LOS ANGELES CLIPPERS,2019-02-02,AWAY,DETROIT PISTONS,4.91543,1.85098,4.43839,5.97463,5.09632,0.47828,0.69138,0.47828
5018,Marcus Derrickson,GOLDEN STATE WARRIORS,2019-02-02,HOME,LOS ANGELES LAKERS,2.16391,1.31812,1.11884,4.621,2.23,0.59656,0.7926,0.59656
5019,Mike Muscala,PHILADELPHIA 76ERS,2019-02-02,AWAY,SACRAMENTO KINGS,2.42688,1.62842,1.80808,5.9706,2.50188,0.79498,0.5475,0.79498


In [76]:
full_test_output = output_picks[output_picks["date"] > "2019-02-01"]

In [78]:
full_test_output.to_csv("predicted_statlines_feb_mar_2019.csv")