In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
# Show every column when displaying wide frames. Use the public top-level
# options API instead of reaching through the `pd.pandas` self-reference.
pd.set_option('display.max_columns', None)

In [98]:
# Load the raw per-season player table and discard its first column.
df = pd.read_csv('./raw_data/all_seasons.csv')
df = df.drop(columns=df.columns[0])
# Only the last expression of a cell is rendered, so the head() result is
# computed but not shown; tail() is what appears in the output.
df.head()
df.tail()

Unnamed: 0,player_name,team_abbreviation,age,player_height,player_weight,college,country,draft_year,draft_round,draft_number,gp,pts,reb,ast,net_rating,oreb_pct,dreb_pct,usg_pct,ts_pct,ast_pct,season
12300,Markieff Morris,MIA,32.0,205.74,111.13004,Kansas,USA,2011,1,13,17,7.6,2.6,1.4,4.5,0.059,0.089,0.197,0.547,0.116,2021-22
12301,Markelle Fultz,ORL,24.0,193.04,94.800728,Washington,USA,2017,1,1,18,10.8,2.7,5.5,-5.3,0.01,0.116,0.265,0.517,0.448,2021-22
12302,Marcus Smart,BOS,28.0,193.04,99.79024,Oklahoma State,USA,2014,1,6,71,12.1,3.8,5.9,9.3,0.018,0.093,0.179,0.54,0.245,2021-22
12303,Marcus Garrett,MIA,23.0,195.58,92.98636,Kansas,USA,Undrafted,Undrafted,Undrafted,12,1.1,1.9,0.6,5.8,0.072,0.108,0.086,0.28,0.069,2021-22
12304,Micah Potter,DET,24.0,208.28,112.490816,Wisconsin,USA,Undrafted,Undrafted,Undrafted,3,4.0,3.0,0.0,-56.4,0.095,0.125,0.148,0.505,0.0,2021-22


In [327]:
# Identify every unique player by the (name, height, weight) triple and give
# each one an integer ID.
# NOTE(review): a player whose listed height or weight differs between seasons
# receives several IDs under this key — confirm that this is intended.
unique_values_player = (
    df[["player_name", "player_height", "player_weight"]]
    .drop_duplicates()
    .reset_index(drop=True)
    .reset_index()
    .rename(columns={"index": "ID"})
)

# Attach the player ID to every season row, then drop identifying and
# categorical columns that are not used downstream.
merged_df = pd.merge(unique_values_player, df, on=["player_name", "player_height", "player_weight"])
merged_df = merged_df.drop(
    columns=[
        "player_name", "player_height", "player_weight",
        "draft_year", "draft_round", "draft_number",
        "team_abbreviation", "college", "country",
    ]
)

# Keep only IDs that have at least three rows.
filtered_df = merged_df.groupby('ID').filter(lambda x: len(x) >= 3)

# `sort_values` returns a new frame, so the result has to be assigned; otherwise
# the ordering that `season_no` (a running count per ID) relies on is never applied.
filtered_df = filtered_df.sort_values(by=["ID", "season"], ascending=[True, True])
filtered_df = filtered_df.drop(columns=["season"])

# Running 1-based row number within each ID, and the total row count of that ID.
filtered_df["season_no"] = filtered_df.groupby("ID").cumcount() + 1
filtered_df["total_season"] = filtered_df.groupby("ID")["ID"].transform("count")

dataset = filtered_df.reset_index(drop=True)
print(dataset)



        ID   age  gp   pts  reb  ast  net_rating  oreb_pct  dreb_pct  usg_pct  \
0        5  38.0  52   8.2  2.7  1.0         4.1     0.034     0.126    0.220   
1        5  39.0  75   8.4  2.0  1.2        -3.9     0.041     0.084    0.202   
2        5  40.0   3   4.0  0.7  0.3       -20.8     0.000     0.154    0.292   
3        8  29.0  71   5.7  1.6  1.3        -0.3     0.036     0.076    0.172   
4        8  30.0  59   9.3  2.4  2.0        -1.9     0.027     0.087    0.186   
...    ...   ...  ..   ...  ...  ...         ...       ...       ...      ...   
8680  4061  22.0  58  15.3  2.6  1.4        -1.8     0.014     0.069    0.204   
8681  4061  23.0  70  18.3  2.7  2.0         3.3     0.010     0.070    0.212   
8682  4062  27.0  66   5.9  1.9  0.7        -1.1     0.015     0.117    0.163   
8683  4062  28.0  72   6.9  2.4  0.8        14.6     0.024     0.117    0.169   
8684  4062  29.0  76   9.2  2.7  1.3         1.9     0.014     0.097    0.160   

      ts_pct  ast_pct  seas

In [134]:
# Shuffle the unique IDs and keep the first `train_ratio` share for training.
train_ratio = 0.8
test_ratio = 1 - train_ratio
data_id = dataset[["ID"]].drop_duplicates().values.flatten()
n = len(data_id)
np.random.shuffle(data_id)
# `n * train_ratio` is a float and cannot be used as an array index; slice up
# to the truncated integer bound instead.
train_id = data_id[:int(n * train_ratio)]

[1687 2935 2063 ... 3424 1235 2362]


In [328]:
def data_augmentation(data, subset_rows=5):
    """Cut each ID's rows into overlapping windows and label every window.

    For each group of rows sharing an ``ID`` value, a window of
    ``subset_rows`` consecutive rows is slid over the group one row at a
    time; each window becomes its own block. Groups with at most
    ``subset_rows`` rows are kept whole as a single block.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing an ``ID`` column. Row order inside each ID is
        preserved, so it should already be in the desired order.
    subset_rows : int, default 5
        Maximum number of rows per block.

    Returns
    -------
    pandas.DataFrame
        All blocks concatenated, with a fresh 0..n-1 index and an added
        integer ``block_id`` column numbering the blocks from 0.

    Raises
    ------
    ValueError
        If ``subset_rows`` is smaller than 1.
    """
    if subset_rows < 1:
        raise ValueError("subset_rows must be at least 1")

    subblocks = []
    cnt = 0
    # Group the frame that was passed in (not a notebook-level global).
    for _, group in data.groupby("ID"):
        group = group.reset_index(drop=True)
        # A group no longer than the window yields exactly one block.
        n_windows = max(len(group) - subset_rows + 1, 1)
        for start in range(n_windows):
            window = group.iloc[start:start + subset_rows].copy()
            window["block_id"] = cnt
            subblocks.append(window)
            cnt += 1

    if not subblocks:
        # pd.concat rejects an empty list; return an empty frame with the same schema.
        return data.head(0).assign(block_id=pd.Series(dtype="int64"))

    return pd.concat(subblocks).reset_index(drop=True)
# Build the windowed dataset with blocks of at most five rows per ID.
augmented_data = data_augmentation(dataset, subset_rows=5)

def splite_data(augmented_data, column, train_ratio=0.8, seed=None, out_dir="./dataset"):
    """Split a frame into train/test by whole groups and write both to CSV.

    The unique values of ``column`` are shuffled; the first ``train_ratio``
    share of them forms the training groups and the rest the test groups, so
    all rows of one group always land in the same split. Rows keep the order
    they have in ``augmented_data``.

    Parameters
    ----------
    augmented_data : pandas.DataFrame
        Frame to split.
    column : str
        Name of the grouping column (also used in the output file names).
    train_ratio : float, default 0.8
        Share of groups assigned to the training set.
    seed : int, optional
        When given, the shuffle uses its own seeded generator so the split is
        reproducible. When omitted, NumPy's global random state is used.
    out_dir : str, default "./dataset"
        Directory receiving ``trainset_<column>.csv`` and
        ``testset_<column>.csv``; created if it does not exist.

    Raises
    ------
    ValueError
        If ``train_ratio`` is outside [0, 1].
    """
    from pathlib import Path  # local import keeps this cell self-contained

    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError("train_ratio must be between 0 and 1")

    # Copy so that the in-place shuffle can never touch the source frame.
    data_id = augmented_data[column].drop_duplicates().to_numpy(copy=True)
    if seed is None:
        np.random.shuffle(data_id)
    else:
        np.random.default_rng(seed).shuffle(data_id)

    n_train = int(len(data_id) * train_ratio)
    train_ids = data_id[:n_train]
    test_ids = data_id[n_train:]

    # One membership test per split instead of one boolean mask per group.
    train_set = augmented_data[augmented_data[column].isin(train_ids)]
    test_set = augmented_data[augmented_data[column].isin(test_ids)]

    target_dir = Path(out_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    train_set.to_csv(target_dir / f'trainset_{column}.csv', index=False)
    test_set.to_csv(target_dir / f'testset_{column}.csv', index=False)
# Write two train/test splits: one grouped by window (block_id) and one
# grouped by player (ID).
splite_data(augmented_data, column="block_id")
splite_data(dataset, column="ID")

In [301]:
# Write the train and test frames to CSV without the index column.
# NOTE(review): `train_set` and `test_set` are not assigned at notebook scope in
# any visible cell (inside `splite_data` they are local variables), so this cell
# presumably relies on leftover kernel state — confirm or remove it before a
# Restart & Run All.
train_set.to_csv('./dataset/trainset.csv', index=False)
test_set.to_csv('./dataset/testset.csv', index=False)

In [192]:
# Look up the player record behind a given ID (here 1935).
print(unique_values_player.loc[unique_values_player['ID'].eq(1935)])

        ID player_name  player_height  player_weight
1935  1935    CJ Miles         198.12       99.79024


In [329]:
# Grouping key of the split to load; the file names are derived from it.
# Setting it to "ID" would load the per-player split files instead.
column = "block_id"
train_data, test_data = (
    pd.read_csv(f"./dataset/trainset_{column}.csv", index_col=False),
    pd.read_csv(f"./dataset/testset_{column}.csv", index_col=False),
)
def get_X(df):
    """Return every row of ``df`` except the last one, as a DataFrame."""
    all_but_last = df.iloc[:-1]
    return pd.DataFrame(all_but_last)
def get_y(df):
    """Return only the last row of ``df``, as a one-row DataFrame."""
    last_row = df.iloc[-1:]
    return pd.DataFrame(last_row)
# Within every group the last row is the prediction target (y) and all
# earlier rows are the model inputs (X).
X_train = train_data.groupby(column).apply(get_X).reset_index(drop=True)
y_train = train_data.groupby(column).apply(get_y).reset_index(drop=True)
X_test = test_data.groupby(column).apply(get_X).reset_index(drop=True)
y_test = test_data.groupby(column).apply(get_y).reset_index(drop=True)

def build_features(df):
    """Collapse the rows of one group into a single-row feature frame.

    The nine features, in column order, are: age in the last row, mean games
    played, mean points, points in the last row, points in the second-to-last
    row, mean net rating, mean true-shooting percentage, mean usage
    percentage, and ``season_no`` of the last row.

    ``df`` needs at least two rows, because the second-to-last ``pts`` value
    is read.
    """
    points = df["pts"].to_numpy()
    feature_row = [
        df["age"].to_numpy()[-1],
        np.mean(df["gp"].to_numpy()),
        np.mean(points),
        points[-1],
        points[-2],
        np.mean(df["net_rating"].to_numpy()),
        np.mean(df["ts_pct"].to_numpy()),
        np.mean(df["usg_pct"].to_numpy()),
        df["season_no"].to_numpy()[-1],
    ]
    # Wrap in an outer list so the result is a 1 x 9 array / frame.
    return pd.DataFrame(np.array([feature_row]))


# Build one feature row per group. `build_features` reads the last and
# second-to-last rows of each group, so the row order inside a group must
# survive the sort: the default quicksort of `sort_values` is not stable,
# hence the explicit stable mergesort.
df_feature_train = (
    X_train.sort_values([column], ascending=[True], kind="mergesort")
    .groupby(column)
    .apply(build_features)
)
x_feature_train = df_feature_train.values
# Targets are sorted by the same key so they line up with the key-sorted
# groupby output above.
labels_train = y_train.sort_values([column], ascending=[True], kind="mergesort").pts.values

df_feature_test = (
    X_test.sort_values([column], ascending=[True], kind="mergesort")
    .groupby(column)
    .apply(build_features)
)
x_feature_test = df_feature_test.values
labels_test = y_test.sort_values([column], ascending=[True], kind="mergesort").pts.values




In [330]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
# Standardise the features with statistics learned from the training split
# only, then apply the same transform to the test split.
scaler = StandardScaler()
x_feature_train = scaler.fit_transform(x_feature_train)
x_feature_test = scaler.transform(x_feature_test)

# Linear regression baseline, scored by mean squared error on the test split.
lr = LinearRegression().fit(x_feature_train, labels_train)
y_pred = lr.predict(x_feature_test)

mse = mean_squared_error(labels_test, y_pred)
print(mse)

10.097518989144522


In [336]:

from sklearn.svm import SVR
# RBF-kernel support vector regressor, trained and scored on
# x_feature_train / x_feature_test as left by the preceding cell.
svr = SVR(kernel='rbf', C=100, gamma=0.01, epsilon=0.2).fit(x_feature_train, labels_train)
y_pred = svr.predict(x_feature_test)

mse = mean_squared_error(labels_test, y_pred)
print(mse)

9.364783517390666
