In [1]:
import pandas as pd
import numpy as np
import io
from helper import *

In [2]:
# Add the EWA data
EWA = pd.read_csv("EWA_data.csv", header=None)[[0, 2, 15]]
EWA.columns = ["Year", "Player", "EWA"]
EWA.drop([0], axis=0, inplace=True)

# Fix player stuff
EWA = EWA[pd.isnull(EWA["Player"])==False]
EWA = EWA[EWA["Player"]!="PLAYER"]
EWA["Player"] = formatNames([name.split(",")[0] for name in EWA["Player"]])

# Fix year stuff
EWA = EWA[EWA["Year"]!="Select Year"]
EWA = EWA[[True if "Regular Season" in year else False for year in EWA["Year"]]]
EWA["Year"] = [year[:4] for year in EWA["Year"]]

EWA.reset_index(drop=True, inplace=True)
EWA.drop_duplicates(inplace=True)

In [3]:
# Add the WS data
fileNames = ["WS/ws"+str(i)+".csv" for i in range(1,11)]
wsCols = ["Player", "Year", "WS"]
WS = pd.DataFrame(columns=wsCols)
for name in fileNames: 
    data = pd.read_csv(name)
    data.rename(columns={"Text1":"Player", "Text":"Data"}, inplace=True)
    data = data[pd.isnull(data["Data"])==False]
    for i in range(len(data)):
        row = data.iloc[i]
        name = row["Player"]
        tempData = pd.read_csv(io.StringIO(data.iloc[i]["Data"][80:]), lineterminator='\n')
        tempData = tempData[pd.isnull(tempData["Season"])==False]
        WSs = tempData["WS"].values
        years = [year[:4] for year in tempData["Season"]]
        for ws, year in zip(WSs, years):
            if year.isnumeric():
                WS = WS.append(pd.Series([name, year, ws], index=wsCols), ignore_index=True)
    
fileNames = ["WS/200"+str(i)+".csv" for i in range(0,10)]
for name in fileNames: 
    data = pd.read_csv(name)
    data.rename(columns={"Text":"Data", "Page_Title":"Player"}, inplace=True)
    data = data[pd.isnull(data["Data"])==False]
    data = data[pd.isnull(data["Player"])==False]
    data["Player"] = [s.replace(" Stats | Basketball-Reference.com", "") for s in data["Player"]]
    for i in range(len(data)):
        row = data.iloc[i]
        name = row["Player"]
        tempData = pd.read_csv(io.StringIO(data.iloc[i]["Data"][80:]), lineterminator='\n')
        tempData = tempData[pd.isnull(tempData["Season"])==False]
        WSs = tempData["WS"].values
        years = [year[:4] for year in tempData["Season"]]
        for ws, year in zip(WSs, years):
            if year.isnumeric():
                WS = WS.append(pd.Series([name, year, ws], index=wsCols), ignore_index=True)
    
WS.drop_duplicates(inplace=True)
WS = WS[pd.isnull(WS["Player"])==False]
WS["Player"] = formatNames(WS["Player"])

In [4]:
# Finally do it for WP
fileNames = ["WP/WP_{}.csv".format(2000+i) for i in range(0, 20)]
WP_data = [pd.read_csv(filename) for filename in fileNames]
for i in range(len(WP_data)):
    year = 2000+i
    data = WP_data[i][["NAME", "WP"]]
    data.rename(columns={"NAME":"Player"}, inplace=True)
    data['Year'] = year
    data["Player"] = formatNames(data["Player"])
    WP_data[i] = data

WP = pd.concat(WP_data)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [5]:
# Combine this all into one dataset
draftData = pd.read_pickle("featureData.df")

draftData["EWA"] = np.nan
draftData["WS"] = np.nan
draftData["WP"] = np.nan
draftData['did3years'] = False
draftData['didAnyYears'] = False

EWA["Year"] = [int(year) for year in EWA["Year"]]
EWA["EWA"] = [float(val) for val in EWA["EWA"]]
WS["Year"] = [int(year) for year in WS["Year"]]
WS["EWA"] = [float(val) for val in WS["WS"]]
WP["Year"] = [int(year) for year in WP["Year"]]
WP["WP"] = [float(val) for val in WP["WP"]]

for i in range(len(draftData)):
    player = draftData.iloc[i]["Player"]
    year = draftData.iloc[i]["Year"]
    
    playerEWA = EWA[EWA["Player"]==player]
    playerWS = WS[WS["Player"]==player]
    playerWP = WP[WP["Player"]==player]
    
    playerEWA = playerEWA[playerEWA["Year"] >= year].sort_values(by="Year")
    playerWS = playerWS[playerWS["Year"] >= year].sort_values(by="Year")
    playerWP = playerWP[playerWP["Year"] >= year].sort_values(by="Year")
    
#     if len(playerWS)==0: print(player, year)
    
    if len(playerEWA)>=3: # Take the first three years
        draftData["EWA"].iloc[i] = np.mean(playerEWA["EWA"].iloc[:3].values)
        draftData["did3years"].iloc[i] = True
    else: 
        draftData["EWA"].iloc[i] = np.mean(playerEWA["EWA"])
        draftData["did3years"].iloc[i] = False    
        
    if len(playerWP)>=3: # Take the first three years
        draftData["WP"] = np.mean(playerWP["WP"].iloc[:3].values)
        draftData["did3years"] = True
    else: 
        draftData["WP"].iloc[i] = np.mean(playerWP["WP"])
        draftData["did3years"].iloc[i] = False
        
    if len(playerWS)>=3: # Take the first three years
        draftData["WS"].iloc[i] = np.mean(playerWS["WS"].iloc[:3].values)
        draftData["did3years"].iloc[i] = True
    else: 
        draftData["WS"].iloc[i] = np.mean(playerWS["WS"])
        draftData["did3years"].iloc[i] = False


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [6]:
draftData.isna().sum()

Pk               0
Tm               0
Player           0
Year             0
YEAR           375
              ... 
EWA            189
WS             171
WP               4
did3years        0
didAnyYears      0
Length: 67, dtype: int64

In [7]:
draftData.to_pickle("newData.df")

In [8]:
draftData.columns

Index(['Pk', 'Tm', 'Player', 'Year', 'YEAR', 'POS', 'HEIGHT', 'HEIGHT_W_SHOES',
       'REACH', 'WEIGHT', 'WINGSPAN', 'WING_DIFF', 'SHUTTLE_RUN',
       'THREE_QUARTER_SPRINT', 'STANDING_VERTICAL', 'MAX_VERTICAL',
       'BENCH_PRESS', 'BODY_FAT', 'HAND_LENGTH', 'HAND_WIDTH', 'collegeYear',
       'gamesPlayed', 'minutes', 'FT%', '3P%', 'SOS', 'PER', 'eFG%', 'ORB%',
       'DRB%', 'AST%', 'TOV%', 'STL%', 'BLK%', 'USG%', 'OWS', 'DWS', 'FTA',
       'FGA', 'MP', '3PA', 'PTS', 'PF', 'AST', 'TOV', 'ORtg', 'DRtg',
       'MP_per_PF', 'FTA_per_FGA', 'MP_per_3PA', 'PTS_per_FGA', 'AST_per_TOV',
       'PPG', 'PPM', 'awards', 'RSCI', 'm1', 'm2', 'm3', 'm4', 'm5', 'm6',
       'EWA', 'WS', 'WP', 'did3years', 'didAnyYears'],
      dtype='object')