## Feature Engineering

 - Covert game date to month only
 - Compile rolling means and current win streak for each team as home team and as visitor team 
 - Process data in sequential format for each team regardless whether home or away (necessary 1st step for following procedures.)
 - Compile sequential data into head-to-head matchup data for each team pair 
 - Compile sequential data into rolling means and current win streak for each team regardless of home or away
 - Merge sequential data features back into main dataframe
 

In [1]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', 500)

from sklearn.preprocessing import PolynomialFeatures

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from src.common_functions import plot_corr_barchart, plot_corr_vs_target, run_sweetviz_report

from pathlib import Path  #for Windows/Linux compatibility
DATAPATH = Path(r'data')


In [2]:
train = pd.read_csv(DATAPATH / "train.csv")
test = pd.read_csv(DATAPATH / "test.csv")

train

Unnamed: 0,GAME_DATE_EST,GAME_ID,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,PTS_home,FG_PCT_home,FT_PCT_home,FG3_PCT_home,AST_home,REB_home,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS,PLAYOFF,CONFERENCE_x,G_x,W_x,L_x,W_PCT_x,HOME_W_x,HOME_L_x,HOME_W_PCT_x,ROAD_W_x,ROAD_L_x,ROAD_W_PCT_x,CONFERENCE_y,G_y,W_y,L_y,W_PCT_y,HOME_W_y,HOME_L_y,HOME_W_PCT_y,ROAD_W_y,ROAD_L_y,ROAD_W_PCT_y,TARGET
0,2016-04-25,41500124,1610612766,1610612748,2015,89.0,0.400,0.833,0.250,10.0,36.0,85.0,0.395,0.667,0.379,20.0,46.0,1,1,0,82,48,34,0.585,30,11,0.731707,18,23,0.439024,0,82,48,34,0.585,28,13,0.682927,20,21,0.487805,0.0
1,2016-04-23,41500123,1610612766,1610612748,2015,96.0,0.389,0.955,0.278,18.0,47.0,80.0,0.342,0.633,0.318,13.0,53.0,1,1,0,82,48,34,0.585,30,11,0.731707,18,23,0.439024,0,82,48,34,0.585,28,13,0.682927,20,21,0.487805,1.0
2,2014-04-28,41300114,1610612766,1610612748,2013,98.0,0.507,0.700,0.280,22.0,36.0,109.0,0.500,0.759,0.375,25.0,33.0,0,1,0,82,43,39,0.524,25,16,0.609756,18,23,0.439024,0,82,54,28,0.659,32,9,0.780488,22,19,0.536585,1.0
3,2014-04-26,41300113,1610612766,1610612748,2013,85.0,0.415,0.727,0.389,21.0,38.0,98.0,0.434,0.850,0.500,26.0,39.0,0,1,0,82,43,39,0.524,25,16,0.609756,18,23,0.439024,0,82,54,28,0.659,32,9,0.780488,22,19,0.536585,0.0
4,2010-04-26,40900114,1610612766,1610612753,2009,90.0,0.451,0.636,0.263,27.0,36.0,99.0,0.418,0.714,0.394,18.0,38.0,0,1,0,82,44,38,0.537,31,10,0.756098,13,28,0.317073,0,82,59,23,0.720,34,7,0.829268,25,16,0.609756,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23042,2003-11-22,20300177,1610612737,1610612739,2003,92.0,0.447,0.933,0.200,25.0,45.0,83.0,0.330,0.840,0.571,21.0,54.0,1,0,0,14,5,9,0.357,2,3,0.400000,3,6,0.333333,0,14,4,10,0.286,4,2,0.666667,0,8,0.000000,0.0
23043,2003-11-17,20300141,1610612737,1610612764,2003,97.0,0.423,0.872,0.231,12.0,39.0,106.0,0.390,0.750,0.375,19.0,44.0,0,0,0,11,3,8,0.273,1,3,0.250000,2,5,0.285714,0,10,4,6,0.400,2,3,0.400000,2,3,0.400000,1.0
23044,2003-11-15,20300128,1610612737,1610612751,2003,85.0,0.382,0.767,0.333,19.0,39.0,100.0,0.479,0.867,0.444,23.0,38.0,0,0,0,10,3,7,0.300,1,2,0.333333,2,5,0.285714,0,10,5,5,0.500,2,3,0.400000,3,2,0.600000,0.0
23045,2003-11-03,20300042,1610612737,1610612740,2003,90.0,0.427,0.652,0.333,20.0,50.0,80.0,0.407,0.588,0.222,21.0,42.0,1,0,0,4,1,3,0.250,1,1,0.500000,0,2,0.000000,0,4,3,1,0.750,2,0,1.000000,1,1,0.500000,0.0


In [3]:
def fix_datatypes(df):
    df['GAME_DATE_EST'] = pd.to_datetime(df['GAME_DATE_EST'])

    long_integer_fields = ['GAME_ID', 'HOME_TEAM_ID', 'VISITOR_TEAM_ID', 'SEASON']

    #convert long integer fields to int32 from int64
    for field in long_integer_fields:
        df[field] = df[field].astype('int32')
    
    #convert the remaining int64s to int8
    for field in df.select_dtypes(include=['int64']).columns.tolist():
        df[field] = df[field].astype('int8')
        
    #convert float64s to float16s
    for field in df.select_dtypes(include=['float64']).columns.tolist():
        df[field] = df[field].astype('float16')
        
    return df


In [4]:
def add_date_features(df):
    #convert game date to month to limit cardinality

    df['MONTH'] = df['GAME_DATE_EST'].dt.month
    
    return df

In [5]:
def remove_playoff_games(df):
    
    df = df[df["PLAYOFF"] == 0]
    
    df = df.drop("PLAYOFF", axis=1)
    
    return df

In [6]:
def add_rolling_home_visitor(df, location, roll_list): 
    
    # location = "HOME" or "VISITOR"
    # roll_list = list of number of games for each rolling mean, e.g. [3, 5, 7, 10, 15]

    # new version 2022-10-31
    # now ignoring season boundaries and with longer rolling means 
    # AND create a field where the all-team average is subtracted from each field
    
    
    # add features showing how well the home team has done in its last home games 
    # and how well the visitor team has done in its last away games
    # add rolling means 
    # add win streaks (negative number if losing streak)
    # these are for the home teams last  *home* games
    # and for the visitor teams last *away* games
    
    location_id = location + "_TEAM_ID"

    # sort games by the order in which they were played for each home or visitor team
    df = df.sort_values(by = [location_id, 'GAME_DATE_EST'], axis=0, ascending=[True, True,], ignore_index=True)
    
    # Win streak, negative if a losing streak
    df[location + '_TEAM_WIN_STREAK'] = df['HOME_TEAM_WINS'].groupby((df['HOME_TEAM_WINS'].shift() != df.groupby([location_id])['HOME_TEAM_WINS'].shift(2)).cumsum()).cumcount() + 1
    # if home team lost the last game of the streak, then the streak must be a losing streak. make it negative
    df[location + '_TEAM_WIN_STREAK'].loc[df['HOME_TEAM_WINS'].shift() == 0] =  -1 * df[location + '_TEAM_WIN_STREAK']

    # If visitor, the streak has opposite meaning (3 wins in a row for home team is 3 losses in a row for visitor)
    if location == 'VISITOR':
        df[location + '_TEAM_WIN_STREAK'] = - df[location + '_TEAM_WIN_STREAK']  


    # rolling means
    feature_list = ['HOME_TEAM_WINS', 'PTS_home', 'FG_PCT_home', 'FT_PCT_home', 'FG3_PCT_home', 'AST_home', 'REB_home']
    
    if location == 'VISITOR':
        feature_list = ['HOME_TEAM_WINS', 'PTS_away', 'FG_PCT_away', 'FT_PCT_away', 'FG3_PCT_away', 'AST_away', 'REB_away']
    
      
    roll_feature_list = []
    for feature in feature_list:
        for roll in roll_list:
            roll_feature_name = location + '_' + feature + '_AVG_LAST_' + str(roll) + '_' + location
            if feature == 'HOME_TEAM_WINS': #remove the "HOME_" for better readability
                roll_feature_name = location + '_' + feature[5:] + '_AVG_LAST_' + str(roll) + '_' + location
            roll_feature_list.append(roll_feature_name)
            df[roll_feature_name] = df.groupby(['HOME_TEAM_ID'])[feature].rolling(roll, closed= "left").mean().values
            
    
    
    # determine league avg for each stat and then subtract it from the each team's avg
    # as a measure of how well that team compares to all teams in that moment in time
    
    #remove win averages from roll list - the league average will always be 0.5 (half the teams win, half lose)
    roll_feature_list = [x for x in roll_feature_list if not x.startswith('HOME_TEAM_WINS')]
    
    df = process_x_minus_league_avg(df, roll_feature_list, location_id)
    
 
    return df



In [7]:
def process_games_consecutively(df_data):
    # re-organize so that all of a team's games can be listed in chronological order whether HOME or VISITOR
    # this will facilitate feature engineering (winpct vs team X, 5-game winpct, current win streak, etc...)
    
    #this data will need to be re-linked back to the main dataframe after all processing is done,
    #joining TEAM1 to HOME_TEAM_ID for all records and then TEAM1 to VISITOR_TEAM_ID for all records
    
    #TEAM1 will be the key field. TEAM2 is used solely to process past team matchups

    # all the home games for each team will be selected and then stacked with all the away games
    
    df_home = pd.DataFrame()
    df_home['GAME_DATE_EST'] = df_data['GAME_DATE_EST']
    df_home['GAME_ID'] = df_data['GAME_ID']
    df_home['TEAM1'] = df_data['HOME_TEAM_ID']
    df_home['TEAM1_home'] = 1
    df_home['TEAM1_win'] = df_data['HOME_TEAM_WINS']
    df_home['TEAM2'] = df_data['VISITOR_TEAM_ID']
    df_home['SEASON'] = df_data['SEASON']
    
    df_home['PTS'] = df_data['PTS_home']
    df_home['FG_PCT'] = df_data['FG_PCT_home']
    df_home['FT_PCT'] = df_data['FT_PCT_home']
    df_home['FG3_PCT'] = df_data['FG3_PCT_home']
    df_home['AST'] = df_data['AST_home']
    df_home['REB'] = df_data['REB_home']
    
    # now for visitor teams  

    df_visitor = pd.DataFrame()
    df_visitor['GAME_DATE_EST'] = df_data['GAME_DATE_EST']
    df_visitor['GAME_ID'] = df_data['GAME_ID']
    df_visitor['TEAM1'] = df_data['VISITOR_TEAM_ID'] 
    df_visitor['TEAM1_home'] = 0
    df_visitor['TEAM1_win'] = df_data['HOME_TEAM_WINS'].apply(lambda x: 1 if x == 0 else 0)
    df_visitor['TEAM2'] = df_data['HOME_TEAM_ID']
    df_visitor['SEASON'] = df_data['SEASON']
    
    df_visitor['PTS'] = df_data['PTS_away']
    df_visitor['FG_PCT'] = df_data['FG_PCT_away']
    df_visitor['FT_PCT'] = df_data['FT_PCT_away']
    df_visitor['FG3_PCT'] = df_data['FG3_PCT_away']
    df_visitor['AST'] = df_data['AST_away']
    df_visitor['REB'] = df_data['REB_away']

    # merge dfs

    df = pd.concat([df_home, df_visitor])

    column2 = df.pop('TEAM1')
    column3 = df.pop('TEAM1_home')
    column4 = df.pop('TEAM2')
    column5 = df.pop('TEAM1_win')

    df.insert(2,'TEAM1', column2)
    df.insert(3,'TEAM1_home', column3)
    df.insert(4,'TEAM2', column4)
    df.insert(5,'TEAM1_win', column5)

    df = df.sort_values(by = ['TEAM1', 'GAME_ID'], axis=0, ascending=[True, True], ignore_index=True)

    return df


In [8]:
def add_matchups(df, roll_list):

    # new version 2022-11-06
    # now ignoring season boundaries and added roll parameters

    # group all the games that 2 teams played each other 
    # calculate home team win pct and the home team win/lose streak
    

    df = df.sort_values(by = ['TEAM1', 'TEAM2','GAME_DATE_EST'], axis=0, ascending=[True, True, True], ignore_index=True)

    for roll in roll_list:
        df['MATCHUP_WINPCT_' + str(roll)] = df.groupby(['TEAM1','TEAM2'])['TEAM1_win'].rolling(roll, closed= "left").mean().values

    df['MATCHUP_WIN_STREAK'] = df['TEAM1_win'].groupby((df['TEAM1_win'].shift() != df.groupby(['TEAM1','TEAM2'])['TEAM1_win'].shift(2)).cumsum()).cumcount() + 1
    # if team1 lost the last game of the streak, then the streak must be a losing streak. make it negative
    df['MATCHUP_WIN_STREAK'].loc[df['TEAM1_win'].shift() == 0] = -1 * df['MATCHUP_WIN_STREAK']
  
    
    return df



In [9]:
def process_x_minus_league_avg(df, feature_list, team_feature):

    # create a temp dataframe so that every date can be front-filled
    # we need the current average for all 30 teams for every day during the season
    # whether that team played or not. 
    # We will front-fill from previous days to ensure that every day has stats for every team
    
    
    # create feature list for temp dataframe to hold league averages
    temp_feature_list = feature_list.copy()
    temp_feature_list.append(team_feature)
    temp_feature_list.append("GAME_DATE_EST")
   
    df_temp = df[temp_feature_list]

    # populate the dataframe with all days played and forward fill previous value if a particular team did not play that day
    # https://stackoverflow.com/questions/70362869
    df_temp = (df_temp.set_index('GAME_DATE_EST')
            .groupby([team_feature])[feature_list]
            .apply(lambda x: x.asfreq('d', method = "ffill"))
            .reset_index()
            [temp_feature_list]
            )
    
    # find the average across all teams for each day
    df_temp = df_temp.groupby(['GAME_DATE_EST'])[feature_list].mean().reset_index()
    
    # rename features for merging
    df_temp = df_temp.add_suffix('_LEAGUE_AVG')
    temp_features = df_temp.columns
    
    # merge all-team averages with each record so that they can be subtracted
    df = df.sort_values(by = 'GAME_DATE_EST', axis=0, ascending= True, ignore_index=True)   
    df = pd.merge(df, df_temp, left_on='GAME_DATE_EST', right_on='GAME_DATE_EST_LEAGUE_AVG', how="left",)
    for feature in feature_list:
        df[feature + "_MINUS_LEAGUE_AVG"] = df[feature] - df[feature + "_LEAGUE_AVG"]

    # drop temp features that were only used for subtraction
    df = df.drop(temp_features, axis = 1)
    
    return df

In [10]:
def add_past_performance_all(df, roll_list):
    
    # roll_list = list of number of games for each rolling mean, e.g. [3, 5, 7, 10, 15]
    
    # new version 2022-11-03
    # now ignoring season boundaries and with longer rolling means (20 and 40 games)
    # AND create a field where the all-team average is subtracted from each field
   
    # add features showing how well each team has done in its last games
    # regardless whether they were at home or away
    # add rolling means for last 3, 5, 7, 10, 20, 40 games
    # add win streaks (negative number if losing streak)
    
    #this data will need to be re-linked back to the main dataframe after all processing is done,
    #joining TEAM1 to HOME_TEAM_ID for all records and then TEAM1 to VISITOR_TEAM_ID for all records
    
    #TEAM1 will be the key field. TEAM2 was used solely to process past team matchups


    df = df.sort_values(by = ['TEAM1','GAME_DATE_EST'], axis=0, ascending=[True, True,], ignore_index=True)
  
    #streak of games won/lost, make negative is a losing streak
    df['WIN_STREAK'] = df['TEAM1_win'].groupby((df['TEAM1_win'].shift() != df.groupby(['TEAM1'])['TEAM1_win'].shift(2)).cumsum()).cumcount() + 1   
    # if team1 lost the last game of the streak, then the streak must be a losing streak. make it negative
    df['WIN_STREAK'].loc[df['TEAM1_win'].shift() == 0]  = -1 * df['WIN_STREAK']
    
    #streak of games played at home/away, make negative if away streak
    df['HOME_AWAY_STREAK'] = df['TEAM1_home'].groupby((df['TEAM1_home'].shift() != df.groupby(['TEAM1'])['TEAM1_home'].shift(2)).cumsum()).cumcount() + 1
    # if team1 played the game of the streak away, then the streak must be an away streak. make it negative
    df['HOME_AWAY_STREAK'].loc[df['TEAM1_home'].shift() == 0]  = -1 * df['HOME_AWAY_STREAK']
    
    #rolling means 
    
    feature_list = ['TEAM1_win', 'PTS', 'FG_PCT', 'FT_PCT', 'FG3_PCT', 'AST', 'REB']
   
    #create new feature names based upon rolling period
    
    roll_feature_list =[]

    for feature in feature_list:
        for roll in roll_list:
            roll_feature_name = feature + '_AVG_LAST_' + str(roll) + '_ALL'
            roll_feature_list.append(roll_feature_name)
            df[roll_feature_name] = df.groupby(['TEAM1'])[feature].rolling(roll, closed= "left").mean().values

    
    
    # determine league avg for each stat and then subtract it from the each team's average
    # as a measure of how well that team compares to all teams in that moment in time
    
    #remove win averages from roll list - the league average will always be 0.5 (half the teams win, half lose)
    roll_feature_list = [x for x in roll_feature_list if not x.startswith('TEAM1_win')]
    
    df = process_x_minus_league_avg(df, roll_feature_list, 'TEAM1')
    
    
    return df



In [11]:
def combine_new_features(df, df_consecutive):
     
    # add back all the new features created in the consecutive dataframe to the main dataframe
    # all data for TEAM1 will be applied to the home team and then again to the visitor team
    # except for head-to-head MATCHUP data, which will only be applied to home team (redundant to include for both)
    # the letter '_x' will be appeneded to feature names when adding to home team
    # the letter '_y' will be appended to feature names when adding to visitor team
    # to match the existing convention in the dataset
    
    #first select out the new features
    all_features = df_consecutive.columns.tolist()
    link_features = ['GAME_ID', 'TEAM1', ]
    redundant_features = ['GAME_DATE_EST','TEAM1_home','TEAM1_win','TEAM2','SEASON','PTS', 'FG_PCT', 'FT_PCT', 'FG3_PCT', 'AST', 'REB',]
    matchup_features = [x for x in all_features if "MATCHUP" in x]
    ignore_features = link_features + redundant_features
    
    new_features = [x for x in all_features if x not in ignore_features]
    
    # first home teams
    
    df1 = df_consecutive[df_consecutive['TEAM1_home'] == 1]
    #add "_x" to new features
    df1.columns = [x + '_x' if x in new_features else x for x in df1.columns]
    #drop features that don't need to be merged
    df1 = df1.drop(redundant_features,axis=1)
    #change TEAM1 to HOME_TEAM_ID for easy merging
    df1 = df1.rename(columns={'TEAM1': 'HOME_TEAM_ID'})
    df = pd.merge(df, df1, how="left", on=["GAME_ID", "HOME_TEAM_ID"])
    
    #don't include matchup features for visitor team since they are equivant for both home and visitor
    new_features = [x for x in new_features if x not in matchup_features]
    df_consecutive = df_consecutive.drop(matchup_features,axis=1)
    
    # next visitor teams
    
    df2 = df_consecutive[df_consecutive['TEAM1_home'] == 0]
    #add "_y" to new features
    df2.columns = [x + '_y' if x in new_features else x for x in df2.columns]
    #drop features that don't need to be merged
    df2 = df2.drop(redundant_features,axis=1)
    #change TEAM1 to VISITOR_TEAM_ID for easy merging
    df2 = df2.rename(columns={'TEAM1': 'VISITOR_TEAM_ID'})
    df = pd.merge(df, df2, how="left", on=["GAME_ID", "VISITOR_TEAM_ID"])
    
    return df

In [12]:
def process_x_minus_y(df):
    #Subtract visitor teams stats from the home teams stats for key fields
    # field_x - field_y
    
    all_features = df.columns.tolist()
    comparison_features = [x for x in all_features if "_y" in x]
    
    #don't include redunant features. (x - league_avg) - (y - league_avg) = x-y
    comparison_features = [x for x in comparison_features if "_MINUS_LEAGUE_AVG" not in x]
    
    for feature in comparison_features:
        feature_base = feature[:-2] #remove "_y" from the end
        df[feature_base + "_x_minus_y"] = df[feature_base + "_x"] - df[feature_base + "_y"]
        
    #df = df.drop("CONFERENCE_x_minus_y") #category variable not meaningful?
        
    return df
    

In [13]:
def remove_non_rolling(df):
    
    # remove non-rolling features - these are data leaks
    # they are stats from the actual game that decides winner/loser, 
    # but we don't know these stats before a game is played
    
    drop_columns =[]
    
    all_columns = df.columns.tolist()
    
    drop_columns1 = ['HOME_TEAM_WINS','PTS_home', 'FG_PCT_home', 'FT_PCT_home', 'FG3_PCT_home', 'AST_home', 'REB_home']
    drop_columns2 = ['PTS_away', 'FG_PCT_away', 'FT_PCT_away', 'FG3_PCT_away', 'AST_away', 'REB_away']
    
    drop_columns = drop_columns + drop_columns1
    drop_columns = drop_columns + drop_columns2 
    
    use_columns = [item for item in all_columns if item not in drop_columns]
    
    return df[use_columns]
    

In [14]:
def add_all_features(df):
    
    # lists for the number of games to including in rolling periods
    #home_visitor_roll_list = [3, 5, 7, 10, 15]
    #all_roll_list = [3, 5, 7, 10, 20, 40]
    home_visitor_roll_list = [3, 15]
    all_roll_list = [3, 20]
        
    df = remove_playoff_games(df)
    df = fix_datatypes(df)
    df = add_date_features(df)
    df = add_rolling_home_visitor(df, "HOME", home_visitor_roll_list)
    df = add_rolling_home_visitor(df, "VISITOR", home_visitor_roll_list)

    #games must first be processed to sort all games in order per team
    #regardless whether home or away
    df_consecutive = process_games_consecutively(df)
    df_consecutive = add_matchups(df_consecutive, home_visitor_roll_list)
    df_consecutive = add_past_performance_all(df_consecutive, all_roll_list)

    #add these features back to main dataframe
    df = combine_new_features(df,df_consecutive)
    
    df['TARGET'] = df['HOME_TEAM_WINS']

    
    df = remove_non_rolling(df)
    
    df = process_x_minus_y(df)
    
    return df

### Add Features

In [15]:
train_features = add_all_features(train)
test_features = add_all_features(test)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[location + '_TEAM_WIN_STREAK'].loc[df['HOME_TEAM_WINS'].shift() == 0] =  -1 * df[location + '_TEAM_WIN_STREAK']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[location + '_TEAM_WIN_STREAK'].loc[df['HOME_TEAM_WINS'].shift() == 0] =  -1 * df[location + '_TEAM_WIN_STREAK']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['MATCHUP_WIN_STREAK'].loc[df['TEAM1_win'].shift() == 0] = -1 * df['MATCHUP_WIN_STREAK']
A value is trying to be set on a copy of a slice from a

### Save data

In [16]:
#remove 2nd to last season from test set
#it was needed to just generate rolling mean that began in previous season

latest_season = test_features['SEASON'].unique().max()
test_features = test_features[test_features['SEASON'] >= (latest_season)]

In [17]:
train_features.to_csv(DATAPATH / "train_features.csv",index=False)
test_features.to_csv(DATAPATH / "test_features.csv",index=False)

### Visualization

In [18]:
# correlation bar chart
  
drop_cols = ['GAME_ID']
n = 30

#plot_corr_barchart(train_features,drop_cols,n)

In [19]:
# correlations vs target

target = 'TARGET'
n = 30
drop_cols = ['GAME_ID','TARGET']

#plot_corr_vs_target(train_features, target, drop_cols, n)

In [20]:
# run sweetviz report

#run_sweetviz_report(train_features,'TARGET')

In [21]:
train_features

Unnamed: 0,GAME_DATE_EST,GAME_ID,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,CONFERENCE_x,G_x,W_x,L_x,W_PCT_x,HOME_W_x,HOME_L_x,HOME_W_PCT_x,ROAD_W_x,ROAD_L_x,ROAD_W_PCT_x,CONFERENCE_y,G_y,W_y,L_y,W_PCT_y,HOME_W_y,HOME_L_y,HOME_W_PCT_y,ROAD_W_y,ROAD_L_y,ROAD_W_PCT_y,TARGET,MONTH,HOME_TEAM_WIN_STREAK,HOME_TEAM_WINS_AVG_LAST_3_HOME,HOME_TEAM_WINS_AVG_LAST_15_HOME,HOME_PTS_home_AVG_LAST_3_HOME,HOME_PTS_home_AVG_LAST_15_HOME,HOME_FG_PCT_home_AVG_LAST_3_HOME,HOME_FG_PCT_home_AVG_LAST_15_HOME,HOME_FT_PCT_home_AVG_LAST_3_HOME,HOME_FT_PCT_home_AVG_LAST_15_HOME,HOME_FG3_PCT_home_AVG_LAST_3_HOME,HOME_FG3_PCT_home_AVG_LAST_15_HOME,HOME_AST_home_AVG_LAST_3_HOME,HOME_AST_home_AVG_LAST_15_HOME,HOME_REB_home_AVG_LAST_3_HOME,HOME_REB_home_AVG_LAST_15_HOME,HOME_PTS_home_AVG_LAST_3_HOME_MINUS_LEAGUE_AVG,HOME_PTS_home_AVG_LAST_15_HOME_MINUS_LEAGUE_AVG,HOME_FG_PCT_home_AVG_LAST_3_HOME_MINUS_LEAGUE_AVG,HOME_FG_PCT_home_AVG_LAST_15_HOME_MINUS_LEAGUE_AVG,HOME_FT_PCT_home_AVG_LAST_3_HOME_MINUS_LEAGUE_AVG,HOME_FT_PCT_home_AVG_LAST_15_HOME_MINUS_LEAGUE_AVG,HOME_FG3_PCT_home_AVG_LAST_3_HOME_MINUS_LEAGUE_AVG,HOME_FG3_PCT_home_AVG_LAST_15_HOME_MINUS_LEAGUE_AVG,HOME_AST_home_AVG_LAST_3_HOME_MINUS_LEAGUE_AVG,HOME_AST_home_AVG_LAST_15_HOME_MINUS_LEAGUE_AVG,HOME_REB_home_AVG_LAST_3_HOME_MINUS_LEAGUE_AVG,HOME_REB_home_AVG_LAST_15_HOME_MINUS_LEAGUE_AVG,VISITOR_TEAM_WIN_STREAK,VISITOR_TEAM_WINS_AVG_LAST_3_VISITOR,VISITOR_TEAM_WINS_AVG_LAST_15_VISITOR,VISITOR_PTS_away_AVG_LAST_3_VISITOR,VISITOR_PTS_away_AVG_LAST_15_VISITOR,VISITOR_FG_PCT_away_AVG_LAST_3_VISITOR,VISITOR_FG_PCT_away_AVG_LAST_15_VISITOR,VISITOR_FT_PCT_away_AVG_LAST_3_VISITOR,VISITOR_FT_PCT_away_AVG_LAST_15_VISITOR,VISITOR_FG3_PCT_away_AVG_LAST_3_VISITOR,VISITOR_FG3_PCT_away_AVG_LAST_15_VISITOR,VISITOR_AST_away_AVG_LAST_3_VISITOR,VISITOR_AST_away_AVG_LAST_15_VISITOR,VISITOR_REB_away_AVG_LAST_3_VISITOR,VISITOR_REB_away_AVG_LAST_15_VISITOR,VISITOR_TEAM_WINS_AVG_LAST_3_VISITOR_MINUS_LEAGUE_AVG,VISITOR_TEAM_WINS_AVG_LAST_15_VISITOR_MINUS_LEAGUE_AVG,VISITOR_PTS_away_AVG_LAST_3_VISITOR_MINUS_LEAGUE_AVG,VISITOR_PTS_away_AVG_LAST_15_VISITOR_MINUS_LEAGUE_AVG,VISITOR_FG_PCT_away_AVG_LAST_3_VISITOR_MINUS_LEAGUE_AVG,VISITOR_FG_PCT_away_AVG_LAST_15_VISITOR_MINUS_LEAGUE_AVG,VISITOR_FT_PCT_away_AVG_LAST_3_VISITOR_MINUS_LEAGUE_AVG,VISITOR_FT_PCT_away_AVG_LAST_15_VISITOR_MINUS_LEAGUE_AVG,VISITOR_FG3_PCT_away_AVG_LAST_3_VISITOR_MINUS_LEAGUE_AVG,VISITOR_FG3_PCT_away_AVG_LAST_15_VISITOR_MINUS_LEAGUE_AVG,VISITOR_AST_away_AVG_LAST_3_VISITOR_MINUS_LEAGUE_AVG,VISITOR_AST_away_AVG_LAST_15_VISITOR_MINUS_LEAGUE_AVG,VISITOR_REB_away_AVG_LAST_3_VISITOR_MINUS_LEAGUE_AVG,VISITOR_REB_away_AVG_LAST_15_VISITOR_MINUS_LEAGUE_AVG,MATCHUP_WINPCT_3_x,MATCHUP_WINPCT_15_x,MATCHUP_WIN_STREAK_x,WIN_STREAK_x,HOME_AWAY_STREAK_x,TEAM1_win_AVG_LAST_3_ALL_x,TEAM1_win_AVG_LAST_20_ALL_x,PTS_AVG_LAST_3_ALL_x,PTS_AVG_LAST_20_ALL_x,FG_PCT_AVG_LAST_3_ALL_x,FG_PCT_AVG_LAST_20_ALL_x,FT_PCT_AVG_LAST_3_ALL_x,FT_PCT_AVG_LAST_20_ALL_x,FG3_PCT_AVG_LAST_3_ALL_x,FG3_PCT_AVG_LAST_20_ALL_x,AST_AVG_LAST_3_ALL_x,AST_AVG_LAST_20_ALL_x,REB_AVG_LAST_3_ALL_x,REB_AVG_LAST_20_ALL_x,PTS_AVG_LAST_3_ALL_MINUS_LEAGUE_AVG_x,PTS_AVG_LAST_20_ALL_MINUS_LEAGUE_AVG_x,FG_PCT_AVG_LAST_3_ALL_MINUS_LEAGUE_AVG_x,FG_PCT_AVG_LAST_20_ALL_MINUS_LEAGUE_AVG_x,FT_PCT_AVG_LAST_3_ALL_MINUS_LEAGUE_AVG_x,FT_PCT_AVG_LAST_20_ALL_MINUS_LEAGUE_AVG_x,FG3_PCT_AVG_LAST_3_ALL_MINUS_LEAGUE_AVG_x,FG3_PCT_AVG_LAST_20_ALL_MINUS_LEAGUE_AVG_x,AST_AVG_LAST_3_ALL_MINUS_LEAGUE_AVG_x,AST_AVG_LAST_20_ALL_MINUS_LEAGUE_AVG_x,REB_AVG_LAST_3_ALL_MINUS_LEAGUE_AVG_x,REB_AVG_LAST_20_ALL_MINUS_LEAGUE_AVG_x,WIN_STREAK_y,HOME_AWAY_STREAK_y,TEAM1_win_AVG_LAST_3_ALL_y,TEAM1_win_AVG_LAST_20_ALL_y,PTS_AVG_LAST_3_ALL_y,PTS_AVG_LAST_20_ALL_y,FG_PCT_AVG_LAST_3_ALL_y,FG_PCT_AVG_LAST_20_ALL_y,FT_PCT_AVG_LAST_3_ALL_y,FT_PCT_AVG_LAST_20_ALL_y,FG3_PCT_AVG_LAST_3_ALL_y,FG3_PCT_AVG_LAST_20_ALL_y,AST_AVG_LAST_3_ALL_y,AST_AVG_LAST_20_ALL_y,REB_AVG_LAST_3_ALL_y,REB_AVG_LAST_20_ALL_y,PTS_AVG_LAST_3_ALL_MINUS_LEAGUE_AVG_y,PTS_AVG_LAST_20_ALL_MINUS_LEAGUE_AVG_y,FG_PCT_AVG_LAST_3_ALL_MINUS_LEAGUE_AVG_y,FG_PCT_AVG_LAST_20_ALL_MINUS_LEAGUE_AVG_y,FT_PCT_AVG_LAST_3_ALL_MINUS_LEAGUE_AVG_y,FT_PCT_AVG_LAST_20_ALL_MINUS_LEAGUE_AVG_y,FG3_PCT_AVG_LAST_3_ALL_MINUS_LEAGUE_AVG_y,FG3_PCT_AVG_LAST_20_ALL_MINUS_LEAGUE_AVG_y,AST_AVG_LAST_3_ALL_MINUS_LEAGUE_AVG_y,AST_AVG_LAST_20_ALL_MINUS_LEAGUE_AVG_y,REB_AVG_LAST_3_ALL_MINUS_LEAGUE_AVG_y,REB_AVG_LAST_20_ALL_MINUS_LEAGUE_AVG_y,CONFERENCE_x_minus_y,G_x_minus_y,W_x_minus_y,L_x_minus_y,W_PCT_x_minus_y,HOME_W_x_minus_y,HOME_L_x_minus_y,HOME_W_PCT_x_minus_y,ROAD_W_x_minus_y,ROAD_L_x_minus_y,ROAD_W_PCT_x_minus_y,WIN_STREAK_x_minus_y,HOME_AWAY_STREAK_x_minus_y,TEAM1_win_AVG_LAST_3_ALL_x_minus_y,TEAM1_win_AVG_LAST_20_ALL_x_minus_y,PTS_AVG_LAST_3_ALL_x_minus_y,PTS_AVG_LAST_20_ALL_x_minus_y,FG_PCT_AVG_LAST_3_ALL_x_minus_y,FG_PCT_AVG_LAST_20_ALL_x_minus_y,FT_PCT_AVG_LAST_3_ALL_x_minus_y,FT_PCT_AVG_LAST_20_ALL_x_minus_y,FG3_PCT_AVG_LAST_3_ALL_x_minus_y,FG3_PCT_AVG_LAST_20_ALL_x_minus_y,AST_AVG_LAST_3_ALL_x_minus_y,AST_AVG_LAST_20_ALL_x_minus_y,REB_AVG_LAST_3_ALL_x_minus_y,REB_AVG_LAST_20_ALL_x_minus_y
0,2003-10-28,20300003,1610612747,1610612742,2003,1,1,1,0,1.000000,1,0,1.000000,0,0,,1,1,0,1,0.000000,0,0,,0,1,0.000000,1,10,-1,,,,,,,,,,,,,,,,,,,,,,,,,,,-1,0.666667,0.733333,92.000000,89.600000,0.431722,0.437874,0.737467,0.690365,0.362630,0.335872,19.666667,19.733333,41.000000,39.600000,-0.111111,-0.022222,-4.222222,-4.333333,0.004720,0.010151,0.019803,-0.022928,0.043199,0.018197,-0.666667,-1.155556,-2.111111,-1.933333,,,1,-1,-1,,,,,,,,,,,,,,,,,,,,,,,,,,,1,1,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,1,-1,1.000000,1,0,,0,-1,,-2,-2,,,,,,,,,,,,,,
1,2003-10-28,20300001,1610612755,1610612748,2003,0,1,1,0,1.000000,1,0,1.000000,0,0,,0,1,0,1,0.000000,0,0,,0,1,0.000000,1,10,-1,,,,,,,,,,,,,,,,,,,,,,,,,,,1,0.666667,0.800000,98.000000,96.466667,0.452637,0.435710,0.662923,0.669922,0.266317,0.309823,22.666667,21.933333,46.333333,40.733333,-0.111111,0.044444,1.777778,2.533333,0.025635,0.007986,-0.054742,-0.043370,-0.053114,-0.007853,2.333333,1.044444,3.222222,-0.800000,,,1,1,-1,,,,,,,,,,,,,,,,,,,,,,,,,,,1,-1,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,1,-1,1.000000,1,0,,0,-1,,0,0,,,,,,,,,,,,,,
2,2003-10-28,20300002,1610612759,1610612756,2003,1,1,1,0,1.000000,1,0,1.000000,0,0,,1,1,0,1,0.000000,0,0,,0,1,0.000000,1,10,-1,,,,,,,,,,,,,,,,,,,,,,,,,,,-1,1.000000,0.733333,98.666667,95.733333,0.396647,0.409587,0.752604,0.779590,0.329346,0.307332,18.666667,21.000000,42.000000,44.266667,0.222222,-0.022222,2.444444,1.800000,-0.030355,-0.018137,0.034939,0.066298,0.009915,-0.010343,-1.666667,0.111111,-1.111111,2.733333,,,-1,-1,1,,,,,,,,,,,,,,,,,,,,,,,,,,,1,1,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,1,-1,1.000000,1,0,,0,-1,,-2,0,,,,,,,,,,,,,,
3,2003-10-29,20300010,1610612741,1610612764,2003,0,1,0,1,0.000000,0,1,0.000000,0,0,,0,1,1,0,1.000000,0,0,,1,0,1.000000,0,10,-1,,,,,,,,,,,,,,,,,,,,,,,,,,,-1,0.000000,0.666667,110.333333,94.466667,0.451660,0.440137,0.811686,0.713184,0.414714,0.326408,20.666667,20.400000,48.000000,42.866667,-0.500000,0.026667,7.333333,-3.800000,-0.002661,-0.007106,0.039095,-0.017995,0.041606,-0.026584,0.533333,-0.166667,4.500000,1.213333,,,-1,-1,1,,,,,,,,,,,,,,,,,,,,,,,,,,,-1,-1,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,-1,1,-1.000000,0,1,,-1,0,,0,2,,,,,,,,,,,,,,
4,2003-10-29,20300004,1610612738,1610612748,2003,0,1,1,0,1.000000,1,0,1.000000,0,0,,0,2,0,2,0.000000,0,0,,0,2,0.000000,1,10,1,,,,,,,,,,,,,,,,,,,,,,,,,,,-1,0.333333,0.800000,104.000000,96.600000,0.479980,0.435254,0.713053,0.658333,0.290649,0.311157,23.000000,22.133333,47.333333,41.400000,-0.166667,0.160000,1.000000,-1.666667,0.025659,-0.011989,-0.059538,-0.072845,-0.082458,-0.041835,2.866667,1.566667,3.833333,-0.253333,,,1,1,1,,,,,,,,,,,,,,,,,,,,,,,,,,,-1,-1,,,,,,,,,,,,,,,,,,,,,,,,,,,0,-1,1,-2,1.000000,1,0,,0,-2,,2,2,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21532,2021-05-16,22001073,1610612752,1610612738,2020,0,72,41,31,0.568848,25,11,0.694336,16,20,0.444336,0,72,36,36,0.500000,21,15,0.583496,15,21,0.416748,1,5,3,1.000000,0.800000,111.000000,112.533333,0.460042,0.462712,0.898275,0.845215,0.425293,0.385677,26.333333,22.133333,46.333333,45.133333,-3.888889,-0.213333,-0.009353,-0.007266,0.103635,0.058976,0.052287,0.024286,1.155556,-2.475556,-0.111111,-0.120000,1,1.000000,0.800000,91.000000,96.000000,0.413330,0.431624,0.589274,0.719482,0.313965,0.333594,23.000000,22.266667,33.333333,40.333333,0.333333,0.053333,-9.777778,0.195556,-0.035672,-0.008542,-0.162131,-0.001115,-0.045871,-0.005848,1.133333,1.626667,-8.000000,-0.657778,0.333333,0.266667,-1,2,2,0.666667,0.75,106.333333,112.80,0.436442,0.469824,0.904297,0.842725,0.392253,0.420374,23.666667,22.60,46.333333,44.00,-5.566667,-0.108333,-0.026994,-0.000351,0.116569,0.062282,0.034783,0.054955,-1.444444,-2.568333,1.555556,-0.345,1,-2,0.333333,0.55,113.000000,115.05,0.454264,0.464380,0.712891,0.797461,0.402669,0.379456,24.000000,24.50,40.666667,45.45,1.100000,2.141667,-0.009172,-0.005795,-0.074837,0.017018,0.045200,0.014037,-1.111111,-0.668333,-4.111111,1.105,0,0,5,-5,0.068848,4,-4,0.110840,1,-1,0.027588,1,4,0.333333,0.20,-6.666667,-2.25,-0.017822,0.005444,0.191406,0.045264,-0.010417,0.040918,-0.333333,-1.90,5.666667,-1.45
21533,2021-05-16,22001074,1610612760,1610612746,2020,1,72,22,50,0.305908,10,26,0.277832,12,24,0.333252,1,72,47,25,0.652832,26,10,0.722168,21,15,0.583496,1,5,-11,0.000000,0.066667,104.000000,103.000000,0.424967,0.432520,0.788737,0.694271,0.351644,0.322388,19.000000,20.733333,49.666667,48.133333,-10.888889,-9.746667,-0.044428,-0.037458,-0.005903,-0.091968,-0.021362,-0.039003,-6.177778,-3.875556,3.222222,2.880000,-1,0.666667,0.600000,115.000000,99.000000,0.463298,0.448861,0.824219,0.689811,0.370972,0.365129,24.333333,21.800000,41.333333,41.466667,0.000000,-0.146667,14.222222,3.195556,0.014296,0.008695,0.072814,-0.030787,0.011136,0.025687,2.466667,1.160000,0.000000,0.475556,0.000000,0.466667,-4,-9,1,0.000000,0.05,99.000000,102.55,0.428060,0.428955,0.757324,0.696680,0.334717,0.323175,19.333333,21.15,42.666667,46.80,-12.900000,-10.358333,-0.035376,-0.041220,-0.030404,-0.083763,-0.022753,-0.042244,-5.777778,-4.018333,-2.111111,2.455,-1,-3,0.666667,0.70,114.333333,113.45,0.505697,0.489063,0.894368,0.840894,0.375000,0.411877,27.333333,25.65,42.666667,43.50,2.433333,0.541667,0.042261,0.018888,0.106641,0.060451,0.017531,0.046459,2.222222,0.481667,-2.111111,-0.845,0,0,-25,25,-0.346924,-16,16,-0.444336,-9,9,-0.250244,-8,4,-0.666667,-0.65,-15.333333,-10.90,-0.077637,-0.060107,-0.137044,-0.144214,-0.040283,-0.088702,-8.000000,-4.50,0.000000,3.30
21534,2021-05-16,22001067,1610612751,1610612739,2020,0,72,48,24,0.666992,28,8,0.777832,20,16,0.555664,0,72,22,50,0.305908,13,23,0.361084,9,27,0.250000,1,5,2,0.666667,0.866667,114.000000,115.733333,0.499919,0.497396,0.727702,0.809408,0.385986,0.368237,26.333333,27.866667,45.666667,43.933333,-0.888889,2.986667,0.030523,0.027419,-0.066938,0.023168,0.012980,0.006846,1.155556,3.257778,-0.777778,-1.320000,-7,1.000000,0.800000,98.666667,90.800000,0.446045,0.428060,0.777995,0.736995,0.271973,0.319889,24.333333,20.666667,40.000000,39.133333,0.333333,0.053333,-2.111111,-5.004444,-0.002957,-0.012106,0.026590,0.016398,-0.087863,-0.019552,2.466667,0.026667,-1.333333,-1.857778,0.333333,0.333333,-2,4,2,1.000000,0.60,116.000000,118.20,0.516927,0.494971,0.700358,0.817456,0.401693,0.383362,30.333333,26.80,48.666667,45.20,4.100000,5.291667,0.053491,0.024796,-0.087370,0.037013,0.044223,0.017943,5.222222,1.631667,3.888889,0.855,-1,-1,0.333333,0.15,103.000000,104.05,0.423014,0.448547,0.797363,0.781641,0.314697,0.338953,24.000000,25.00,49.666667,42.15,-8.900000,-8.858333,-0.040422,-0.021628,0.009635,0.001198,-0.042772,-0.026466,-1.111111,-0.168333,4.888889,-2.195,0,0,26,-26,0.361084,15,-15,0.416748,11,-11,0.305664,5,3,0.666667,0.45,13.000000,14.15,0.093913,0.046423,-0.097005,0.035815,0.086995,0.044409,6.333333,1.80,-1.000000,3.05
21535,2021-05-16,22001069,1610612765,1610612748,2020,0,72,20,52,0.278076,13,23,0.361084,7,29,0.194458,0,72,40,32,0.556152,21,15,0.583496,19,17,0.527832,0,5,-3,0.000000,0.400000,95.666667,103.400000,0.444336,0.461849,0.699544,0.726497,0.334676,0.346606,21.333333,23.400000,41.333333,43.400000,-19.222222,-9.346667,-0.025060,-0.008128,-0.095095,-0.059742,-0.038330,-0.014785,-3.844444,-1.208889,-5.111111,-1.853333,-1,1.000000,0.800000,89.666667,92.066667,0.407633,0.438477,0.807943,0.744173,0.262370,0.332943,12.666667,18.200000,41.333333,37.066667,0.333333,0.053333,-11.111111,-3.737778,-0.041368,-0.001689,0.056538,0.023575,-0.097466,-0.006499,-9.200000,-2.440000,0.000000,-3.924444,0.333333,0.466667,-1,-4,3,0.000000,0.25,95.666667,104.30,0.444336,0.465076,0.699544,0.729248,0.334676,0.343292,21.333333,23.30,41.333333,43.60,-16.233333,-8.608333,-0.019100,-0.005099,-0.088184,-0.051195,-0.022793,-0.022127,-3.777778,-1.868333,-3.444444,-0.745,-1,-1,0.666667,0.65,114.333333,111.40,0.505208,0.484973,0.840983,0.800537,0.451090,0.389465,25.333333,26.45,42.666667,39.15,2.433333,-1.508333,0.041772,0.014798,0.053255,0.020094,0.093621,0.024046,0.222222,1.281667,-2.111111,-5.195,0,0,-20,20,-0.278076,-8,8,-0.222412,-12,12,-0.333496,-3,4,-0.666667,-0.40,-18.666667,-7.10,-0.060872,-0.019897,-0.141439,-0.071289,-0.116414,-0.046173,-4.000000,-3.15,-1.333333,4.45
