For Feature Engineering

In [None]:
def process_date_column(df, date_col):
    """
    Split a datetime column into year, month and day, and bucket the hour of day
    into a coarse time category.

    The hour buckets are:
        - 'Noon':       10 <= hour < 14
        - 'After Noon': 14 <= hour < 18
        - 'Night':      every other hour (this includes the early morning, 0-9)

    The date column may already be datetime64 or may hold anything that
    ``pd.to_datetime`` can parse (e.g. ISO strings). The date column itself is
    left untouched; only the derived columns are written.

    Parameters:
        df (pd.DataFrame): The DataFrame containing the date column.
            It is modified in place (the new columns are added to it).
        date_col (str): The name of the date column.

    Returns:
        pd.DataFrame: The same DataFrame object, with new columns
        'year', 'month', 'day', and 'time_category'.
    """
    # Local imports keep this cell runnable on its own.
    import numpy as np
    import pandas as pd

    # Parse into a local Series so that string/object columns work too,
    # without overwriting the caller's original column.
    dates = pd.to_datetime(df[date_col])

    # Extract year, month, and day
    df['year'] = dates.dt.year
    df['month'] = dates.dt.month
    df['day'] = dates.dt.day

    # Vectorized hour bucketing. Rows matching neither range (including
    # missing timestamps, whose comparisons are False) fall into 'Night'.
    hour = dates.dt.hour
    is_noon = (hour >= 10) & (hour < 14)
    is_afternoon = (hour >= 14) & (hour < 18)
    df['time_category'] = np.select(
        [is_noon, is_afternoon],
        ['Noon', 'After Noon'],
        default='Night',
    )

    return df

# Example usage:
# Assumes a DataFrame named `df` already exists in the session and has a 'date' column.
# process_date_column adds the derived columns to `df` and returns it, so `df` is rebound here.
df = process_date_column(df, 'date')
# Preview the source column next to the derived columns for the first rows.
print(df[['date', 'year', 'month', 'day', 'time_category']].head())


In [None]:
# Count how many rows fall into each 'time_category' value.
df['time_category'].value_counts()

## Deal with the day difference between games

In [None]:
def team_day_diff_with_season(team_df, date_col='date', season_col='season', default_diff=14):
    """
    For games of a single team, compute the day difference between consecutive games.

    The DataFrame team_df is assumed to contain only the games for one team, with a date
    column (convertible to datetime) and a season column.

    - Works on a copy; the caller's DataFrame is not modified.
    - Sorts the copy by date and resets its index.
    - Computes the difference in whole days between consecutive games.
    - For the first game, or when the season value differs from the previous row,
      sets day_diff to default_diff.
    - An empty input yields an empty result (with an empty 'day_diff' column).

    Parameters
    ----------
    team_df : pd.DataFrame
        DataFrame with games for one team.
    date_col : str, default 'date'
        Name of the date column.
    season_col : str, default 'season'
        Name of the season column.
    default_diff : int, default 14
        The default day difference for the first game of a team or the first game of a new season.

    Returns
    -------
    pd.DataFrame
        Sorted copy of team_df with an additional integer column 'day_diff'
        showing the computed differences.
    """
    import pandas as pd

    # Ensure the date column is datetime type (on a copy, not the caller's frame)
    team_df = team_df.copy()
    team_df[date_col] = pd.to_datetime(team_df[date_col])

    # Sort by date
    team_df = team_df.sort_values(date_col).reset_index(drop=True)

    # Difference in days between consecutive games (NaN for the first row)
    day_diff = team_df[date_col].diff().dt.days

    # A season start is any row whose season differs from the previous row.
    # The first row compares against a missing value and is therefore always
    # flagged, so no separate `.loc[0, ...]` assignment is needed -- that
    # assignment would append a phantom row when the frame is empty.
    # The comparison is kept in a local Series rather than a temporary
    # column so an existing column in the input is never clobbered.
    seasons = team_df[season_col]
    is_season_start = seasons != seasons.shift(1)
    day_diff = day_diff.mask(is_season_start, default_diff)

    # Replace any remaining missing values (e.g. missing dates) with default_diff
    team_df['day_diff'] = day_diff.fillna(default_diff).astype(int)

    return team_df

# Convenience wrapper for the function above:
# given the full games DataFrame (here named df_games), select the games of one team
# (home or away) and compute day_diff for them. The example below uses team ID 89.
def team_day_diff(df_games, team_id, date_col='date', season_col='season', default_diff=14):
    """
    Select the games in which team_id plays (as home or away side) and
    return them with the season-aware 'day_diff' column attached.

    The selection uses the 'homeTeamID' and 'awayTeamID' columns of df_games;
    the actual computation is delegated to team_day_diff_with_season, to which
    date_col, season_col and default_diff are passed through unchanged.
    """
    plays_at_home = df_games['homeTeamID'] == team_id
    plays_away = df_games['awayTeamID'] == team_id
    team_games = df_games[plays_at_home | plays_away].copy()
    return team_day_diff_with_season(
        team_games,
        date_col=date_col,
        season_col=season_col,
        default_diff=default_diff,
    )

# Now, inspect the day differences for team 89.
# Assumes the games DataFrame `df_games` already exists in the session.
team89 = team_day_diff(df_games, team_id=89)
# Show date, season and computed gap for the first 20 returned rows.
print(team89[['date', 'season', 'day_diff']].head(20))

Inspect the day difference

In [None]:
# inspect that day difference between matches is correct
def team_day_diff(df, team_id, date_col='date'):
    """
    Return the games of one team, ordered by date, with the gap in days to
    that team's previous game.

    Parameters:
    -----------
    df : pd.DataFrame
        The DataFrame containing game data; must have 'homeTeamID' and
        'awayTeamID' columns plus the date column.
    team_id : int (or compatible type)
        The team ID to filter on (matched against home and away IDs).
    date_col : str, optional
        The name of the date column. Default is 'date'.

    Returns:
    --------
    pd.DataFrame
        A date-sorted copy of the matching games (index reset) with a new
        integer column 'day_diff'. The first game has no predecessor and
        gets a day_diff of 0.
    """
    # Keep only rows where the team is on either side of the fixture
    involves_team = (df['homeTeamID'] == team_id) | (df['awayTeamID'] == team_id)
    team_games = df.loc[involves_team].copy()

    # Parse the dates so that differences can be taken
    team_games[date_col] = pd.to_datetime(team_games[date_col])

    # Chronological order, fresh 0..n-1 index
    team_games = team_games.sort_values(by=date_col).reset_index(drop=True)

    # Gap to the previous game in whole days; missing gaps become 0
    gaps = team_games[date_col].diff().dt.days
    team_games['day_diff'] = gaps.fillna(0).astype(int)

    return team_games

# Example usage:
# Suppose you want to inspect games for team with ID 89.
# Assumes a DataFrame named `df` with 'homeTeamID', 'awayTeamID' and 'date' columns exists in the session.
team_games_diff = team_day_diff(df, team_id=89)
# Show each game's date next to the gap (in days) since the previous game.
print(team_games_diff[['date', 'day_diff']].head())