In [9]:
import pandas as pd
import glob
import os

# Specify the folder path where CSV files are stored
folder_path = 'data/'

# Use glob to find all CSV files in the specified folder
all_files = glob.glob(os.path.join(folder_path, "*.csv"))

# Use a list comprehension to read each CSV file into a DataFrame
df_list = []
for file in all_files:
    try:
        buli_df = pd.read_csv(file, encoding='ISO-8859-1')
        df_list.append(buli_df)
    except pd.errors.ParserError as e:
        print(f"ParserError parsing {file}: {e}")
    except UnicodeDecodeError as e:
        print(f"UnicodeDecodeError in {file}: {e}")
        
# Concatenate all DataFrames in the list into a single DataFrame
buli_df = pd.concat(df_list, ignore_index=True)

# Display the combined DataFrame
buli_df


Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,BFECAHA,IWCH,IWCD,IWCA,VCCH,VCCD,VCCA,Unnamed: 70,Unnamed: 71,Unnamed: 72
0,D1,14/08/15,Bayern Munich,Hamburg,5,0,H,1,0,H,...,,,,,,,,,,
1,D1,15/08/15,Augsburg,Hertha,0,1,A,0,0,D,...,,,,,,,,,,
2,D1,15/08/15,Darmstadt,Hannover,2,2,D,1,0,H,...,,,,,,,,,,
3,D1,15/08/15,Dortmund,M'gladbach,4,0,H,3,0,H,...,,,,,,,,,,
4,D1,15/08/15,Leverkusen,Hoffenheim,2,1,H,1,1,D,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5575,D1,18/05/13,Hamburg,Leverkusen,0,1,A,0,0,D,...,,,,,,,,,,
5576,D1,18/05/13,Hannover,Fortuna Dusseldorf,3,0,H,1,0,H,...,,,,,,,,,,
5577,D1,18/05/13,M'gladbach,Bayern Munich,3,4,A,3,2,H,...,,,,,,,,,,
5578,D1,18/05/13,Nurnberg,Werder Bremen,3,2,H,0,1,A,...,,,,,,,,,,


In [11]:
#formatting the date column to datetime format and sorting by date
buli_df['Date'] = pd.to_datetime(buli_df['Date'])

buli_df.sort_values(['Date'], ascending=True, inplace=True)

  buli_df['Date'] = pd.to_datetime(buli_df['Date'])


In [13]:
#checking for null values
buli_df.isna().sum()

Div              72
Date              0
HomeTeam          0
AwayTeam          0
FTHG              0
               ... 
VCCD           4050
VCCA           4050
Unnamed: 70    5580
Unnamed: 71    5580
Unnamed: 72    5580
Length: 168, dtype: int64

In [17]:
#dropping rows & columns with all null values
buli_df.dropna(axis=1, how='all', inplace=True) #dropped 3 columns
buli_df.dropna(axis=0, how='all',inplace=True) #0 rows dropped

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,AvgCAHH,AvgCAHA,BFECAHH,BFECAHA,IWCH,IWCD,IWCA,VCCH,VCCD,VCCA
2878,D1,2006-01-10,Hertha,Stuttgart,2,2,D,2,1,H,...,,,,,,,,,,
2879,D1,2006-01-10,Leverkusen,Schalke 04,3,1,H,1,1,D,...,,,,,,,,,,
2952,D1,2006-01-12,Mainz,Stuttgart,0,0,D,0,0,D,...,,,,,,,,,,
2953,D1,2006-02-12,Bayern Munich,M'gladbach,1,1,D,1,1,D,...,,,,,,,,,,
2954,D1,2006-02-12,Bielefeld,Leverkusen,0,0,D,0,0,D,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4805,D1,2024-12-01,Bayern Munich,Hoffenheim,3,0,H,1,0,H,...,1.85,2.01,,,,,,1.10,10.5,22.00
4914,D1,2024-12-04,Augsburg,Union Berlin,2,0,H,0,0,D,...,1.83,2.03,,,,,,2.10,3.5,3.50
4956,D1,2024-12-05,Darmstadt,Hoffenheim,0,6,A,0,5,A,...,1.98,1.87,,,,,,4.80,5.0,1.55
4957,D1,2024-12-05,Bayern Munich,Wolfsburg,2,0,H,2,0,H,...,2.00,1.84,,,,,,1.45,5.0,6.00


In [19]:
buli_df.isna().sum()

Div           72
Date           0
HomeTeam       0
AwayTeam       0
FTHG           0
            ... 
IWCD        4223
IWCA        4223
VCCH        4050
VCCD        4050
VCCA        4050
Length: 165, dtype: int64

In [29]:
buli_df_red = buli_df[['Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTHG', 'HTAG', 'HTR', 'HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC', 'AC', 'HY', 'AY', 'HR', 'AR']]

In [45]:
def get_past_stats(row, npm, df):
    # Define home and away teams for the current match
    team_h = row['HomeTeam']
    team_a = row['AwayTeam']
    
    # Define stats columns with subcategories for home and away teams
    stats = {
        'goals': {'scored': 'FTHG', 'conceded': 'FTAG'},
        'shots': {'taken': 'HS', 'conceived': 'AS'},
        'shots_on_target': {'taken': 'HST', 'conceived': 'AST'},
        'fouls': {'fouls': 'HF', 'fouled': 'AF'},
        'corners': {'taken': 'HC', 'conceived': 'AC'},
        'yellow_cards': {'received': 'HY', 'provoked': 'AY'},
        'red_cards': {'received': 'HR', 'provoked': 'AR'},
    }

    # Initialize results dictionary with match info first
    results = {
        'Date': row['Date'],
        'HomeTeam': team_h,
        'AwayTeam': team_a,
        'FTR': row['FTR']
    }

    # Calculate past stats for the home team
    past_matches_h = df[(df['HomeTeam'] == team_h) | (df['AwayTeam'] == team_h)]
    past_matches_h = past_matches_h[past_matches_h['Date'] < row['Date']].tail(npm)  # Last `npm` matches

    for stat, subcategories in stats.items():
        for subcategory, column in subcategories.items():
            if subcategory in ['scored', 'taken', 'fouls', 'received']:  # stats when home team is at home
                stat_as_home = past_matches_h[past_matches_h['HomeTeam'] == team_h][column].sum()
            else:  # stats when home team is away (e.g., conceived)
                stat_as_home = past_matches_h[past_matches_h['AwayTeam'] == team_h][column].sum()
            results[f'p_home_{stat}_{subcategory}_last_{npm}'] = stat_as_home

    # Calculate points for home team
    points_as_home_win = (past_matches_h[(past_matches_h['HomeTeam'] == team_h) & (past_matches_h['FTR'] == 'H')]).shape[0] * 3
    points_as_away_win = (past_matches_h[(past_matches_h['AwayTeam'] == team_h) & (past_matches_h['FTR'] == 'A')]).shape[0] * 3
    draws = (past_matches_h[((past_matches_h['HomeTeam'] == team_h) | (past_matches_h['AwayTeam'] == team_h)) & (past_matches_h['FTR'] == 'D')]).shape[0]
    results[f'p_home_points_last_{npm}'] = points_as_home_win + points_as_away_win + draws

    # Calculate past stats for the away team
    past_matches_a = df[(df['HomeTeam'] == team_a) | (df['AwayTeam'] == team_a)]
    past_matches_a = past_matches_a[past_matches_a['Date'] < row['Date']].tail(npm)  # Last `npm` matches

    for stat, subcategories in stats.items():
        for subcategory, column in subcategories.items():
            if subcategory in ['scored', 'taken', 'fouls', 'received']:  # stats when away team is at home
                stat_as_away = past_matches_a[past_matches_a['HomeTeam'] == team_a][column].sum()
            else:  # stats when away team is away (e.g., conceived)
                stat_as_away = past_matches_a[past_matches_a['AwayTeam'] == team_a][column].sum()
            results[f'p_away_{stat}_{subcategory}_last_{npm}'] = stat_as_away

    # Calculate points for away team
    points_as_home_win = (past_matches_a[(past_matches_a['HomeTeam'] == team_a) & (past_matches_a['FTR'] == 'H')]).shape[0] * 3
    points_as_away_win = (past_matches_a[(past_matches_a['AwayTeam'] == team_a) & (past_matches_a['FTR'] == 'A')]).shape[0] * 3
    draws = (past_matches_a[((past_matches_a['HomeTeam'] == team_a) | (past_matches_a['AwayTeam'] == team_a)) & (past_matches_a['FTR'] == 'D')]).shape[0]
    results[f'p_away_points_last_{npm}'] = points_as_home_win + points_as_away_win + draws

    return pd.Series(results)

# Apply the function to the DataFrame
combined_df = buli_df.apply(lambda row: get_past_stats(row, npm=7,df=buli_df), axis=1)  # Adjust npm as needed

# Ensure the initial match info columns are at the front
front_columns = ['Date', 'HomeTeam', 'AwayTeam', 'FTR']
reordered_columns = front_columns + [col for col in combined_df.columns if col not in front_columns]
combined_df = combined_df[reordered_columns]

combined_df

Unnamed: 0,Date,HomeTeam,AwayTeam,FTR,p_home_goals_scored_last_7,p_home_goals_conceded_last_7,p_home_shots_taken_last_7,p_home_shots_conceived_last_7,p_home_shots_on_target_taken_last_7,p_home_shots_on_target_conceived_last_7,...,p_away_shots_on_target_conceived_last_7,p_away_fouls_fouls_last_7,p_away_fouls_fouled_last_7,p_away_corners_taken_last_7,p_away_corners_conceived_last_7,p_away_yellow_cards_received_last_7,p_away_yellow_cards_provoked_last_7,p_away_red_cards_received_last_7,p_away_red_cards_provoked_last_7,p_away_points_last_7
2878,2006-01-10,Hertha,Stuttgart,D,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2879,2006-01-10,Leverkusen,Schalke 04,H,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2952,2006-01-12,Mainz,Stuttgart,D,0,0,0,0,0,0,...,9,0,22,0,2,0,3,0,0,1
2953,2006-02-12,Bayern Munich,M'gladbach,D,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2954,2006-02-12,Bielefeld,Leverkusen,D,0,0,0,0,0,0,...,0,27,0,5,0,4,0,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4805,2024-12-01,Bayern Munich,Hoffenheim,H,13,16,65,73,25,30,...,9,43,44,20,9,9,8,1,2,5
4914,2024-12-04,Augsburg,Union Berlin,H,9,1,80,16,28,4,...,16,42,58,15,20,9,7,0,0,11
4956,2024-12-05,Darmstadt,Hoffenheim,A,0,2,28,52,8,14,...,13,34,50,16,11,7,11,1,3,5
4957,2024-12-05,Bayern Munich,Wolfsburg,H,8,16,67,73,25,30,...,14,42,42,10,20,7,13,1,2,2


NameError: name 'p_away_goals_scored_last_7' is not defined

In [39]:
def get_past_stats(row, npm, df):
    # Define home and away teams for the current match
    team_h = row['HomeTeam']
    team_a = row['AwayTeam']
    
    # Define stats columns (excluding 'points' for simplicity)
    stats = {
        'goals_scored': {'home': 'FTHG', 'away': 'FTAG'},
        'goals_conceded': {'home': 'FTAG', 'away': 'FTHG'},
        'red_cards': {'home': 'HR', 'away': 'AR'},
        'shots_on_target': {'home': 'HST', 'away': 'AST'},
        # Add any other stat columns here
    }

    # Initialize results dictionary
    results = {}

    # Calculate past stats for the home team
    past_matches_h = df[(df['HomeTeam'] == team_h) | (df['AwayTeam'] == team_h)]
    past_matches_h = past_matches_h[past_matches_h['Date'] < row['Date']].tail(npm)  # Last `npm` matches

    for stat, columns in stats.items():
        stat_as_home = past_matches_h[past_matches_h['HomeTeam'] == team_h][columns['home']].sum()
        stat_as_away = past_matches_h[past_matches_h['AwayTeam'] == team_h][columns['away']].sum()
        results[f'p_home_{stat}_last_{npm}'] = stat_as_home + stat_as_away

    points_as_home_win = (past_matches_h[(past_matches_h['HomeTeam'] == team_h) & (past_matches_h['FTR'] == 'H')]).shape[0] * 3
    points_as_away_win = (past_matches_h[(past_matches_h['AwayTeam'] == team_h) & (past_matches_h['FTR'] == 'A')]).shape[0] * 3
    draws = (past_matches_h[((past_matches_h['HomeTeam'] == team_h) | (past_matches_h['AwayTeam'] == team_h)) & (past_matches_h['FTR'] == 'D')]).shape[0]
    results[f'p_home_points_last_{npm}'] = points_as_home_win + points_as_away_win + draws

    # Calculate past stats for the away team
    past_matches_a = df[(df['HomeTeam'] == team_a) | (df['AwayTeam'] == team_a)]
    past_matches_a = past_matches_a[past_matches_a['Date'] < row['Date']].tail(npm)  # Last `npm` matches

    for stat, columns in stats.items():
        stat_as_home = past_matches_a[past_matches_a['HomeTeam'] == team_a][columns['home']].sum()
        stat_as_away = past_matches_a[past_matches_a['AwayTeam'] == team_a][columns['away']].sum()
        results[f'p_away_{stat}_last_{npm}'] = stat_as_home + stat_as_away

    points_as_home_win = (past_matches_a[(past_matches_a['HomeTeam'] == team_a) & (past_matches_a['FTR'] == 'H')]).shape[0] * 3
    points_as_away_win = (past_matches_a[(past_matches_a['AwayTeam'] == team_a) & (past_matches_a['FTR'] == 'A')]).shape[0] * 3
    draws = (past_matches_a[((past_matches_a['HomeTeam'] == team_a) | (past_matches_a['AwayTeam'] == team_a)) & (past_matches_a['FTR'] == 'D')]).shape[0]
    results[f'p_away_points_last_{npm}'] = points_as_home_win + points_as_away_win + draws

    # Add original match information to results
    results['HomeTeam'] = team_h
    results['AwayTeam'] = team_a
    results['FTR'] = row['FTR']
    results['Date'] = row['Date']

    return pd.Series(results)

# Apply the function to the DataFrame
combined_df = buli_df.apply(get_past_stats, axis=1, npm=7,df=buli_df)  # Adjust npm as needed
combined_df

Unnamed: 0,p_home_goals_scored_last_7,p_home_goals_conceded_last_7,p_home_red_cards_last_7,p_home_shots_on_target_last_7,p_home_points_last_7,p_away_goals_scored_last_7,p_away_goals_conceded_last_7,p_away_red_cards_last_7,p_away_shots_on_target_last_7,p_away_points_last_7,HomeTeam,AwayTeam,FTR,Date
2878,0,0,0,0,0,0,0,0,0,0,Hertha,Stuttgart,D,2006-01-10
2879,0,0,0,0,0,0,0,0,0,0,Leverkusen,Schalke 04,H,2006-01-10
2952,0,0,0,0,0,2,2,0,9,1,Mainz,Stuttgart,D,2006-01-12
2953,0,0,0,0,0,0,0,0,0,0,Bayern Munich,M'gladbach,D,2006-02-12
2954,0,0,0,0,0,3,1,0,12,3,Bielefeld,Leverkusen,D,2006-02-12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4805,29,6,0,55,16,10,15,3,30,5,Bayern Munich,Hoffenheim,H,2024-12-01
4914,10,15,1,32,7,8,6,0,24,11,Augsburg,Union Berlin,H,2024-12-04
4956,2,12,0,22,4,9,14,4,29,5,Darmstadt,Hoffenheim,A,2024-12-05
4957,24,5,0,55,16,8,15,3,26,2,Bayern Munich,Wolfsburg,H,2024-12-05


In [33]:
buli_df_red
def craft_past_stats_home_team(number_past_games, home_team):
    for past_game in range(number_past_games):
        
# Function to get past 7 matches stats for a given team
def get_past_stats(row, npm):
    # Find all past matches where the current home team participated
    team_h = row['home_team']
    past_matches = df[(df['home_team'] == team_h) | (df['away_team'] == team_h)]
    past_matches = past_matches[past_matches['datetime'] < row['datetime']].tail(npm)  # Get only the last 7 matches

    # Calculate total goals scored by the team in the past 7 games
    goals_scored_as_home = past_matches[past_matches['HomeTeam'] == team_h]['FTHG'].sum()
    goals_scored_as_away = past_matches[past_matches['AwayTeam'] == team_h]['FTAG'].sum()
    p_hg_sco = goals_scored_as_home + goals_scored_as_away

    # Calculate total goals conceded by the team in the past 7 games
    goals_conceded_as_home = past_matches[past_matches['HomeTeam'] == team_h]['FTAG'].sum()
    goals_conceded_as_away = past_matches[past_matches['AwayTeam'] == team_h]['FTHG'].sum()
    p_hg_con = goals_conceded_as_home + goals_conceded_as_away

    # Calculate total points gained through results in the past 7 games
    points_scored_by_wins_as_home = past_matches['FTR']=='H'].sum() * 3
    points_scored_by_wins_as_away = past_matches['FTR']=='A'].sum() * 3
    p_hr_pts = past_matches['FTR']=='D'].sum() + points_scored_by_wins_as_home + points_scored_by_wins_as_away
    

    p_hs_taken
    p_hs_con
    
    p_as_taken
    p_as_con

    p_hst_taken
    p_hst_con
    
    p_ast_taken
    p_ast_con

    p_h_fouls
    p_h_fouled

    p_a_fouls
    p_a_fouled

    p_hc_taken
    p_hc_con

    p_ac_taken
    p_ac_con

    p_hy_rec
    p_hy_prov

    p_ay_rec
    p_ay_prov

    p_hr_rec
    p_hr_prov

    p_ar_rec
    p_ar_prov

    ##Same for the away team
    
    # Find all past matches where the current home team participated
    team_a = row['away_team']
    past_matches = df[(df['home_team'] == team_a) | (df['away_team'] == team_a)]
    past_matches = past_matches[past_matches['datetime'] < row['datetime']].tail(npm)  # Get only the last 7 matches

    # Calculate stats (example: total goals scored by the team in the past 7 games)
    goals_scored_as_home = past_matches[past_matches['home_team'] == team_a]['home_goals'].sum()
    goals_scored_as_away = past_matches[past_matches['away_team'] == team_a]['away_goals'].sum()
    p_ag_sco = goals_scored_as_home + goals_scored_as_away

    # Calculate more stats as needed (e.g., total goals conceded)
    goals_conceded_as_home = past_matches[past_matches['home_team'] == team_a]['away_goals'].sum()
    goals_conceded_as_away = past_matches[past_matches['away_team'] == team_a]['home_goals'].sum()
    p_ag_con = goals_conceded_as_home + goals_conceded_as_away

    p_hr_pts
    p_ar_pts

    p_hs_taken
    p_hs_con
    
    p_as_taken
    p_as_con

    p_hst_taken
    p_hst_con
    
    p_ast_taken
    p_ast_con

    p_h_fouls
    p_h_fouled

    p_a_fouls
    p_a_fouled

    p_hc_taken
    p_hc_con

    p_ac_taken
    p_ac_con

    p_hy_rec
    p_hy_prov

    p_ay_rec
    p_ay_prov

    p_hr_rec
    p_hr_prov

    p_ar_rec
    p_ar_prov

    return pd.Series({
        'goals_scored_last_7': total_goals_scored,
        'goals_conceded_last_7': total_goals_conceded
    })

# Apply the function to each row to get the past stats
df[['goals_scored_last_7', 'goals_conceded_last_7']] = df.apply(get_past_stats, axis=1)

print(df)   

IndentationError: expected an indented block after 'for' statement on line 3 (3480831898.py, line 6)