In [1]:
import pandas as pd
import os
import glob

def concatenate_csv_files(file_pattern):
    # Get all files matching the file pattern
    all_files = glob.glob(file_pattern)
    
    # Debugging: Print the list of files found
    print(f"Files found for pattern {file_pattern}: {all_files}")
    
    if not all_files:
        raise ValueError(f"No files found for pattern {file_pattern}")
    
    # Read each file and concatenate into a single DataFrame
    df_list = [pd.read_csv(file) for file in all_files]
    combined_df = pd.concat(df_list, ignore_index=True)
    
    return combined_df

def merge_dataframes(df1, df2, key):
    # Merge the DataFrames on the specified key
    combined_df = pd.merge(df1, df2, on=key)
    
    return combined_df

# Define paths to horse and race data directories
base_path = 'F:\Education and Job\Guvi\Placement prep projects\Horse_Race_Prediction\Data_Sets_1'
horse_data_path = os.path.join(base_path, 'horse_data_cs')
race_data_path = os.path.join(base_path, 'race_data_cs')

# Step 1: Concatenate all horse files
horse_file_pattern = os.path.join(horse_data_path, '*.csv')
horse_df = concatenate_csv_files(horse_file_pattern)

# Step 2: Concatenate all race files
race_file_pattern = os.path.join(race_data_path, '*.csv')
race_df = concatenate_csv_files(race_file_pattern)

# Step 3: Merge the concatenated horse and race DataFrames
# Assuming 'rid' is the key in both horse_df and race_df
combined_df = merge_dataframes(horse_df, race_df, 'rid')

# Save the combined DataFrame to a CSV file for future use
output_path = 'F:\Education and Job\Guvi\Placement prep projects\Horse_Race_Prediction\Combined_data\combined_horse_race_data.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
combined_df.to_csv(output_path, index=False)

# Print the shape of the combined DataFrame
print(combined_df.shape)


  base_path = 'F:\Education and Job\Guvi\Placement prep projects\Horse_Race_Prediction\Data_Sets_1'
  output_path = 'F:\Education and Job\Guvi\Placement prep projects\Horse_Race_Prediction\Combined_data\combined_horse_race_data.csv'


Files found for pattern F:\Education and Job\Guvi\Placement prep projects\Horse_Race_Prediction\Data_Sets_1\horse_data_cs\*.csv: ['F:\\Education and Job\\Guvi\\Placement prep projects\\Horse_Race_Prediction\\Data_Sets_1\\horse_data_cs\\horses_1990.csv', 'F:\\Education and Job\\Guvi\\Placement prep projects\\Horse_Race_Prediction\\Data_Sets_1\\horse_data_cs\\horses_1991.csv', 'F:\\Education and Job\\Guvi\\Placement prep projects\\Horse_Race_Prediction\\Data_Sets_1\\horse_data_cs\\horses_1992.csv', 'F:\\Education and Job\\Guvi\\Placement prep projects\\Horse_Race_Prediction\\Data_Sets_1\\horse_data_cs\\horses_1993.csv', 'F:\\Education and Job\\Guvi\\Placement prep projects\\Horse_Race_Prediction\\Data_Sets_1\\horse_data_cs\\horses_1994.csv', 'F:\\Education and Job\\Guvi\\Placement prep projects\\Horse_Race_Prediction\\Data_Sets_1\\horse_data_cs\\horses_1995.csv', 'F:\\Education and Job\\Guvi\\Placement prep projects\\Horse_Race_Prediction\\Data_Sets_1\\horse_data_cs\\horses_1996.csv', 'F

  df_list = [pd.read_csv(file) for file in all_files]
  df_list = [pd.read_csv(file) for file in all_files]


Files found for pattern F:\Education and Job\Guvi\Placement prep projects\Horse_Race_Prediction\Data_Sets_1\race_data_cs\*.csv: ['F:\\Education and Job\\Guvi\\Placement prep projects\\Horse_Race_Prediction\\Data_Sets_1\\race_data_cs\\races_1990.csv', 'F:\\Education and Job\\Guvi\\Placement prep projects\\Horse_Race_Prediction\\Data_Sets_1\\race_data_cs\\races_1991.csv', 'F:\\Education and Job\\Guvi\\Placement prep projects\\Horse_Race_Prediction\\Data_Sets_1\\race_data_cs\\races_1992.csv', 'F:\\Education and Job\\Guvi\\Placement prep projects\\Horse_Race_Prediction\\Data_Sets_1\\race_data_cs\\races_1993.csv', 'F:\\Education and Job\\Guvi\\Placement prep projects\\Horse_Race_Prediction\\Data_Sets_1\\race_data_cs\\races_1994.csv', 'F:\\Education and Job\\Guvi\\Placement prep projects\\Horse_Race_Prediction\\Data_Sets_1\\race_data_cs\\races_1995.csv', 'F:\\Education and Job\\Guvi\\Placement prep projects\\Horse_Race_Prediction\\Data_Sets_1\\race_data_cs\\races_1996.csv', 'F:\\Education an

In [2]:
combined_df.shape

(4107315, 46)

In [3]:
combined_df.head()

Unnamed: 0,rid,horseName,age,saddle,decimalPrice,isFav,trainerName,jockeyName,position,positionL,...,condition,hurdles,prizes,winningTime,prize,metric,countryCode,ncond,class,currency
0,271018,Combermere,6.0,0.0,0.222222,0,R G Frost,J Frost,1,,...,Soft,19 fences,"[2922.5, 875.0, 420.0, 192.5]",398.3,4409.0,5028.0,GB,5,0,
1,271018,Royal Battery,6.0,0.0,0.090909,0,D H Barons,S Earle,2,10,...,Soft,19 fences,"[2922.5, 875.0, 420.0, 192.5]",398.3,4409.0,5028.0,GB,5,0,
2,271018,Just So,7.0,0.0,0.029412,0,J D Roberts,S Burrough,3,15,...,Soft,19 fences,"[2922.5, 875.0, 420.0, 192.5]",398.3,4409.0,5028.0,GB,5,0,
3,271018,Mandraki Shuffle,8.0,0.0,0.090909,0,Oliver Sherwood,M Richards,4,20,...,Soft,19 fences,"[2922.5, 875.0, 420.0, 192.5]",398.3,4409.0,5028.0,GB,5,0,
4,271018,Turnberry Dawn,8.0,0.0,0.047619,0,T B Hallett,P Richards,5,dist,...,Soft,19 fences,"[2922.5, 875.0, 420.0, 192.5]",398.3,4409.0,5028.0,GB,5,0,


In [4]:
combined_df.columns

Index(['rid', 'horseName', 'age', 'saddle', 'decimalPrice', 'isFav',
       'trainerName', 'jockeyName', 'position', 'positionL', 'dist',
       'weightSt', 'weightLb', 'overWeight', 'outHandicap', 'headGear', 'RPR',
       'TR', 'OR', 'father', 'mother', 'gfather', 'runners', 'margin',
       'weight', 'res_win', 'res_place', 'price', 'course', 'time', 'date',
       'title', 'rclass', 'band', 'ages', 'distance', 'condition', 'hurdles',
       'prizes', 'winningTime', 'prize', 'metric', 'countryCode', 'ncond',
       'class', 'currency'],
      dtype='object')