# Libraries and Google Drive mount

In [None]:
import pandas as pd
from google.colab import drive
from google.colab import files

# Attach Google Drive to the Colab runtime so the dataset file is reachable
drive.mount('/content/drive')

# Location of the horse-racing CSV on the mounted drive
file_path = '/content/drive/My Drive/filtered_horse_racing_data.csv'

# Read the whole CSV into the working DataFrame
df = pd.read_csv(filepath_or_buffer=file_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Feature analysis and removing poorly correlated features


In [None]:
# Remember how many rows the freshly loaded frame has
initial_rows = df.shape[0]

In [None]:
# Keep only the numeric columns, then skip the first 50 of them.
# The positional slice runs after the dtype filter, so the offset counts
# numeric columns only, not columns of the original frame.
numerical_df = (
    df
    .select_dtypes(include='number')
    .iloc[:, 50:]
)

# Pairwise correlation between all remaining numeric columns
corr_matrix = numerical_df.corr()

In [None]:
# Set the threshold for high correlation
threshold = 0.4

# Collect every pair of distinct features whose absolute pairwise correlation
# exceeds the threshold. Only the lower triangle (j < i) is scanned: the
# matrix is symmetric and the diagonal is each feature against itself.
# NaN correlations never pass the comparison, so they are skipped.
corr_columns = corr_matrix.columns
corr_values = corr_matrix.to_numpy()  # one array extraction instead of an .iloc lookup per cell
highly_correlated_features = {
  (corr_columns[i], corr_columns[j])
  for i in range(len(corr_columns))
  for j in range(i)
  if abs(corr_values[i, j]) > threshold
}

# Every feature that takes part in at least one such pair is kept
columns_to_keep = {col for pair in highly_correlated_features for col in pair}

# Drop the features that have no strongly correlated partner.
# The selection uses a boolean mask so the surviving columns stay in their
# original order; building the selection from list(set) made the column
# order of df_filtered depend on set iteration order, which for strings can
# differ from one kernel session to the next.
df_filtered = numerical_df.loc[:, numerical_df.columns.isin(columns_to_keep)]

# Print the remaining columns
print("Remaining columns after dropping poorly correlated columns:")
print(df_filtered.columns)

Remaining columns after dropping poorly correlated columns:
Index(['hrs_wgt_dbw_imp', 'hrs_lto_unfin_fin', 'hrs_avg_life_wins',
       'hrs_lr_days_off_hcap', 'hrs_wgt_relative', 'leader_sectional',
       'hrs_incident_rate', 'time_diff_per_mile', 'hrs_lr_race_type_key',
       'entry_number',
       ...
       'hrs_ema_5_dbw_imp', 'hrs_best_last_3_mordin_speed', 'hrs_ema_3_dbw',
       'hrs_class_wgt_fp', 'hrs_best_last_10_mordin_speed', 'rating_0to100',
       'hrs_avg_life_fp', 'hrs_lto_incident', 'hrs_lr_dst_win', 'handicap'],
      dtype='object', length=231)


In [None]:
# Create a binary 'won_race' column (1 if position_official is 1, 0 otherwise)
df['won_race'] = (df['position_official'] == 1).astype(int)

# Select only numerical features for correlation analysis
numerical_df = df.select_dtypes(include=['number'])

# Correlation of every numerical feature with 'won_race'.
# corrwith() computes only this one column instead of building the full
# feature-by-feature matrix and discarding everything but 'won_race'.
# The result is a Series indexed by column name; it is renamed so it carries
# the same name ('won_race') as the column slice it replaces.
corr_matrix = numerical_df.corrwith(numerical_df['won_race']).rename('won_race')
threshold = 0.4

# Find highly correlated features (positive or negative) with 'won_race'.
# NaN correlations compare as False and are left out.
# NOTE(review): 'won_race' itself is part of numerical_df, so it always shows
# up here with correlation 1.0, and 'position_official' is the column the
# target is derived from -- treat both as outcome information, not predictors.
highly_correlated_features = corr_matrix[corr_matrix.abs() > threshold]

print("Highly correlated features with 'won_race':")
print(highly_correlated_features)

Highly correlated features with 'won_race':
position_official    -0.404082
position_past_post   -0.458184
ip_max               -0.837116
win                   0.997724
hrs_in_the_money      0.522665
hrs_close_fp          0.599259
hrs_good_race         0.481663
jok_close_fp          0.908002
won_race              1.000000
Name: won_race, dtype: float64
