In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE

: 

In [10]:
import pandas as pd
import os

def create_synthetic_data():
    os.makedirs('data/races', exist_ok=True)
    os.makedirs('data/horses', exist_ok=True)

    # Create synthetic race data
    for year in range(1990, 2021):
        race_data = pd.DataFrame({
            'rid': range(1, 101),
            'race_name': [f"Race_{i}" for i in range(1, 101)],
            'country_code': ['US', 'UK', 'FR', 'AU'] * 25,
            'race_conditions': ['good', 'firm', 'soft', 'heavy'] * 25,
            'performance_metric': [round(x * 0.1, 2) for x in range(1, 101)]
        })
        race_data.to_csv(f"data/races/{year}.csv", index=False)

    # Create synthetic horse data
    for year in range(1990, 2021):
        horse_data = pd.DataFrame({
            'rid': range(1, 101),
            'horse_id': range(1001, 1101),
            'horse_name': [f"Horse_{i}" for i in range(1, 101)],
            'performance_metric': [round(x * 0.2, 2) for x in range(1, 101)]
        })
        horse_data.to_csv(f"data/horses/{year}.csv", index=False)

    # Create synthetic forward data
    forward_data = pd.DataFrame({
        'rid': range(1, 101),
        'average_odds': [round(x * 0.05, 2) for x in range(1, 101)],
        'current_rpr': [round(x * 1.1, 2) for x in range(1, 101)],
        'current_tr': [round(x * 0.9, 2) for x in range(1, 101)]
    })
    forward_data.to_csv("data/forward.csv", index=False)

if __name__ == "__main__":
    create_synthetic_data()
    print("Synthetic data created.")


Synthetic data created.


In [4]:
race_data_path = 'C:\\Users\\Saravanan\\OneDrive\\Desktop\\Dataset\\Race predict\\horses_2020.csv'
horse_data_path = 'C:\\Users\\Saravanan\\OneDrive\\Desktop\\Dataset\\Race predict\\races_2020.csv'
forward_data_path = 'C:\\Users\\Saravanan\\OneDrive\\Desktop\\Dataset\\Race predict\\forward.csv'

In [12]:
race_data = pd.concat([pd.read_csv(f"{race_data_path}{year}.csv") for year in range(1990, 2021)], ignore_index=True)
horse_data = pd.concat([pd.read_csv(f"{horse_data_path}{year}.csv") for year in range(1990, 2021)], ignore_index=True)
forward_data = pd.read_csv(forward_data_path)

Data Preprocessing

Handling Missing Values and Encoding Categorical Data

In [17]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

# Example dataframes (replace with your actual data loading)
race_data = pd.DataFrame({
    'race_name': ['Race1', 'Race2', 'Race3'],
    'country_code': ['US', 'UK', 'FR'],
    'race_conditions': ['Sunny', 'Rainy', 'Cloudy'],
    'numeric_column': [10, 20, 30]
})

horse_data = pd.DataFrame({
    'horse_name': ['Horse1', 'Horse2', 'Horse3'],
    'numeric_column': [15, None, 25]
})

# Check and align columns
required_columns = ['country_code', 'race_conditions']

for column in required_columns:
    if column not in horse_data.columns:
        # Add the column or handle the missing column as per your data handling strategy
        horse_data[column] = None  # Example: Adding None values for illustration

# Impute missing values for numeric data (if any)
imputer = SimpleImputer(strategy='mean')
numeric_cols = horse_data.select_dtypes(include=['number']).columns
horse_data[numeric_cols] = imputer.fit_transform(horse_data[numeric_cols])

# Impute missing values for categorical data
for column in required_columns:
    if pd.api.types.is_numeric_dtype(horse_data[column]):
        mode = horse_data[column].mode()[0]
    else:
        mode = horse_data[column].mode()[0]
        mode


KeyError: 0

In [13]:
# Fill missing values
imputer = SimpleImputer(strategy='mean')
race_data = pd.DataFrame(imputer.fit_transform(race_data), columns=race_data.columns)
horse_data = pd.DataFrame(imputer.fit_transform(horse_data), columns=horse_data.columns)

# Encoding categorical variables
label_encoders = {}
for column in ['country_code', 'race_conditions']:
    le = LabelEncoder()
    race_data[column] = le.fit_transform(race_data[column])
    label_encoders[column] = le


ValueError: Cannot use mean strategy with non-numeric data:
could not convert string to float: 'Race_1'

In [None]:
# Feature engineering
# Example: Average past performance of horses
horse_data['avg_performance'] = horse_data.groupby('horse_id')['performance_metric'].transform('mean')

# Data Integration
data = pd.merge(race_data, horse_data, on='rid')
data = pd.merge(data, forward_data, on='rid')

# Select relevant features and target
features = data.drop(['outcome', 'rid'], axis=1)
target = data['outcome']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=42)


Exploratory Data Analysis (EDA)
Descriptive Statistics and Visualization

In [None]:
# Descriptive statistics
print(data.describe())

# Visualization
plt.figure(figsize=(12, 6))
sns.heatmap(data.corr(), annot=True, cmap='coolwarm')
plt.show()

# Distribution plots
sns.histplot(data['performance_metric'])
plt.show()


Model Development
Handling Imbalanced Data and Model Training

In [None]:
# Handling imbalanced data
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Model selection and training
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_res, y_train_res)


Hyperparameter Tuning

In [None]:
# Hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5]
}

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train_res, y_train_res)

best_rf = grid_search.best_estimator_


Evaluation

In [None]:
# Predictions and evaluation
y_pred = best_rf.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='coolwarm')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()


In [None]:
# Export data for visualization
data['predictions'] = best_rf.predict(features)
data.to_csv('predictions.csv', index=False)
