# **Importing libraries,Exploratory data Analysis,and Feature engineering**

In [1]:
import pandas as pd

# Load data from CSV file
data = pd.read_excel("Final Project Dataset.xlsx", sheet_name="Final Project Dataset")

In [2]:
data.head()

Unnamed: 0,id,id_race,id_horse,horse_name,jockey,trainer,age,weight,number,last_ran_days_ago,non_runner,form,position,distance_beaten,owner,sire,dam,official_rating,starting_price,price
0,1,236933,268310,Roger Pol(IRE),Gavin Sheehan,Jamie Snowden,5,2024-11-04 00:00:00,5,30.0,0,1931-2,1,,Cobbold Allen Ogilvy Shaw Morley,SHANTOU(USA),LAREN(GER),,1.8,£4629
1,2,236933,270046,Call The Dance,Nico de Boinville,Nicky Henderson,5,2024-10-11 00:00:00,7,187.0,0,41-,2,2.75,James &amp; Jean Potter Ltd,KAYF TARA,HORA,,2.62,£4629
2,3,236933,271305,Hiero Sport(FR),Kielan Woods,Alex Hales,6,2024-11-04 00:00:00,3,217.0,0,2-,3,38.0,,,,,10.0,£4629
3,4,236933,268599,Largy Ray(IRE),Ben Poste,Clare Hobson,5,2024-11-04 00:00:00,4,15.0,0,575-1R,4,30.0,Raymond Scullion/Martin McGrogan,WESTERNER,DEBUT(IRE),,101.0,£4629
4,5,236933,270748,Borodale(IRE),Sean Bowen,Olly Murphy,5,2024-11-04 00:00:00,1,24.0,0,44-5,bd,,Mrs Diana L. Whateley,FLEMENSFIRTH(USA),PORTRYAN NATIVE(IRE),,21.0,£4629


In [3]:
# features_to_drop = ["id", "id_race", "id_horse", "weight", "distance_beaten", "position", "form"]

features = ["age",	"last_ran_days_ago", "non_runner", "official_rating", "starting_price", "position"]

In [4]:
new_df = data[features].dropna()


In [5]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6679 entries, 61 to 11590
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   age                6679 non-null   int64  
 1   last_ran_days_ago  6679 non-null   float64
 2   non_runner         6679 non-null   int64  
 3   official_rating    6679 non-null   float64
 4   starting_price     6679 non-null   float64
 5   position           6679 non-null   object 
dtypes: float64(3), int64(2), object(1)
memory usage: 365.3+ KB


In [6]:
y = new_df['position']

invalid_values = ['ur', 'pu', 'F', 'rr', 'bd', 'R', 'ro']

# Filter y to remove rows with invalid values
new_df = new_df[~y.isin(invalid_values)]
new_df['position'] = new_df['position'].astype(int)

X = new_df.drop("position", axis=1)
y = new_df['position']


In [7]:
y.unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22])

**Model Building without balancing the dataset(tried SVM)**

In [8]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Support Vector Classifier (SVC) with a linear kernel
svm_classifier = SVC()

# Train the classifier on the training data
svm_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = svm_classifier.predict(X_test)


In [9]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.12469033856317094


**Accuracy seems very very low ,so we have to balance the dataset**

# **Balancing the dataset**

In [10]:
import pandas as pd
from imblearn.over_sampling import RandomOverSampler

# Oversampling the minority classes
oversample = RandomOverSampler(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = oversample.fit_resample(X, y)

# Creating a DataFrame from the oversampled data
# oversampled_df = pd.DataFrame({'position': X_resampled.flatten(), 'count': y_resampled})

# print(X_resampled)
print(y_resampled.value_counts())


1     673
2     673
21    673
20    673
19    673
18    673
17    673
16    673
15    673
14    673
13    673
12    673
11    673
10    673
9     673
8     673
7     673
6     673
5     673
4     673
3     673
22    673
Name: position, dtype: int64


After Resampling

# **SVM**

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Initialize the Support Vector Classifier (SVC) with a linear kernel
svm_classifier = SVC()

# Train the classifier on the training data
svm_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = svm_classifier.predict(X_test)


In [12]:
# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.3089128966914247


# **GradientBoostingClassifier**

In [13]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# Initialize the Gradient Boosting Classifier
gbm = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)

# Train the model
gbm.fit(X_train, y_train)

# Make predictions on the test set
predictions = gbm.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)


Accuracy: 0.5344361917623227


**Evaluate Gradient Boosting**

In [14]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Generate classification report
print("Classification Report:")
print(classification_report(y_test, predictions))

# Generate confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, predictions))

# Calculate accuracy
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

# Calculate precision
precision = precision_score(y_test, predictions, average='weighted')
print("Precision:", precision)

# Calculate recall
recall = recall_score(y_test, predictions, average='weighted')
print("Recall:", recall)

# Calculate F1 score
f1 = f1_score(y_test, predictions, average='weighted')
print("F1 Score:", f1)


Classification Report:
              precision    recall  f1-score   support

           1       0.17      0.22      0.19       126
           2       0.25      0.17      0.20       164
           3       0.15      0.16      0.15       140
           4       0.17      0.10      0.13       127
           5       0.19      0.14      0.16       129
           6       0.10      0.06      0.07       154
           7       0.15      0.13      0.14       113
           8       0.36      0.17      0.23       151
           9       0.31      0.27      0.29       127
          10       0.20      0.30      0.24       125
          11       0.39      0.48      0.43       158
          12       0.51      0.45      0.48       135
          13       0.43      0.65      0.52       138
          14       0.60      0.66      0.63       149
          15       0.74      1.00      0.85       140
          16       0.80      1.00      0.89       129
          17       0.94      1.00      0.97       128
    

# **xgboost**

In [15]:
import xgboost as xgb
from sklearn.metrics import accuracy_score

# Adjust class labels to start from 0
y_adjusted = y_train - min(y_train)

# Initialize the XGBoost Classifier
xgb_classifier = xgb.XGBClassifier(objective="multi:softmax", num_class=len(set(y_adjusted)), seed=42, base_score=min(y_adjusted))

# Train the model
xgb_classifier.fit(X_train, y_adjusted)

# Make predictions on the test set
predictions = xgb_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)


Accuracy: 0.035111411208642807


# **Cat Boost**

In [17]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.3-cp310-cp310-manylinux2014_x86_64.whl (98.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.5/98.5 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.3


In [18]:
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Initialize the CatBoost Classifier
catboost_classifier = CatBoostClassifier(iterations=100, learning_rate=0.1, random_state=42)

# Train the model
catboost_classifier.fit(X_train, y_train, verbose=100)

# Make predictions on the test set
predictions = catboost_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)


0:	learn: 2.8053056	total: 122ms	remaining: 12.1s
99:	learn: 1.4393160	total: 8.49s	remaining: 0us
Accuracy: 0.5074274139095206


# **LGBM**

In [19]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Adjust class labels to start from 0
y_train_adjusted = y_train - min(y_train)
y_test_adjusted = y_test - min(y_test)

# Convert the data into LightGBM dataset format
train_data = lgb.Dataset(X_train, label=y_train_adjusted)
test_data = lgb.Dataset(X_test, label=y_test_adjusted)

# Set parameters for LightGBM
parameters = {
    'objective': 'multiclass',
    'num_class': len(set(y_train_adjusted)),
    'metric': 'multi_logloss',
    'random_state': 42
}

# Train the model with early stopping
num_round = 100
bst = lgb.train(parameters, train_data, num_round, valid_sets=[test_data])

# Make predictions on the test set
predictions = bst.predict(X_test, num_iteration=bst.best_iteration)

# Convert predictions to class labels
predicted_labels = [list(x).index(max(x)) for x in predictions]

# Calculate accuracy
accuracy = accuracy_score(y_test_adjusted, predicted_labels)
print("Accuracy:", accuracy)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.028731 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 425
[LightGBM] [Info] Number of data points in the train set: 11844, number of used features: 4
[LightGBM] [Info] Start training from score -3.075128
[LightGBM] [Info] Start training from score -3.147129
[LightGBM] [Info] Start training from score -3.101055
[LightGBM] [Info] Start training from score -3.076958
[LightGBM] [Info] Start training from score -3.080627
[LightGBM] [Info] Start training from score -3.127673
[LightGBM] [Info] Start training from score -3.051640
[LightGBM] [Info] Start training from score -3.121909
[LightGBM] [Info] Start training from score -3.076958
[LightGBM] [Info] Start training from score -3.073301
[LightGBM] [Info] Start training from score -3.135410
[LightGBM] [Info] Start training from score -3.091718
[LightGBM] [Info] Start training from score -3.097310
[LightGBM] 

**Evaluating LGBM**

In [20]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Convert predictions to class labels
predicted_labels = [list(x).index(max(x)) for x in predictions]

# Calculate accuracy
accuracy = accuracy_score(y_test_adjusted, predicted_labels)
print("Accuracy:", accuracy)

# Generate classification report
print("\nClassification Report:")
print(classification_report(y_test_adjusted, predicted_labels))

# Generate confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test_adjusted, predicted_labels))

# Calculate precision
precision = precision_score(y_test_adjusted, predicted_labels, average='weighted')
print("\nPrecision:", precision)

# Calculate recall
recall = recall_score(y_test_adjusted, predicted_labels, average='weighted')
print("Recall:", recall)

# Calculate F1 score
f1 = f1_score(y_test_adjusted, predicted_labels, average='weighted')
print("F1 Score:", f1)


Accuracy: 0.6924375422012155

Classification Report:
              precision    recall  f1-score   support

           0       0.25      0.30      0.27       126
           1       0.31      0.22      0.26       164
           2       0.19      0.16      0.18       140
           3       0.16      0.12      0.14       127
           4       0.25      0.24      0.24       129
           5       0.49      0.37      0.42       154
           6       0.37      0.37      0.37       113
           7       0.52      0.44      0.48       151
           8       0.55      0.68      0.61       127
           9       0.55      0.73      0.63       125
          10       0.77      0.84      0.81       158
          11       0.75      0.88      0.81       135
          12       0.91      1.00      0.95       138
          13       0.94      1.00      0.97       149
          14       1.00      1.00      1.00       140
          15       0.98      1.00      0.99       129
          16       0.99     

# **So as we can see LGBM has the highest accuracy,f1,recall,precision over other algorithms .So we finalize it as the final model**

In [21]:
# Save the trained model in both .pkl and .sav formats using joblib
model_filename_pkl = "my_trained_lightgbm_model.pkl"
model_filename_sav = "my_trained_lightgbm_model.sav"

import joblib
joblib.dump(bst, model_filename_pkl)
joblib.dump(bst, model_filename_sav)

print(f"Model saved to: {model_filename_pkl}")
print(f"Model saved to: {model_filename_sav}")

Model saved to: my_trained_lightgbm_model.pkl
Model saved to: my_trained_lightgbm_model.sav
