# Model

In [23]:
# Imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyClassifier
import warnings
from sklearn.tree import export_text
from sklearn.feature_selection import RFE
# Render matplotlib figures inline in the notebook output
%matplotlib inline
# Report each distinct warning only the first time it occurs
warnings.filterwarnings(action='once')
# Apply seaborn's "darkgrid" theme to the figures drawn below
sns.set_theme(style="darkgrid")


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.0 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\Users\aadik\anaconda3\Lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "C:\Users\aadik\anaconda3\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "C:\Users\aadik\anaconda3\Lib\site-packages\ipykernel\kernelapp.py", line 701, in start
    self.io_loop.start()
  File "C:\Users\aadik\anaconda3\Lib\site-pack

ImportError: 
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.0 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.



ImportError: initialization failed

In [None]:
# Load the table produced by preprocessing / feature engineering, discard the
# 'Unnamed: 0' column (presumably a saved index — TODO confirm) and store the
# target as integers.
result_df = (
    pd.read_csv('Data/Modeling_v1.csv')
    .drop(columns=['Unnamed: 0'])
    .assign(top_3=lambda frame: frame.top_3.astype('int'))
)

In [None]:
# List the column names available in the modeling table
result_df.columns

In [None]:
# Summary statistics for the target and the main numeric features,
# transposed so that every variable gets its own row
summary_columns = [
    'top_3',
    'grid',
    'top3_driver_season_percentage',
    'Constructor_Top3_Percent',
    'driver_avg_finish_pos_season',
    'Top_3_at_circuit',
]
result_df[summary_columns].describe().transpose()

In [None]:
# Share of rows that belong to the positive class (top_3 is true)
positive_rows = result_df.loc[result_df.top_3 == True, 'top_3'].count()
total_rows = len(result_df)
print(f'class 1: {positive_rows / total_rows:.2%}')

We have an imbalanced dataset

In [None]:
# Class balance of the target: number of rows per top_3_label value
fig, ax = plt.subplots(figsize=(10, 8))
sns.countplot(x='top_3_label', data=result_df, ax=ax)
ax.set(title='Top 3 Distribution', xlabel='Top 3', ylabel='Drivers')
fig.savefig('Images/TargetDistribution.png')
plt.show()

In [None]:
# Histogram of the starting grid (qualifying) position, 35 bins
fig, ax = plt.subplots(figsize=(10, 8))
sns.histplot(x='grid', data=result_df, bins=35, ax=ax)
ax.set(
    title='Qualifying Position Distribution',
    xlabel='Qualifying Position',
    ylabel='Drivers',
)
fig.savefig('Images/GridDistribution.png')
plt.show()

Normal distribution, as expected; it trails off near the end because some seasons had many more drivers.

In [None]:
# Distribution of the season-average finishing position feature.
# (The unused `features` list that used to be built here was dropped; the
# modeling section defines its own list before it needs one.)
fig, ax = plt.subplots(figsize=(10,8))
# Draw on the axes created above explicitly instead of relying on the implicit current axes
sns.histplot(data=result_df, x='driver_avg_finish_pos_season', ax=ax)
ax.set_title('Average Finishing Position Distribution')
ax.set_xlabel('Average Finishing Position')
ax.set_ylabel('Drivers')
plt.savefig('Images/driver_avg_finish_pos_seasonDistribution.png')
plt.show()

Normal distribution. This is what we expect: over a season some drivers end up above or below the middle position, but most have an average finishing position close to it.

In [None]:
# Histogram of the driver's top-3 percentage for the season; the y-axis is capped at 2000
fig, ax = plt.subplots(figsize=(10, 8))
sns.histplot(x='top3_driver_season_percentage', data=result_df, ax=ax)
ax.set(
    title='Driver Top 3 Percentage Distribution',
    xlabel='Top 3 Percentage',
    ylabel='Drivers',
)
ax.set_ylim(top=2000)
fig.savefig('Images/top3_driver_season_percentageDistribution.png')
plt.show()

Most people would not finish in the top 3 in a season and most of the people who do will only finish in the top 3 a few times except for an elite few

In [None]:
# Histogram of the constructor's top-3 percentage; the y-axis is capped at 2000
fig, ax = plt.subplots(figsize=(10, 8))
sns.histplot(x='Constructor_Top3_Percent', data=result_df, ax=ax)
ax.set(
    title='Constructor Top 3 Percentage Distribution',
    xlabel='Top 3 Percentage',
    ylabel='Drivers',
)
ax.set_ylim(top=2000)
fig.savefig('Images/Constructor_Top3_PercentDistribution.png')
plt.show()

Provides the same insight as above, but for teams: only a few teams finish in the top 3, and most of those do so only a few times a season, except for an elite few.

In [None]:
# Histogram of the Top_3_at_circuit feature; the y-axis is capped at 2000
fig, ax = plt.subplots(figsize=(10,8))
sns.histplot(data=result_df, x='Top_3_at_circuit', ax=ax)
ax.set_title('Top 3 at Circuit Percentage Distribution')
ax.set_xlabel('Top 3 at Circuit Percentage')
ax.set_ylabel('Drivers')
ax.set_ylim(top=2000)
# Save under this feature's own name: the previous target path was the
# constructor plot's file, so this figure silently overwrote it.
plt.savefig('Images/Top_3_at_circuitDistribution.png')
plt.show()

Right skewed distribution, makes sense it is hard to finish in the top 3 at a circuit multiple times

### Getting general overview of relationships

In [None]:
# Pairwise correlations between the target and the candidate features,
# drawn as an annotated heatmap
corr_columns = [
    'top_3',
    'driver_avg_finish_pos_season',
    'Constructor_Top3_Percent',
    'top3_driver_season_percentage_lag',
    'driver_avg_finish_pos_season_lag',
    'Constructor_Top3_Percent_lag',
    'top3_driver_season_percentage',
    'grid',
    'Top_3_at_circuit',
]
corr = result_df[corr_columns].corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, ax=ax, annot=True)
fig.savefig('Images/CorrelationMap.png', bbox_inches="tight")
plt.show()

Some features correlate well with top_3; there is also some collinearity between the features.

### Taking a closer look at variables that are closely correlated with the target (top_3)

In [None]:
# Season-average finishing position for top-3 rows (upper panel, green)
# versus all other rows (lower panel, red), drawn on shared axes
fig, ax = plt.subplots(nrows=2, sharex=True, sharey=True, figsize=(10, 8))
class_panels = [(1, 'Top 3', 'green'), (0, 'Not Top 3', 'red')]
for panel, (flag, panel_title, colour) in zip(ax, class_panels):
    sns.histplot(data=result_df[result_df.top_3 == flag], x='driver_avg_finish_pos_season', color=colour, ax=panel)
    panel.set_title(panel_title)
    panel.set_ylabel('')  # the figure-level y label below replaces the per-panel ones
ax[1].set_xlabel('')
fig.suptitle('Distribution Top 3 and Not Top 3 vs Average Finishing Position', fontsize=12, fontweight='bold')
fig.supylabel('Drivers')
fig.supxlabel('Average Finishing Position')
fig.savefig('Images/PositionOrderVsdriver_avg_finish_pos_season.png')
plt.show()

Could be a good feature for predicting top 3, as the distributions are noticeably different: for drivers in the top 3 the mean is lower and the whole distribution is shifted to the left compared with drivers who didn't finish in the top 3.

In [None]:
# Lagged average finishing position for top-3 rows (upper panel, green)
# versus all other rows (lower panel, red), drawn on shared axes
fig, ax = plt.subplots(nrows=2, sharex=True, sharey=True, figsize=(10, 8))
class_panels = [(1, 'Top 3', 'green'), (0, 'Not Top 3', 'red')]
for panel, (flag, panel_title, colour) in zip(ax, class_panels):
    sns.histplot(data=result_df[result_df.top_3 == flag], x='driver_avg_finish_pos_season_lag', color=colour, ax=panel)
    panel.set_title(panel_title)
    panel.set_ylabel('')  # the figure-level y label below replaces the per-panel ones
ax[1].set_xlabel('')
fig.suptitle('Distribution Top 3 and Not Top 3 vs Average Finishing Position Last Season', fontsize=12, fontweight='bold')
fig.supylabel('Drivers')
fig.supxlabel('Average Finishing Position Last Season')
fig.savefig('Images/driver_avg_pos_season_lag.png')
plt.show()

Could be a good predictor as the distributions are different

In [None]:
# Driver top-3 percentage this season for top-3 rows (upper panel, green)
# versus all other rows (lower panel, red), drawn on shared axes
fig, ax = plt.subplots(nrows=2, sharex=True, sharey=True, figsize=(10, 8))
class_panels = [(1, 'Top 3', 'green'), (0, 'Not Top 3', 'red')]
for panel, (flag, panel_title, colour) in zip(ax, class_panels):
    sns.histplot(data=result_df[result_df.top_3 == flag], x='top3_driver_season_percentage', color=colour, ax=panel)
    panel.set_title(panel_title)
    panel.set_ylabel('')  # the figure-level y label below replaces the per-panel ones
# The y-axis is shared, so capping the lower panel caps both panels
ax[1].set_ylim(top=1000)
ax[1].set_xlabel('')
fig.suptitle('Distribution Top 3 and Not Top 3 vs Top 3 Driver Percentage', fontsize=12, fontweight='bold')
fig.supylabel('Drivers')
fig.supxlabel('Top 3 Percentage This Season')
fig.savefig('Images/top3_driver_season_percentage.png')
plt.show()

Could be a good predictor as the distributions are very different 

In [None]:
# Lagged driver top-3 percentage for top-3 rows (upper panel, green)
# versus all other rows (lower panel, red), drawn on shared axes
fig, ax = plt.subplots(nrows=2, sharex=True, sharey=True, figsize=(10, 8))
class_panels = [(1, 'Top 3', 'green'), (0, 'Not Top 3', 'red')]
for panel, (flag, panel_title, colour) in zip(ax, class_panels):
    sns.histplot(data=result_df[result_df.top_3 == flag], x='top3_driver_season_percentage_lag', color=colour, ax=panel)
    panel.set_title(panel_title)
    panel.set_ylabel('')  # the figure-level y label below replaces the per-panel ones
# The y-axis is shared, so capping the lower panel caps both panels
ax[1].set_ylim(top=1000)
ax[1].set_xlabel('')
fig.suptitle('Distribution Top 3 and Not Top 3 vs Driver Top 3 Percentage Last Year', fontsize=12, fontweight='bold')
fig.supylabel('Drivers')
fig.supxlabel('Top 3 Percentage Last Season')
fig.savefig('Images/top3_driver_season_percentage_lag.png')
plt.show()

Could be a good predictor as the distributions look to be slightly different

In [None]:
# Constructor top-3 percentage for top-3 rows (upper panel, green)
# versus all other rows (lower panel, red), drawn on shared axes
fig, ax = plt.subplots(nrows=2, sharex=True, sharey=True, figsize=(10, 8))
class_panels = [(1, 'Top 3', 'green'), (0, 'Not Top 3', 'red')]
for panel, (flag, panel_title, colour) in zip(ax, class_panels):
    sns.histplot(data=result_df[result_df.top_3 == flag], x='Constructor_Top3_Percent', color=colour, ax=panel)
    panel.set_title(panel_title)
    panel.set_ylabel('')  # the figure-level y label below replaces the per-panel ones
# The y-axis is shared, so capping the lower panel caps both panels
ax[1].set_ylim(top=1000)
ax[1].set_xlabel('')
fig.suptitle('Distribution Top 3 and Not Top 3 vs Constructor Top 3 Percentage', fontsize=12, fontweight='bold')
fig.supylabel('Drivers')
fig.supxlabel('Constructor Top 3 Percentage')
fig.savefig('Images/Constructor_Top_3Percent.png')
plt.show()

Could be a good predictor, as the distributions look slightly different.

In [None]:
# Grid (qualifying) position for top-3 rows (upper panel, green)
# versus all other rows (lower panel, red), 10 bins each, on shared axes
fig, ax = plt.subplots(nrows=2, sharex=True, sharey=True, figsize=(10, 8))
class_panels = [(1, 'Top 3', 'green'), (0, 'Not Top 3', 'red')]
for panel, (flag, panel_title, colour) in zip(ax, class_panels):
    sns.histplot(data=result_df[result_df.top_3 == flag], x='grid', bins=10, color=colour, ax=panel)
    panel.set_title(panel_title)
    panel.set_ylabel('')  # the figure-level y label below replaces the per-panel ones
ax[1].set_xlabel('')
fig.suptitle('Distribution Top 3 and Not Top 3 vs Qualifying Position', fontsize=12, fontweight='bold')
fig.supylabel('Drivers')
fig.supxlabel('Qualifying Position')
fig.savefig('Images/grid.png')
plt.show()

Could be a very good predictor as the distributions are very different

In [None]:
# Top_3_at_circuit for top-3 rows (upper panel, green) versus all other rows
# (lower panel, red), 10 bins each, on shared axes
fig, ax = plt.subplots(nrows=2, sharex=True, sharey=True, figsize=(10, 8))
class_panels = [(1, 'Top 3', 'green'), (0, 'Not Top 3', 'red')]
for panel, (flag, panel_title, colour) in zip(ax, class_panels):
    sns.histplot(data=result_df[result_df.top_3 == flag], x='Top_3_at_circuit', bins=10, color=colour, ax=panel)
    panel.set_title(panel_title)
    panel.set_ylabel('')  # the figure-level y label below replaces the per-panel ones
# The y-axis is shared, so capping the lower panel caps both panels
ax[1].set_ylim(top=2000)
ax[1].set_xlabel('')
fig.suptitle('Distribution Top 3 and Not Top 3 vs Top 3 Percentage at Circuit', fontsize=12, fontweight='bold')
fig.supylabel('Drivers')
fig.supxlabel('Top 3 at Circuit Percentage')
fig.savefig('Images/top_3_at_circuit.png')
plt.show()

Could be a good predictor as the distributions are slightly different

## Modeling

In [None]:
# Chronological hold-out: fit on the earlier seasons and evaluate on the later
# ones, so no information from the evaluation seasons leaks into training.
features = [
    'driver_avg_finish_pos_season',
    'top3_driver_season_percentage',
    'driver_avg_finish_pos_season_lag',
    'top3_driver_season_percentage_lag',
    'Constructor_Top3_Percent',
    'grid',
    'Top_3_at_circuit',
    'Constructor_Top3_Percent_lag',
]

# Index the rows by their 'year' value (the column itself is kept) so they can be sliced by label
result_df = result_df.set_index(result_df['year'])
train = result_df.loc['1980-01-01':'2009-01-01'].reset_index(drop=True)
test = result_df.loc['2010-01-01':]  # note: the test rows keep the year index

X_train, y_train = train[features], train['top_3']
X_test, y_test = test[features], test['top_3']
print(train.shape)
print(test.shape)

# Cross-validation folds that always validate on rows located after the rows
# they train on (assumes the rows are in chronological order — TODO confirm)
tss = TimeSeriesSplit(n_splits=5)

In [None]:
# Start modeling with a decision tree: exhaustive grid search scored by F1,
# cross-validated with the time-ordered splitter defined above.
params = {
    # 'log_loss' is left out: scikit-learn documents it as the same criterion as
    # 'entropy', so it only added a third more candidates to fit without ever
    # changing the selected model.
    'criterion': ['entropy', 'gini'],
    'max_depth': list(range(3,15)),
    'min_samples_split': list(range(30, 50)),
    'min_impurity_decrease':[.0001, .001, .01, .1]
}
DecisionTreeGridSearch = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid=params,
    n_jobs=-1,
    scoring='f1',
    # Pass the splitter itself rather than the one-shot generator from
    # tss.split(...); the folds are the same, and the search can be fitted again.
    cv=tss,
    verbose=True,
)
DecisionTreeGridSearch.fit(X_train, y_train)
best_decision_tree = DecisionTreeGridSearch.best_estimator_

In [None]:
# Performance metrics for the tuned decision tree on the training and held-out data
train_pred = best_decision_tree.predict(X_train)
test_prediction = best_decision_tree.predict(X_test)

# Predicted probability of the positive class (second column of predict_proba)
train_probabilities = best_decision_tree.predict_proba(X_train)[:, 1]
probabilities = best_decision_tree.predict_proba(X_test)[:, 1]

# ROC AUC measures ranking quality, so it is computed from the probabilities;
# feeding it the thresholded 0/1 predictions collapses the curve to one point.
train_score = roc_auc_score(y_train, train_probabilities)
test_score = roc_auc_score(y_test, probabilities)

# Create a DataFrame with predictions and actual values
predictions_df = pd.DataFrame({'probabilities': probabilities, 'actual': y_test})

# Select the top 10% entries with the highest probability
# (rows with equal probability are kept in their original order)
top_10_percent = predictions_df.nlargest(int(len(X_test) * 0.1), 'probabilities')

# Calculate the number of actual positives in the top 10%
top_10_actual_positives = top_10_percent['actual'].sum()

# Lift = positive rate inside the top 10% divided by the overall positive rate
lift = (top_10_actual_positives/top_10_percent.actual.count()) / (y_test.sum() / y_test.count())

print(f'Best score: {DecisionTreeGridSearch.best_score_: .2f}')
print(f'Best params: {DecisionTreeGridSearch.best_params_}')
print(f'Train roc_auc: {train_score: .2}')
print(f'Test roc_auc: {test_score: .2}')
print(f'Train Accuracy: {best_decision_tree.score(X_train, y_train): .2%}')
print(f'Test Accuracy: {best_decision_tree.score(X_test, y_test): .2%}')
print(f'Train F1: {f1_score(y_train, train_pred): .2%}')
print(f'Test F1: {f1_score(y_test, test_prediction): .2%}')
print(f'Test recall: {recall_score(y_test, test_prediction): .2%}')
print(f'Test precision: {precision_score(y_test, test_prediction): .2%}')
print(f"Lift of the top 10%: {lift: .2}")

In [None]:
# Confusion matrix for the decision tree on the held-out data; normalize='true'
# turns every row (actual class) into fractions that sum to 1
cm = confusion_matrix(y_test, test_prediction, normalize='true')
fig, ax = plt.subplots(figsize=(10,8))
# Named cm_display rather than `display`, which would shadow IPython's
# built-in display() for every later cell in the notebook.
cm_display = ConfusionMatrixDisplay(cm, display_labels=best_decision_tree.classes_)
cm_display.plot(ax=ax)
plt.show()

In [None]:
# Baseline: a DummyClassifier with its default strategy, used as a floor that
# the real models must beat
dummy = DummyClassifier(random_state=42)
dummy.fit(X_train, y_train)
train_pred = dummy.predict(X_train)
test_pred = dummy.predict(X_test)
train_probabilities = dummy.predict_proba(X_train)[:, 1]
probabilities = dummy.predict_proba(X_test)[:, 1]

# Create a DataFrame with predictions and actual values
predictions_df = pd.DataFrame({'probabilities': probabilities, 'actual': y_test})

# Select the top 10% entries with the highest probability.
# NOTE(review): the default dummy strategy presumably gives every row the same
# score, in which case nlargest just keeps the first rows and this lift says
# nothing about ranking quality — confirm before quoting it.
top_10_percent = predictions_df.nlargest(int(len(X_test) * 0.1), 'probabilities')

# Calculate the number of actual positives in the top 10%
top_10_actual_positives = top_10_percent['actual'].sum()

# Lift = positive rate inside the top 10% divided by the overall positive rate
lift = (top_10_actual_positives/top_10_percent.actual.count()) / (y_test.sum() / y_test.count())

print(f'Train accuracy: {dummy.score(X_train, y_train): .2%}')
print(f'Test accuracy: {dummy.score(X_test, y_test): .2%}')
# ROC AUC is computed from scores, consistently with the other models
print(f'Train roc_auc: {roc_auc_score(y_train, train_probabilities): .2}')
print(f'Test roc_auc: {roc_auc_score(y_test, probabilities): .2}')
# zero_division=0 reports 0 without an undefined-metric warning when the
# baseline never predicts the positive class
print(f'Train F1: {f1_score(y_train, train_pred, zero_division=0): .2%}')
print(f'Test F1: {f1_score(y_test, test_pred, zero_division=0): .2%}')
print(f'Test recall: {recall_score(y_test, test_pred): .2%}')
print(f'Test precision: {precision_score(y_test, test_pred, zero_division=0): .2%}')
print(f'Lift of the top 10%: {lift: .2}')

In [None]:
# Random forest, tuned by grid search on F1 with the time-ordered splitter.
# It uses the decision-tree feature set minus 'Constructor_Top3_Percent_lag';
# the column list is written once so train and test cannot drift apart.
forest_features = [
    'driver_avg_finish_pos_season',
    'top3_driver_season_percentage',
    'driver_avg_finish_pos_season_lag',
    'top3_driver_season_percentage_lag',
    'Constructor_Top3_Percent',
    'grid',
    'Top_3_at_circuit',
]
X_train_forest = train[forest_features]
X_test_forest = test[forest_features]

param_grid={
    # 'log_loss' is left out: scikit-learn documents it as the same criterion as
    # 'entropy', so it only added a third more forests to fit without ever
    # changing the selected model.
    'criterion': ['gini', 'entropy'],
    'n_estimators': [35, 45, 50],
    'max_depth': list(range(5,15)),
    'min_samples_split': list(range(25,30)),
    'min_impurity_decrease':[.0001, .001]
}

randomforestgridsearch = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    n_jobs=-1,
    scoring='f1',
    # Pass the splitter itself rather than the one-shot generator from
    # tss.split(...); the folds are the same, and the search can be fitted again.
    cv=tss,
    verbose=True,
)
randomforestgridsearch.fit(X_train_forest, y_train)
best_forest = randomforestgridsearch.best_estimator_

In [None]:
# Random forest performance metrics on the training and held-out data
train_pred = best_forest.predict(X_train_forest)
# Predict once on the test set; both names are kept bound for compatibility
test_prediction = best_forest.predict(X_test_forest)
pred = test_prediction

# Predicted probability of the positive class (second column of predict_proba)
train_probabilities = best_forest.predict_proba(X_train_forest)[:, 1]
probabilities = best_forest.predict_proba(X_test_forest)[:, 1]

# Create a DataFrame with predictions and actual values
predictions_df = pd.DataFrame({'probabilities': probabilities, 'actual': y_test})

# Select the top 10% entries with the highest probability
top_10_percent = predictions_df.nlargest(int(len(X_test_forest) * 0.1), 'probabilities')

# Calculate the number of actual positives in the top 10%
top_10_actual_positives = top_10_percent['actual'].sum()

# Lift = positive rate inside the top 10% divided by the overall positive rate
lift = (top_10_actual_positives/top_10_percent.actual.count()) / (y_test.sum() / y_test.count())

print(f'Best score: {randomforestgridsearch.best_score_: .2%}')
print(f'Best params: {randomforestgridsearch.best_params_}')
print(f'Train Accuracy: {best_forest.score(X_train_forest, y_train): .2%}')
print(f'Test Accuracy: {best_forest.score(X_test_forest, y_test): .2%}')
# ROC AUC measures ranking quality, so it is computed from the probabilities
# rather than from the thresholded 0/1 predictions
print(f'Train roc_auc: {roc_auc_score(y_train, train_probabilities): .2f}')
print(f'Test roc_auc: {roc_auc_score(y_test, probabilities): .2f}')
print(f'Train F1: {f1_score(y_train, train_pred): .2%}')
print(f'Test F1: {f1_score(y_test, pred): .2%}')
print(f'Test recall: {recall_score(y_test, pred): .2%}')
print(f'Test precision: {precision_score(y_test, pred): .2%}')
print(f'Lift of the top 10%: {lift: .2f}')

In [None]:
# Refit the random forest on the complete dataset for the interactive predictor below.
# RandomForestClassifier, GridSearchCV and TimeSeriesSplit come from the import
# cell at the top of the notebook.
#
# NOTE: this cell rebinds `features`, `tss`, `param_grid` and `best_forest`.
# Re-running the earlier random-forest evaluation cell after this one would
# score a model that has already been trained on the held-out rows.

# Load dataset
df = pd.read_csv("Data/Modeling_v1.csv")

# Select features and target
features = [
    'driver_avg_finish_pos_season',
    'top3_driver_season_percentage',
    'driver_avg_finish_pos_season_lag',
    'top3_driver_season_percentage_lag',
    'Constructor_Top3_Percent',
    'grid',
    'Top_3_at_circuit'
]
target = df['top_3'].astype(int)  # Convert True/False to 1/0

X = df[features]
y = target

# Time-ordered folds: each fold validates on rows located after the rows it
# trains on (assumes the CSV rows are in chronological order — TODO confirm)
tss = TimeSeriesSplit(n_splits=5)

# Define hyperparameters
param_grid = {
    # 'log_loss' is left out: scikit-learn documents it as the same criterion as
    # 'entropy', so it only added a third more forests to fit without ever
    # changing the selected model.
    'criterion': ['gini', 'entropy'],
    'n_estimators': [35, 45, 50],
    'max_depth': list(range(5, 15)),
    'min_samples_split': list(range(25, 30)),
    'min_impurity_decrease': [0.0001, 0.001]
}

# Run GridSearchCV; the splitter is passed directly instead of the one-shot
# generator from tss.split(X, y) — same folds, and the search can be fitted again
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid=param_grid,
                           n_jobs=-1, scoring='f1', cv=tss, verbose=1)
grid_search.fit(X, y)
best_forest = grid_search.best_estimator_

# --- User Input Function ---
def predict_top3(model=None, values=None):
    """Predict whether a driver finishes in the top 3 and print the verdict.

    Called with no arguments it behaves as before: every feature is read from
    standard input and the notebook-level ``best_forest`` makes the prediction.

    Parameters
    ----------
    model : object with a ``predict`` method, optional
        Receives a one-row DataFrame holding the seven feature columns listed
        in ``fields`` below. Defaults to the global ``best_forest``.
    values : mapping, optional
        Feature name -> value. Any feature present here is not prompted for,
        which allows the function to run without a keyboard.

    Returns
    -------
    None
        The result is printed, not returned.
    """
    # (column name, prompt text, converter), in the same column order as the
    # feature list the forest is fitted on
    fields = [
        ('driver_avg_finish_pos_season', "Driver Avg Finish Pos This Season: ", float),
        ('top3_driver_season_percentage', "Top3 % This Season: ", float),
        ('driver_avg_finish_pos_season_lag', "Driver Avg Finish Pos Last Season: ", float),
        ('top3_driver_season_percentage_lag', "Top3 % Last Season: ", float),
        ('Constructor_Top3_Percent', "Constructor Top3 % This Season: ", float),
        ('grid', "Grid Position: ", int),
        ('Top_3_at_circuit', "Top 3 Finishes at Circuit: ", float),
    ]

    if model is None:
        model = best_forest
    supplied = {} if values is None else dict(values)

    def _read_number(prompt, convert):
        """Ask until the reply converts cleanly; a typo no longer aborts the cell."""
        while True:
            reply = input(prompt)
            try:
                return convert(reply)
            except ValueError:
                print(f"'{reply}' is not a valid number, please try again.")

    # Only show the header when at least one value has to be typed in
    if any(name not in supplied for name, _, _ in fields):
        print("\nEnter the following driver stats:")

    user_input = {}
    for name, prompt, convert in fields:
        if name in supplied:
            user_input[name] = convert(supplied[name])
        else:
            user_input[name] = _read_number(prompt, convert)

    input_df = pd.DataFrame([user_input])
    prediction = model.predict(input_df)[0]
    print("\n Prediction:", "Top 3 Finish" if prediction == 1 else "Not in Top 3")


# Run the interactive predictor: prompts for the driver stats and prints the prediction
predict_top3()
