<a href="https://colab.research.google.com/github/ShubhamNapNap/djs-gdg-tasks/blob/main/Ml_Task.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
pd.set_option("display.max_columns", 200)
df = pd.read_csv("f1_dnf.csv")
df = df[[
    'resultId', 'year', 'round', 'grid', 'positionOrder', 'points', 'laps', 'dob', 'date', 'target_finish',
    'driverRef', 'forename', 'surname', 'nationality_x',
    'constructorRef', 'name', 'nationality_y',
    'circuitRef', 'name_y', 'location', 'country', 'lat', 'lng', 'alt'
]].copy()
df['dob'] = pd.to_datetime(df['dob'])
df['date'] = pd.to_datetime(df['date'])
df['driver_age_at_race'] = (df['date'] - df['dob']).dt.days / 365.25
df['driver_name'] = df['forename'] + ' ' + df['surname']
df = df.drop(columns=['forename', 'surname'])
df = df.loc[~df.duplicated(subset=["year", "round", "driverRef", "constructorRef"])]\
.reset_index(drop=True).copy()
df.head()

Unnamed: 0,resultId,year,round,grid,positionOrder,points,laps,dob,date,target_finish,driverRef,nationality_x,constructorRef,name,nationality_y,circuitRef,name_y,location,country,lat,lng,alt,driver_age_at_race,driver_name
0,2460,2002,13,11,4,3.0,77.0,1979-10-17,2002-08-18,1,raikkonen,Finnish,mclaren,Hungaroring,British,hungaroring,McLaren,Budapest,Hungary,47.5789,19.2486,264,22.836413,Kimi Räikkönen
1,11565,1981,1,23,21,0.0,16.0,1946-05-04,1981-03-15,0,watson,British,mclaren,Long Beach,British,long_beach,McLaren,California,USA,33.7651,-118.189,12,34.863792,John Watson
2,18661,1958,8,0,26,0.0,0.0,1930-03-11,1958-08-03,0,ruttman,American,maserati,Nürburgring,Italian,nurburgring,Maserati,Nürburg,Germany,50.3356,6.9475,578,28.396988,Troy Ruttman
3,25121,2021,8,19,16,0.0,69.0,1999-03-22,2021-06-27,0,mick_schumacher,German,haas,Red Bull Ring,American,red_bull_ring,Haas F1 Team,Spielberg,Austria,47.2197,14.7647,678,22.26694,Mick Schumacher
4,8863,1988,12,0,30,0.0,0.0,1963-05-12,1988-09-11,0,modena,Italian,eurobrun,Autodromo Nazionale di Monza,Italian,monza,Euro Brun,Monza,Italy,45.6156,9.28111,162,25.336071,Stefano Modena


In [None]:
df = df.drop(columns=['resultId', 'dob', 'date', 'driver_name','name', 'name_y', 'location', 'country', 'alt', 'circuitRef','nationality_y','nationality_x','laps','points'])
df.head()

Unnamed: 0,year,round,grid,positionOrder,target_finish,driverRef,constructorRef,lat,lng,driver_age_at_race
0,2002,13,11,4,1,raikkonen,mclaren,47.5789,19.2486,22.836413
1,1981,1,23,21,0,watson,mclaren,33.7651,-118.189,34.863792
2,1958,8,0,26,0,ruttman,maserati,50.3356,6.9475,28.396988
3,2021,8,19,16,0,mick_schumacher,haas,47.2197,14.7647,22.26694
4,1988,12,0,30,0,modena,eurobrun,45.6156,9.28111,25.336071


In [None]:
df.isna().sum()

Unnamed: 0,0
year,0
round,0
grid,0
positionOrder,0
target_finish,0
driverRef,0
constructorRef,0
lat,0
lng,0
driver_age_at_race,0


In [None]:
from sklearn.preprocessing import LabelEncoder

le_constructor = LabelEncoder()
df['constructorRef_encoded'] = le_constructor.fit_transform(df['constructorRef'])

le_driver = LabelEncoder()
df['driverRef_encoded'] = le_driver.fit_transform(df['driverRef'])

# Drop original columns if desired
df.drop(columns=['constructorRef', 'driverRef'], inplace=True)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [None]:
x = df.drop(columns=['target_finish'])
y = df['target_finish']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1, stratify=y)

In [None]:
model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=1)
model.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

In [None]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    classification_report
)


In [None]:
print(f'Accuracy: {accuracy_score(y_test, y_pred):.3f}')
print(f'Precision: {precision_score(y_test, y_pred):.3f}')
print(f'Recall: {recall_score(y_test, y_pred):.3f}')
print(f'F1 Score: {f1_score(y_test, y_pred):.3f}')
print(f'ROC-AUC: {roc_auc_score(y_test, y_pred_proba):.3f}')

print('\nConfusion Matrix:')
print(confusion_matrix(y_test, y_pred))

print('\nClassification Report:')
print(classification_report(y_test, y_pred))

Accuracy: 0.924
Precision: 0.882
Recall: 0.851
F1 Score: 0.866
ROC-AUC: 0.973

Confusion Matrix:
[[1354   66]
 [  86  493]]

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.95      0.95      1420
           1       0.88      0.85      0.87       579

    accuracy                           0.92      1999
   macro avg       0.91      0.90      0.91      1999
weighted avg       0.92      0.92      0.92      1999

