In [1]:
import os

dataset_files = [
    "drivers.csv", "constructors.csv", "races.csv", "results.csv",
    "qualifying.csv", "lap_times.csv", "pit_stops.csv",
    "driver_standings.csv", "constructor_standings.csv"
]

missing_files = [file for file in dataset_files if not os.path.exists(file)]
print("Missing files:", missing_files)

if len(missing_files) == 0:
    print("✅ All dataset files are present.")
else:
    print("⚠️ Some datasets are missing! Please check file names.")


Missing files: []
✅ All dataset files are present.


In [2]:

import pandas as pd

datasets = {}
for file in dataset_files:
    if os.path.exists(file):
        datasets[file.replace(".csv", "")] = pd.read_csv(file)
    else:
        print(f"⚠️ Warning: {file} not found!")

print("Loaded datasets:", datasets.keys())  # Check successfully loaded datasets


Loaded datasets: dict_keys(['drivers', 'constructors', 'races', 'results', 'qualifying', 'lap_times', 'pit_stops', 'driver_standings', 'constructor_standings'])


In [3]:

import pandas as pd

datasets = {}
for file in dataset_files:
    if os.path.exists(file):
        datasets[file.replace(".csv", "")] = pd.read_csv(file)
    else:
        print(f"⚠️ Warning: {file} not found!")

print("Loaded datasets:", datasets.keys())  # Check successfully loaded datasets


Loaded datasets: dict_keys(['drivers', 'constructors', 'races', 'results', 'qualifying', 'lap_times', 'pit_stops', 'driver_standings', 'constructor_standings'])


In [4]:
for name, df in datasets.items():
    print(f"\n{name} missing values:\n", df.isnull().sum())



drivers missing values:
 driverId       0
driverRef      0
number         0
code           0
forename       0
surname        0
dob            0
nationality    0
url            0
dtype: int64

constructors missing values:
 constructorId     0
constructorRef    0
name              0
nationality       0
url               0
dtype: int64

races missing values:
 raceId         0
year           0
round          0
circuitId      0
name           0
date           0
time           0
url            0
fp1_date       0
fp1_time       0
fp2_date       0
fp2_time       0
fp3_date       0
fp3_time       0
quali_date     0
quali_time     0
sprint_date    0
sprint_time    0
dtype: int64

results missing values:
 resultId           0
raceId             0
driverId           0
constructorId      0
number             0
grid               0
position           0
positionText       0
positionOrder      0
points             0
laps               0
time               0
milliseconds       0
fastestLap         0
r

In [5]:
for name, df in datasets.items():
    print(f"\n🔹 First 5 rows of {name}:")
    print(df.head())




🔹 First 5 rows of drivers:
   driverId   driverRef number code  forename     surname         dob  \
0         1    hamilton     44  HAM     Lewis    Hamilton  1985-01-07   
1         2    heidfeld     \N  HEI      Nick    Heidfeld  1977-05-10   
2         3     rosberg      6  ROS      Nico     Rosberg  1985-06-27   
3         4      alonso     14  ALO  Fernando      Alonso  1981-07-29   
4         5  kovalainen     \N  KOV    Heikki  Kovalainen  1981-10-19   

  nationality                                             url  
0     British     http://en.wikipedia.org/wiki/Lewis_Hamilton  
1      German      http://en.wikipedia.org/wiki/Nick_Heidfeld  
2      German       http://en.wikipedia.org/wiki/Nico_Rosberg  
3     Spanish    http://en.wikipedia.org/wiki/Fernando_Alonso  
4     Finnish  http://en.wikipedia.org/wiki/Heikki_Kovalainen  

🔹 First 5 rows of constructors:
   constructorId constructorRef        name nationality  \
0              1        mclaren     McLaren     British  

In [6]:
for name, df in datasets.items():
    print(f"\n🔹 Missing values in {name}:")
    print(df.isnull().sum())



🔹 Missing values in drivers:
driverId       0
driverRef      0
number         0
code           0
forename       0
surname        0
dob            0
nationality    0
url            0
dtype: int64

🔹 Missing values in constructors:
constructorId     0
constructorRef    0
name              0
nationality       0
url               0
dtype: int64

🔹 Missing values in races:
raceId         0
year           0
round          0
circuitId      0
name           0
date           0
time           0
url            0
fp1_date       0
fp1_time       0
fp2_date       0
fp2_time       0
fp3_date       0
fp3_time       0
quali_date     0
quali_time     0
sprint_date    0
sprint_time    0
dtype: int64

🔹 Missing values in results:
resultId           0
raceId             0
driverId           0
constructorId      0
number             0
grid               0
position           0
positionText       0
positionOrder      0
points             0
laps               0
time               0
milliseconds       0
fastes

In [7]:
for name, df in datasets.items():
    # Fill missing numerical values with the median
    num_cols = df.select_dtypes(include=['int64', 'float64']).columns
    df[num_cols] = df[num_cols].fillna(df[num_cols].median())

    # Fill missing categorical values with the most frequent value
    cat_cols = df.select_dtypes(include=['object']).columns
    df[cat_cols] = df[cat_cols].fillna(df[cat_cols].mode().iloc[0])

    print(f"Missing values after handling in {name}:\n", df.isnull().sum(), "\n")


Missing values after handling in drivers:
 driverId       0
driverRef      0
number         0
code           0
forename       0
surname        0
dob            0
nationality    0
url            0
dtype: int64 

Missing values after handling in constructors:
 constructorId     0
constructorRef    0
name              0
nationality       0
url               0
dtype: int64 

Missing values after handling in races:
 raceId         0
year           0
round          0
circuitId      0
name           0
date           0
time           0
url            0
fp1_date       0
fp1_time       0
fp2_date       0
fp2_time       0
fp3_date       0
fp3_time       0
quali_date     0
quali_time     0
sprint_date    0
sprint_time    0
dtype: int64 

Missing values after handling in results:
 resultId           0
raceId             0
driverId           0
constructorId      0
number             0
grid               0
position           0
positionText       0
positionOrder      0
points             0
laps       

In [8]:
for name, df in datasets.items():
    print(f"\n🔹 Data types in {name}:")
    print(df.dtypes)



🔹 Data types in drivers:
driverId        int64
driverRef      object
number         object
code           object
forename       object
surname        object
dob            object
nationality    object
url            object
dtype: object

🔹 Data types in constructors:
constructorId      int64
constructorRef    object
name              object
nationality       object
url               object
dtype: object

🔹 Data types in races:
raceId          int64
year            int64
round           int64
circuitId       int64
name           object
date           object
time           object
url            object
fp1_date       object
fp1_time       object
fp2_date       object
fp2_time       object
fp3_date       object
fp3_time       object
quali_date     object
quali_time     object
sprint_date    object
sprint_time    object
dtype: object

🔹 Data types in results:
resultId             int64
raceId               int64
driverId             int64
constructorId        int64
number              obje

In [9]:
from sklearn.preprocessing import LabelEncoder

for name, df in datasets.items():
    cat_cols = df.select_dtypes(include=['object']).columns
    encoder = LabelEncoder()
    
    for col in cat_cols:
        df[col] = encoder.fit_transform(df[col])  # Convert categories to numbers

    print(f"Converted categorical columns in {name}:\n", df.head())


Converted categorical columns in drivers:
    driverId  driverRef  number  code  forename  surname  dob  nationality  url
0         1        345      28    33       268      307  784            9  526
1         2        361      48    35       326      321  751           20  615
2         3        688      35    69       327      604  788           20  617
3         4         20       4     3       130       16  770           37  259
4         5        438      48    41       175      384  772           18  347
Converted categorical columns in constructors:
    constructorId  constructorRef  name  nationality  url
0              1             132   135            5   99
1              2              18    17           10   17
2              3             209   209            5  173
3              4             167   168            9  131
4              5             197   198           14  140
Converted categorical columns in races:
    raceId  year  round  circuitId  name  date  time 

In [10]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
for name, df in datasets.items():
    num_cols = df.select_dtypes(include=['int64', 'float64']).columns
    df[num_cols] = scaler.fit_transform(df[num_cols])

    print(f"Scaled numerical features in {name}:\n", df.head())


Scaled numerical features in drivers:
    driverId  driverRef  number  code  forename  surname  dob  nationality  url
0 -1.729591        345      28    33       268      307  784            9  526
1 -1.725569        361      48    35       326      321  751           20  615
2 -1.721548        688      35    69       327      604  788           20  617
3 -1.717526         20       4     3       130       16  770           37  259
4 -1.713504        438      48    41       175      384  772           18  347
Scaled numerical features in constructors:
    constructorId  constructorRef  name  nationality  url
0      -1.723886             132   135            5   99
1      -1.707706              18    17           10   17
2      -1.691527             209   209            5  173
3      -1.675347             167   168            9  131
4      -1.659168             197   198           14  140
Scaled numerical features in races:
      raceId      year     round  circuitId  name  date  time  ur

In [11]:
# Select dataset for training
df_main = datasets["races"]  # Replace with the appropriate dataset

# Define features and target
X = df_main.drop(columns=["raceId"])  # Drop irrelevant or ID columns
y = df_main["raceId"]  # Example target column (change based on your need)


In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")


Training data shape: (900, 17)
Testing data shape: (225, 17)


In [13]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import keras_tuner as kt  # Ensure keras_tuner is installed



In [14]:
def build_model(hp):
    model = Sequential()
    model.add(Dense(hp.Int('units', min_value=32, max_value=128, step=32),
                    activation='relu', input_shape=(X_train.shape[1],)))
    model.add(Dropout(hp.Float('dropout', min_value=0.2, max_value=0.5, step=0.1)))
    model.add(Dense(1, activation='linear'))
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])
    return model


In [15]:
import keras_tuner as kt

def build_model(hp):
    model = Sequential()
    model.add(Dense(hp.Int('units', min_value=32, max_value=128, step=32),
                    activation='relu', input_shape=(X_train.shape[1],)))
    model.add(Dropout(hp.Float('dropout', min_value=0.2, max_value=0.5, step=0.1)))
    model.add(Dense(1, activation='linear'))
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])
    return model

tuner = kt.RandomSearch(build_model, objective='val_loss', max_trials=5)
tuner.search(X_train, y_train, epochs=20, validation_data=(X_test, y_test))

best_model = tuner.get_best_models(num_models=1)[0]


Reloading Tuner from .\untitled_project\tuner0.json



  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  saveable.load_own_variables(weights_store.get(inner_path))


In [16]:
# Evaluate the model
test_loss, test_mae = best_model.evaluate(X_test, y_test)
print(f"Test MAE: {test_mae}")

# Make predictions
predictions = best_model.predict(X_test)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 44ms/step - loss: 0.6135 - mae: 0.5465
Test MAE: 0.592371940612793
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step


In [17]:
import keras.saving

keras.saving.save_model(best_model, "best_model.keras")


In [18]:
from tensorflow import keras

# Load the saved model
best_model = keras.models.load_model("best_model.keras")

# Use it to make predictions (replace X_test with your actual test data)
predictions = best_model.predict(X_test)
print(predictions)


  saveable.load_own_variables(weights_store.get(inner_path))


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[[ 2.4492526 ]
 [ 3.244452  ]
 [-0.09496294]
 [-1.3533511 ]
 [ 1.4525199 ]
 [-0.52062535]
 [ 0.6931597 ]
 [-0.47261   ]
 [ 0.76637316]
 [ 1.0657554 ]
 [-0.5850694 ]
 [ 0.4444312 ]
 [ 0.83868456]
 [-0.6604643 ]
 [-0.5684004 ]
 [-0.47465825]
 [-1.2070894 ]
 [-0.75200605]
 [-0.04114651]
 [-0.80244493]
 [-0.4674263 ]
 [ 0.01184632]
 [ 0.41864598]
 [-0.03992938]
 [-0.6530856 ]
 [-0.6703799 ]
 [-0.6942265 ]
 [ 0.13989307]
 [-1.156313  ]
 [ 0.92002916]
 [-0.6119505 ]
 [-0.9538975 ]
 [-0.40466452]
 [-0.80304503]
 [-0.21906708]
 [-0.41783237]
 [-0.6628902 ]
 [-0.470222  ]
 [ 0.3766768 ]
 [-0.11988209]
 [ 0.53124046]
 [-0.6286199 ]
 [ 0.27290392]
 [ 0.64331055]
 [ 0.4828887 ]
 [ 0.41444778]
 [ 0.05020286]
 [-0.67709255]
 [-0.25392246]
 [-0.1286845 ]
 [ 0.4435966 ]
 [ 0.6679722 ]
 [ 0.31709146]
 [-0.7079158 ]
 [ 1.9953618 ]
 [-0.48872066]
 [-0.30028915]
 [-0.571646  ]
 [ 0.1263646 ]
 [ 0.04191257]
 [-0.34890127]
 [-0.77659965

In [19]:
test_loss, test_acc = best_model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc:.4f}")


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - loss: 0.6135 - mae: 0.5465 
Test Accuracy: 0.5924


In [20]:
import tensorflow as tf
converter = tf.lite.TFLiteConverter.from_keras_model(best_model)
tflite_model = converter.convert()

with open("best_model.tflite", "wb") as f:
    f.write(tflite_model)


INFO:tensorflow:Assets written to: C:\Users\SIBIDEV\AppData\Local\Temp\tmpn7jnce3s\assets


INFO:tensorflow:Assets written to: C:\Users\SIBIDEV\AppData\Local\Temp\tmpn7jnce3s\assets


Saved artifact at 'C:\Users\SIBIDEV\AppData\Local\Temp\tmpn7jnce3s'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 17), dtype=tf.float32, name='input_layer')
Output Type:
  TensorSpec(shape=(None, 1), dtype=tf.float32, name=None)
Captures:
  2328894107920: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2328894102160: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2328894101392: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2328894103888: TensorSpec(shape=(), dtype=tf.resource, name=None)


In [21]:
import os
import pandas as pd

# Load datasets
dataset_files = [
    "drivers.csv", "constructors.csv", "races.csv", "results.csv",
    "qualifying.csv", "lap_times.csv", "pit_stops.csv",
    "driver_standings.csv", "constructor_standings.csv"
]

datasets = {}
for file in dataset_files:
    if os.path.exists(file):
        datasets[file.replace(".csv", "")] = pd.read_csv(file)
    else:
        print(f"⚠️ Warning: {file} not found!")

print("\n✅ Loaded datasets:", datasets.keys())

# Example: Extract feature names from the main dataset (modify as needed)
df_main = datasets.get("results")  # Change to the correct dataset used for training

if df_main is not None:
    print("\n🔹 Feature Order Used for Model Training:")
    print(df_main.columns.tolist())  # List all column names
else:
    print("\n⚠️ No main dataset found! Check dataset names.")



✅ Loaded datasets: dict_keys(['drivers', 'constructors', 'races', 'results', 'qualifying', 'lap_times', 'pit_stops', 'driver_standings', 'constructor_standings'])

🔹 Feature Order Used for Model Training:
['resultId', 'raceId', 'driverId', 'constructorId', 'number', 'grid', 'position', 'positionText', 'positionOrder', 'points', 'laps', 'time', 'milliseconds', 'fastestLap', 'rank', 'fastestLapTime', 'fastestLapSpeed', 'statusId']


In [22]:
from flask import Flask, request, jsonify
import keras.saving
from keras.models import load_model
import numpy as np

# Load the trained model
model = load_model("best_model.keras")



  saveable.load_own_variables(weights_store.get(inner_path))


In [None]:
app = Flask(__name__)

@app.route('/predict', methods=['POST'])
def predict():
    try:
        # Get data from request
        data = request.get_json()
        features = np.array(data["features"]).reshape(1, -1)  # Reshape for model input

        # Make prediction
        prediction = model.predict(features)

        # Convert prediction to JSON format
        response = {"prediction": prediction.tolist()}
        return jsonify(response)

    except Exception as e:
        return jsonify({"error": str(e)})

# Run the Flask app
if __name__ == '__main__':
   app.run(debug=True, use_reloader=False)


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [04/Jul/2025 21:11:22] "[33mGET / HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [04/Jul/2025 21:11:22] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
