<a href="https://colab.research.google.com/github/PrabhatGhm7/F1-Prediction/blob/main/F1_Imputer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np

In [4]:
df = pd.read_csv('/content/BeforeImpute.csv')

In [5]:
df.drop('Stops',inplace=True,axis =1)

In [6]:
df.head()

Unnamed: 0,Year,Track,Country,Position,Driver,Team,Starting Grid,Points,Pitstop Time,Weather
0,2001,Albert Park Grand Prix Circuit,Australia,1,Michael Schumacher,Ferrari,1,10.0,28.717,
1,2001,Albert Park Grand Prix Circuit,Australia,2,David Coulthard,McLaren,6,6.0,28.516,
2,2001,Albert Park Grand Prix Circuit,Australia,3,Rubens Barrichello,Ferrari,2,4.0,28.658,
3,2001,Albert Park Grand Prix Circuit,Australia,4,Nick Heidfeld,Sauber,10,3.0,28.247,
4,2001,Albert Park Grand Prix Circuit,Australia,5,Heinz-Harald Frentzen,Jordan,4,2.0,28.142,


In [7]:
df.describe()

Unnamed: 0,Year,Position,Starting Grid,Points,Pitstop Time
count,16059.0,16059.0,16059.0,16059.0,11271.0
mean,2012.584096,10.686531,10.823899,3.881935,44.013408
std,6.503873,5.978039,6.259729,6.253735,24.050892
min,2001.0,1.0,0.0,0.0,13.895
25%,2007.0,6.0,5.0,0.0,24.714
50%,2013.0,11.0,11.0,0.0,40.299
75%,2018.0,16.0,16.0,6.0,55.914
max,2023.0,24.0,24.0,50.0,247.1


In [8]:
def sanity_check(dataframe):
  print("******************Shape************************")
  print(dataframe.shape)

  print("******************Info************************")
  print(dataframe.info())

  print("********************null**********************")
  print(dataframe.isnull().sum())

  print("********************duplicate**********************")
  print(dataframe.duplicated().sum())


In [9]:
sanity_check(df)

******************Shape************************
(16059, 10)
******************Info************************
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16059 entries, 0 to 16058
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Year           16059 non-null  int64  
 1   Track          16059 non-null  object 
 2   Country        16059 non-null  object 
 3   Position       16059 non-null  int64  
 4   Driver         16059 non-null  object 
 5   Team           16059 non-null  object 
 6   Starting Grid  16059 non-null  int64  
 7   Points         16059 non-null  float64
 8   Pitstop Time   11271 non-null  float64
 9   Weather        3277 non-null   object 
dtypes: float64(2), int64(3), object(5)
memory usage: 1.2+ MB
None
********************null**********************
Year                 0
Track                0
Country              0
Position             0
Driver               0
Team                 0
Star

In [10]:
missing_percentage = (df['Pitstop Time'].isna().sum() / len(df)) * 100
print(f"Missing Percentage: {missing_percentage:.2f}%")


Missing Percentage: 29.82%


In [11]:
df.loc[df['Year'] == 2023, 'Pitstop Time'] = df.groupby('Team')['Pitstop Time'].transform(lambda x: x.fillna(x.median()))

In [12]:
from sklearn.impute import KNNImputer
import numpy as np

imputer = KNNImputer(n_neighbors=5)
df[['Pitstop Time']] = imputer.fit_transform(df[['Pitstop Time']])


In [13]:
df['Pitstop Time'].fillna(df['Pitstop Time'].median(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Pitstop Time'].fillna(df['Pitstop Time'].median(), inplace=True)


In [14]:
weather_mode_by_track = (
    df[df['Year'] >= 2020]
    .groupby('Track')['Weather']
    .agg(lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan)
)

# Get the overall weather mode from the available 2020-2023 data
overall_weather_mode = df[df['Year'] >= 2020]['Weather'].mode().iloc[0]

def impute_weather(row):
    # If Weather is already present, return it
    if pd.notna(row['Weather']):
        return row['Weather']

    if row['Year'] < 2020:
        return weather_mode_by_track.get(row['Track'], overall_weather_mode)
    return row['Weather']

df['Weather'] = df.apply(impute_weather, axis=1)


In [15]:
print("Missing Weather values:", df['Weather'].isna().sum())

Missing Weather values: 0


In [16]:
df = df.drop_duplicates(
    subset=['Year','Track','Country','Position','Driver','Team','Starting Grid','Points','Weather'],
    keep='first'
)

In [17]:
df.duplicated().sum()

0

In [18]:
from google.colab import files

df.to_csv('AfterImpute.csv', index=False)

files.download('AfterImpute.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>