In [3]:
import pandas as pd

# 1. Load the raw circuits table.
file_path = 'DPL_Datasets/circuits.csv'
circuits = pd.read_csv(file_path)

# 2. Inspect the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
print("Initial Data Head:")
print(circuits.head())
print("\nData Info:")
circuits.info()
print("\nMissing Values:")
print(circuits.isnull().sum())

# 3. Convert numeric columns to proper data types.
# errors='coerce' turns any value that cannot be parsed into NaN.
for col in ['lat', 'lng', 'alt']:
    circuits[col] = pd.to_numeric(circuits[col], errors='coerce')

# 4. Clean string columns by stripping leading/trailing whitespace.
str_cols = ['circuitRef', 'name', 'location', 'country', 'url']
for col in str_cols:
    circuits[col] = circuits[col].str.strip()

# 5. Drop exact duplicate rows and report how many were removed.
# Reassigning (instead of inplace=True) keeps the step explicit and chain-friendly.
initial_count = circuits.shape[0]
circuits = circuits.drop_duplicates()
final_count = circuits.shape[0]
print(f"\nRemoved {initial_count - final_count} duplicate rows.")

# 6. Verify the cleaning steps.
print("\nCleaned Data Head:")
print(circuits.head())
print("\nCleaned Data Info:")
circuits.info()
print("\nMissing Values After Cleaning:")
print(circuits.isnull().sum())

# 7. Save the cleaned dataset to a new CSV file (without the index column).
clean_file_path = 'Cleaned_Dataset/circuits_clean.csv'
circuits.to_csv(clean_file_path, index=False)
print(f"\nCleaned dataset saved to {clean_file_path}")


Initial Data Head:
   circuitId   circuitRef                            name      location  \
0          1  albert_park  Albert Park Grand Prix Circuit     Melbourne   
1          2       sepang    Sepang International Circuit  Kuala Lumpur   
2          3      bahrain   Bahrain International Circuit        Sakhir   
3          4    catalunya  Circuit de Barcelona-Catalunya      Montmeló   
4          5     istanbul                   Istanbul Park      Istanbul   

     country       lat        lng  alt  \
0  Australia -37.84970  144.96800   10   
1   Malaysia   2.76083  101.73800   18   
2    Bahrain  26.03250   50.51060    7   
3      Spain  41.57000    2.26111  109   
4     Turkey  40.95170   29.40500  130   

                                                 url  
0  http://en.wikipedia.org/wiki/Melbourne_Grand_P...  
1  http://en.wikipedia.org/wiki/Sepang_Internatio...  
2  http://en.wikipedia.org/wiki/Bahrain_Internati...  
3  http://en.wikipedia.org/wiki/Circuit_de_Barcel...  
4 

In [6]:
import pandas as pd
import numpy as np

# 1. Load the dataset, treating "\N" as a missing value.
# Forward slashes are used so the path resolves on Windows and on POSIX
# systems alike (a backslash is an ordinary filename character on POSIX).
file_path = 'DPL_Datasets/constructor_results.csv'
df = pd.read_csv(file_path, na_values=["\\N"])

# 2. Inspect the dataset.
# DataFrame.info() prints its own report and returns None, so it is not
# wrapped in print() (that would emit an extra "None" line).
print("Initial Data:")
print(df.head())
print("\nData Info:")
df.info()
print("\nMissing Values:")
print(df.isnull().sum())

# 3. Convert numeric columns to proper types (if not already).
# errors='coerce' turns unparseable values into NaN.
numeric_cols = ['constructorResultsId', 'raceId', 'constructorId', 'points']
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# 4. Clean the 'status' column.
# "\N" markers were already read as NaN; only surrounding whitespace is removed here.
df['status'] = df['status'].str.strip()

# 5. Remove duplicate rows (if any).
initial_count = df.shape[0]
df = df.drop_duplicates()
final_count = df.shape[0]
print(f"\nRemoved {initial_count - final_count} duplicate rows.")

# 6. Replace missing 'status' values with an explicit 'Unknown' label.
df['status'] = df['status'].fillna('Unknown')

# Verify that the missing values have been handled.
print("\nMissing Values After Filling 'status':")
print(df.isnull().sum())

# 7. Final inspection.
print("\nCleaned Data:")
print(df.head())
print("\nCleaned Data Info:")
df.info()
print("\nMissing Values After Cleaning:")
print(df.isnull().sum())

# 8. Save the cleaned dataset to a new CSV file (without the index column).
clean_file_path = 'Cleaned_Dataset/constructor_results_clean.csv'
df.to_csv(clean_file_path, index=False)
print(f"\nCleaned dataset saved to {clean_file_path}")


Initial Data:
   constructorResultsId  raceId  constructorId  points status
0                     1      18              1    14.0    NaN
1                     2      18              2     8.0    NaN
2                     3      18              3     9.0    NaN
3                     4      18              4     5.0    NaN
4                     5      18              5     2.0    NaN

Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12625 entries, 0 to 12624
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   constructorResultsId  12625 non-null  int64  
 1   raceId                12625 non-null  int64  
 2   constructorId         12625 non-null  int64  
 3   points                12625 non-null  float64
 4   status                17 non-null     object 
dtypes: float64(1), int64(3), object(1)
memory usage: 493.3+ KB
None

Missing Values:
constructorResultsId        0
raceId               

In [7]:
import pandas as pd

# 1. Load the dataset from the DPL_Datasets directory.
# Forward slashes keep the path valid on both Windows and POSIX systems.
file_path = 'DPL_Datasets/constructor_standings.csv'
df = pd.read_csv(file_path)

# 2. Inspect the initial data.
# DataFrame.info() prints its report itself and returns None, so it is
# called directly rather than through print().
print("Initial Data Head:")
print(df.head())
print("\nData Info:")
df.info()
print("\nMissing Values:")
print(df.isnull().sum())

# 3. Convert numeric columns to proper types; unparseable values become NaN.
numeric_cols = ['constructorStandingsId', 'raceId', 'constructorId', 'points', 'position', 'wins']
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# 4. Clean string columns by stripping whitespace.
# NOTE(review): .str requires a text column; this assumes 'positionText'
# is read as object dtype - confirm against the df.info() report.
df['positionText'] = df['positionText'].str.strip()

# 5. Remove duplicate rows (if any).
initial_count = df.shape[0]
df = df.drop_duplicates()
final_count = df.shape[0]
print(f"\nRemoved {initial_count - final_count} duplicate rows.")

# 6. Final inspection after cleaning.
print("\nCleaned Data Head:")
print(df.head())
print("\nCleaned Data Info:")
df.info()
print("\nMissing Values After Cleaning:")
print(df.isnull().sum())

# 7. Save the cleaned dataset into the Cleaned_Dataset directory.
output_path = 'Cleaned_Dataset/constructor_standings_clean.csv'
df.to_csv(output_path, index=False)
print(f"\nCleaned dataset saved to {output_path}")


Initial Data Head:
   constructorStandingsId  raceId  constructorId  points  position  \
0                       1      18              1    14.0         1   
1                       2      18              2     8.0         3   
2                       3      18              3     9.0         2   
3                       4      18              4     5.0         4   
4                       5      18              5     2.0         5   

  positionText  wins  
0            1     1  
1            3     0  
2            2     0  
3            4     0  
4            5     0  

Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13391 entries, 0 to 13390
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   constructorStandingsId  13391 non-null  int64  
 1   raceId                  13391 non-null  int64  
 2   constructorId           13391 non-null  int64  
 3   points                  13391 no

In [8]:
import pandas as pd

# 1. Load the dataset.
# Forward slashes keep the path valid on both Windows and POSIX systems.
file_path = 'DPL_Datasets/constructors.csv'
df = pd.read_csv(file_path)

# 2. Inspect the initial data.
# DataFrame.info() prints its report itself and returns None, so it is
# called directly rather than through print().
print("Initial Data Head:")
print(df.head())
print("\nData Info:")
df.info()

# 3. Convert 'constructorId' to numeric; unparseable values become NaN.
df['constructorId'] = pd.to_numeric(df['constructorId'], errors='coerce')

# 4. Clean string columns by stripping leading/trailing whitespace.
str_cols = ['constructorRef', 'name', 'nationality', 'url']
for col in str_cols:
    df[col] = df[col].str.strip()

# 5. Remove duplicate rows (if any).
initial_count = df.shape[0]
df = df.drop_duplicates()
final_count = df.shape[0]
print(f"\nRemoved {initial_count - final_count} duplicate rows.")

# 6. Final inspection after cleaning.
print("\nCleaned Data Head:")
print(df.head())
print("\nCleaned Data Info:")
df.info()

# 7. Save the cleaned dataset to the Cleaned_Dataset directory.
output_path = 'Cleaned_Dataset/constructors_clean.csv'
df.to_csv(output_path, index=False)
print(f"\nCleaned dataset saved to {output_path}")


Initial Data Head:
   constructorId constructorRef        name nationality  \
0              1        mclaren     McLaren     British   
1              2     bmw_sauber  BMW Sauber      German   
2              3       williams    Williams     British   
3              4        renault     Renault      French   
4              5     toro_rosso  Toro Rosso     Italian   

                                                 url  
0               http://en.wikipedia.org/wiki/McLaren  
1            http://en.wikipedia.org/wiki/BMW_Sauber  
2  http://en.wikipedia.org/wiki/Williams_Grand_Pr...  
3  http://en.wikipedia.org/wiki/Renault_in_Formul...  
4   http://en.wikipedia.org/wiki/Scuderia_Toro_Rosso  

Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 212 entries, 0 to 211
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   constructorId   212 non-null    int64 
 1   constructorRef  212 non-null    object
 2  

In [9]:
import pandas as pd

# 1. Load the dataset from the DPL_Datasets directory.
# Forward slashes keep the path valid on both Windows and POSIX systems.
file_path = 'DPL_Datasets/driver_standings.csv'
df = pd.read_csv(file_path)

# 2. Inspect the dataset.
# DataFrame.info() prints its report itself and returns None, so it is
# called directly rather than through print().
print("Initial Data Head:")
print(df.head())
print("\nData Info:")
df.info()
print("\nMissing Values:")
print(df.isnull().sum())

# 3. Convert columns to appropriate data types; unparseable values become NaN.
numeric_cols = ['driverStandingsId', 'raceId', 'driverId', 'points', 'position', 'wins']
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# 4. Clean string columns: remove extra whitespace in 'positionText'.
df['positionText'] = df['positionText'].str.strip()

# 5. Remove duplicate rows if any.
initial_count = df.shape[0]
df = df.drop_duplicates()
final_count = df.shape[0]
print(f"\nRemoved {initial_count - final_count} duplicate rows.")

# 6. Final inspection after cleaning.
print("\nCleaned Data Head:")
print(df.head())
print("\nCleaned Data Info:")
df.info()
print("\nMissing Values After Cleaning:")
print(df.isnull().sum())

# 7. Save the cleaned dataset to the Cleaned_Dataset directory.
output_path = 'Cleaned_Dataset/driver_standings_clean.csv'
df.to_csv(output_path, index=False)
print(f"\nCleaned dataset saved to {output_path}")


Initial Data Head:
   driverStandingsId  raceId  driverId  points  position positionText  wins
0                  1      18         1    10.0         1            1     1
1                  2      18         2     8.0         2            2     0
2                  3      18         3     6.0         3            3     0
3                  4      18         4     5.0         4            4     0
4                  5      18         5     4.0         5            5     0

Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34863 entries, 0 to 34862
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   driverStandingsId  34863 non-null  int64  
 1   raceId             34863 non-null  int64  
 2   driverId           34863 non-null  int64  
 3   points             34863 non-null  float64
 4   position           34863 non-null  int64  
 5   positionText       34863 non-null  object 
 6   wins            

In [10]:
import pandas as pd

# 1. Load the dataset, treating "\N" as a missing value.
# Forward slashes keep the path valid on both Windows and POSIX systems.
file_path = 'DPL_Datasets/drivers.csv'
df = pd.read_csv(file_path, na_values=["\\N"])

# 2. Inspect the initial data.
# DataFrame.info() prints its report itself and returns None, so it is
# called directly rather than through print().
print("Initial Data Head:")
print(df.head())
print("\nData Info:")
df.info()

# 3. Convert columns to proper data types.
# 'driverId' and 'number' become numeric; unparseable values become NaN.
numeric_cols = ['driverId', 'number']
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# 'dob' becomes datetime; unparseable values become NaT.
df['dob'] = pd.to_datetime(df['dob'], errors='coerce')

# 4. Clean string columns by stripping extra whitespace (missing values stay missing).
str_cols = ['driverRef', 'code', 'forename', 'surname', 'nationality', 'url']
for col in str_cols:
    df[col] = df[col].str.strip()

# 5. Remove duplicate rows (if any).
initial_count = df.shape[0]
df = df.drop_duplicates()
final_count = df.shape[0]
print(f"\nRemoved {initial_count - final_count} duplicate rows.")

# 6. Final inspection after cleaning.
print("\nCleaned Data Head:")
print(df.head())
print("\nCleaned Data Info:")
df.info()

# 7. Save the cleaned dataset into the Cleaned_Dataset directory.
output_path = 'Cleaned_Dataset/drivers_clean.csv'
df.to_csv(output_path, index=False)
print(f"\nCleaned dataset saved to {output_path}")


Initial Data Head:
   driverId   driverRef  number code  forename     surname         dob  \
0         1    hamilton    44.0  HAM     Lewis    Hamilton  1985-01-07   
1         2    heidfeld     NaN  HEI      Nick    Heidfeld  1977-05-10   
2         3     rosberg     6.0  ROS      Nico     Rosberg  1985-06-27   
3         4      alonso    14.0  ALO  Fernando      Alonso  1981-07-29   
4         5  kovalainen     NaN  KOV    Heikki  Kovalainen  1981-10-19   

  nationality                                             url  
0     British     http://en.wikipedia.org/wiki/Lewis_Hamilton  
1      German      http://en.wikipedia.org/wiki/Nick_Heidfeld  
2      German       http://en.wikipedia.org/wiki/Nico_Rosberg  
3     Spanish    http://en.wikipedia.org/wiki/Fernando_Alonso  
4     Finnish  http://en.wikipedia.org/wiki/Heikki_Kovalainen  

Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 861 entries, 0 to 860
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dt

In [11]:
import pandas as pd

# 1. Load the dataset from the DPL_Datasets directory.
# Forward slashes keep the path valid on both Windows and POSIX systems.
file_path = 'DPL_Datasets/lap_times.csv'
df = pd.read_csv(file_path)

# 2. Inspect the initial data.
# DataFrame.info() prints its report itself and returns None, so it is
# called directly rather than through print().
print("Initial Data Head:")
print(df.head())
print("\nData Info:")
df.info()
print("\nMissing Values:")
print(df.isnull().sum())

# 3. Convert numeric columns to proper types; unparseable values become NaN.
numeric_cols = ['raceId', 'driverId', 'lap', 'position', 'milliseconds']
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# 4. Convert the 'time' column into a timedelta.
# pd.to_timedelta expects "h:mm:ss.xxx", so "m:ss.xxx" values get a "0:" hour
# prefix. The prefix is added only when exactly one colon is present: a value
# that already carries an hour field would otherwise become unparseable and be
# silently coerced to NaT.
lap_time_text = df['time'].str.strip()
single_colon = lap_time_text.str.count(':').eq(1).fillna(False)
normalized_time = lap_time_text.where(~single_colon, "0:" + lap_time_text)
df['time_timedelta'] = pd.to_timedelta(normalized_time, errors='coerce')

# Optional: total seconds as a float for easier numerical analysis.
df['time_seconds'] = df['time_timedelta'].dt.total_seconds()

# 5. Remove duplicate rows if any.
initial_count = df.shape[0]
df = df.drop_duplicates()
final_count = df.shape[0]
print(f"\nRemoved {initial_count - final_count} duplicate rows.")

# 6. Final inspection after cleaning.
print("\nCleaned Data Head:")
print(df.head())
print("\nCleaned Data Info:")
df.info()
print("\nMissing Values After Cleaning:")
print(df.isnull().sum())

# 7. Save the cleaned dataset into the Cleaned_Dataset directory.
output_path = 'Cleaned_Dataset/lap_times_clean.csv'
df.to_csv(output_path, index=False)
print(f"\nCleaned dataset saved to {output_path}")


Initial Data Head:
   raceId  driverId  lap  position      time  milliseconds
0     841        20    1         1  1:38.109         98109
1     841        20    2         1  1:33.006         93006
2     841        20    3         1  1:32.713         92713
3     841        20    4         1  1:32.803         92803
4     841        20    5         1  1:32.342         92342

Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 589081 entries, 0 to 589080
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   raceId        589081 non-null  int64 
 1   driverId      589081 non-null  int64 
 2   lap           589081 non-null  int64 
 3   position      589081 non-null  int64 
 4   time          589081 non-null  object
 5   milliseconds  589081 non-null  int64 
dtypes: int64(5), object(1)
memory usage: 27.0+ MB
None

Missing Values:
raceId          0
driverId        0
lap             0
position        0
time            

In [12]:
import pandas as pd

# 1. Load the dataset from the DPL_Datasets directory.
# Forward slashes keep the path valid on both Windows and POSIX systems.
file_path = 'DPL_Datasets/pit_stops.csv'
df = pd.read_csv(file_path)

# 2. Inspect the initial data.
# DataFrame.info() prints its report itself and returns None, so it is
# called directly rather than through print().
print("Initial Data Head:")
print(df.head())
print("\nData Info:")
df.info()
print("\nMissing Values:")
print(df.isnull().sum())

# 3. Convert numeric columns to proper types; unparseable values become NaN.
numeric_cols = ['raceId', 'driverId', 'stop', 'lap', 'milliseconds']
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# 4. Convert 'duration' to a float number of seconds.
# The column is loaded as text (object dtype), which means not every value is a
# plain number such as "26.898"; pd.to_numeric alone would silently turn the
# rest into NaN. Plain numbers are parsed directly, and clock-style values
# (presumably "m:ss.xxx" for stops longer than a minute - verify against the
# raw file) are parsed through pd.to_timedelta.
duration_text = df['duration'].astype(str).str.strip()
duration_seconds = pd.to_numeric(duration_text, errors='coerce')
clock_style = duration_text.str.contains(':', regex=False)
if clock_style.any():
    # pd.to_timedelta expects "h:mm:ss.xxx", so pad "m:ss.xxx" with a zero hour.
    padded_duration = duration_text[clock_style].map(
        lambda text: text if text.count(':') >= 2 else "0:" + text
    )
    duration_seconds.loc[clock_style] = (
        pd.to_timedelta(padded_duration, errors='coerce').dt.total_seconds()
    )
df['duration'] = duration_seconds

# 5. Convert the 'time' column to time-of-day objects.
# Values are parsed as "HH:MM:SS"; only the time component is kept, and
# unparseable values become missing.
df['time'] = pd.to_datetime(df['time'], format='%H:%M:%S', errors='coerce').dt.time

# 6. Remove duplicate rows if any.
initial_count = df.shape[0]
df = df.drop_duplicates()
final_count = df.shape[0]
print(f"\nRemoved {initial_count - final_count} duplicate rows.")

# 7. Final inspection after cleaning.
print("\nCleaned Data Head:")
print(df.head())
print("\nCleaned Data Info:")
df.info()
print("\nMissing Values After Cleaning:")
print(df.isnull().sum())

# 8. Save the cleaned dataset into the Cleaned_Dataset directory.
output_path = 'Cleaned_Dataset/pit_stops_clean.csv'
df.to_csv(output_path, index=False)
print(f"\nCleaned dataset saved to {output_path}")


Initial Data Head:
   raceId  driverId  stop  lap      time duration  milliseconds
0     841       153     1    1  17:05:23   26.898         26898
1     841        30     1    1  17:05:52   25.021         25021
2     841        17     1   11  17:20:48   23.426         23426
3     841         4     1   12  17:22:34   23.251         23251
4     841        13     1   13  17:24:10   23.842         23842

Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11371 entries, 0 to 11370
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   raceId        11371 non-null  int64 
 1   driverId      11371 non-null  int64 
 2   stop          11371 non-null  int64 
 3   lap           11371 non-null  int64 
 4   time          11371 non-null  object
 5   duration      11371 non-null  object
 6   milliseconds  11371 non-null  int64 
dtypes: int64(5), object(2)
memory usage: 622.0+ KB
None

Missing Values:
raceId          0
driverI

In [13]:
import pandas as pd

# 1. Load the dataset, treating "\N" as a missing value.
# Forward slashes keep the path valid on both Windows and POSIX systems.
file_path = 'DPL_Datasets/qualifying.csv'
df = pd.read_csv(file_path, na_values=["\\N"])

# 2. Inspect the initial data.
# DataFrame.info() prints its report itself and returns None, so it is
# called directly rather than through print().
print("Initial Data Head:")
print(df.head())
print("\nData Info:")
df.info()
print("\nMissing Values:")
print(df.isnull().sum())

# 3. Convert key columns to proper numeric types; unparseable values become NaN.
numeric_cols = ['qualifyId', 'raceId', 'driverId', 'constructorId', 'number', 'position']
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# 4. Convert qualifying time columns to timedelta and seconds.
# The qualifying times are in "m:ss.xxx" format.
for col in ['q1', 'q2', 'q3']:
    # Prepend "0:" so that "1:26.572" becomes "0:1:26.572" (read as h:mm:ss.xxx).
    # Missing values turn into the text "0:nan", which errors='coerce' maps to NaT.
    df[col + '_timedelta'] = pd.to_timedelta("0:" + df[col].astype(str), errors='coerce')
    # Total seconds as a float for numerical analysis.
    df[col + '_seconds'] = df[col + '_timedelta'].dt.total_seconds()

# 5. Remove duplicate rows if any.
initial_count = df.shape[0]
df = df.drop_duplicates()
final_count = df.shape[0]
print(f"\nRemoved {initial_count - final_count} duplicate rows.")

# 6. Final inspection after cleaning.
print("\nCleaned Data Head:")
print(df.head())
print("\nCleaned Data Info:")
df.info()
print("\nMissing Values After Cleaning:")
print(df.isnull().sum())

# 7. Save the cleaned dataset into the Cleaned_Dataset directory.
output_path = 'Cleaned_Dataset/qualifying_clean.csv'
df.to_csv(output_path, index=False)
print(f"\nCleaned dataset saved to {output_path}")


Initial Data Head:
   qualifyId  raceId  driverId  constructorId  number  position        q1  \
0          1      18         1              1      22         1  1:26.572   
1          2      18         9              2       4         2  1:26.103   
2          3      18         5              1      23         3  1:25.664   
3          4      18        13              6       2         4  1:25.994   
4          5      18         2              2       3         5  1:25.960   

         q2        q3  
0  1:25.187  1:26.714  
1  1:25.315  1:26.869  
2  1:25.452  1:27.079  
3  1:25.691  1:27.178  
4  1:25.518  1:27.236  

Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10494 entries, 0 to 10493
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   qualifyId      10494 non-null  int64 
 1   raceId         10494 non-null  int64 
 2   driverId       10494 non-null  int64 
 3   constructorId  10494 non-null  int

In [14]:
import pandas as pd

# 1. Load the dataset, treating "\N" as missing values.
# Forward slashes keep the path valid on both Windows and POSIX systems.
file_path = 'DPL_Datasets/races.csv'
df = pd.read_csv(file_path, na_values=["\\N"])

# 2. Inspect the initial data.
# DataFrame.info() prints its report itself and returns None, so it is
# called directly rather than through print().
print("Initial Data Head:")
print(df.head())
print("\nData Info:")
df.info()
print("\nMissing Values:")
print(df.isnull().sum())

# 3. Convert date columns to datetime; unparseable or missing values become NaT.
date_columns = ['date', 'fp1_date', 'fp2_date', 'fp3_date', 'quali_date', 'sprint_date']
for col in date_columns:
    df[col] = pd.to_datetime(df[col], errors='coerce')

# 4. Convert time columns to time-of-day objects.
# Values are parsed as "HH:MM:SS" and only the time component is kept.
time_columns = ['time', 'fp1_time', 'fp2_time', 'fp3_time', 'quali_time', 'sprint_time']
for col in time_columns:
    # Missing or unparseable values become NaT before the time part is extracted.
    df[col] = pd.to_datetime(df[col], format='%H:%M:%S', errors='coerce').dt.time

# 5. Remove duplicate rows if any.
initial_count = df.shape[0]
df = df.drop_duplicates()
final_count = df.shape[0]
print(f"\nRemoved {initial_count - final_count} duplicate rows.")

# 6. Final inspection after cleaning.
print("\nCleaned Data Head:")
print(df.head())
print("\nCleaned Data Info:")
df.info()
print("\nMissing Values After Cleaning:")
print(df.isnull().sum())

# 7. Save the cleaned dataset into the Cleaned_Dataset directory.
output_path = 'Cleaned_Dataset/races_clean.csv'
df.to_csv(output_path, index=False)
print(f"\nCleaned dataset saved to {output_path}")


Initial Data Head:
   raceId  year  round  circuitId                   name        date  \
0       1  2009      1          1  Australian Grand Prix  2009-03-29   
1       2  2009      2          2   Malaysian Grand Prix  2009-04-05   
2       3  2009      3         17     Chinese Grand Prix  2009-04-19   
3       4  2009      4          3     Bahrain Grand Prix  2009-04-26   
4       5  2009      5          4     Spanish Grand Prix  2009-05-10   

       time                                                url fp1_date  \
0  06:00:00  http://en.wikipedia.org/wiki/2009_Australian_G...      NaN   
1  09:00:00  http://en.wikipedia.org/wiki/2009_Malaysian_Gr...      NaN   
2  07:00:00  http://en.wikipedia.org/wiki/2009_Chinese_Gran...      NaN   
3  12:00:00  http://en.wikipedia.org/wiki/2009_Bahrain_Gran...      NaN   
4  12:00:00  http://en.wikipedia.org/wiki/2009_Spanish_Gran...      NaN   

  fp1_time fp2_date fp2_time fp3_date fp3_time quali_date quali_time  \
0      NaN      NaN      

In [18]:
import pandas as pd

# Helper function to parse the 'time' column.
def parse_result_time(time_str):
    """
    Parse a race-result 'time' value into a pandas Timedelta.

    Accepted inputs:
      - missing values (None/NaN)              -> pd.NaT
      - gaps with a leading '+', written as
        plain seconds ("+5.478") or clock
        style ("+1:02.345")                    -> Timedelta of the gap
      - durations as "m:ss.xxx" or
        "h:mm:ss.xxx" (e.g. "1:34:50.616")     -> Timedelta of the duration

    Anything that cannot be parsed (e.g. "+1 Lap") yields pd.NaT instead of
    raising, so the function is safe to use with Series.apply.

    Note: gap values are returned as the gap itself, not as an absolute
    finishing time.
    """
    if pd.isnull(time_str):
        return pd.NaT
    text = str(time_str).strip()
    is_gap = text.startswith('+')
    if is_gap:
        text = text[1:].strip()
    try:
        if is_gap and ':' not in text:
            # Plain-seconds gap such as "+5.478".
            return pd.Timedelta(seconds=float(text))
        # pd.to_timedelta expects "h:mm:ss.xxx"; pad "m:ss.xxx" with a zero hour.
        if text.count(':') == 1:
            text = "0:" + text
        return pd.to_timedelta(text)
    except (ValueError, TypeError, OverflowError):
        # Unparseable text is treated as missing rather than aborting the apply().
        return pd.NaT

# Helper function to parse the 'fastestLapTime' column.
def parse_fastest_lap_time(time_str):
    """
    Convert a fastest-lap string into a pandas Timedelta.

    Lap times are expected as "m:ss.xxx". A value with exactly one colon is
    given a leading "0:" hour field so that it parses as "h:mm:ss.xxx".
    Missing input, or text that cannot be parsed, yields pd.NaT.
    """
    if pd.isnull(time_str):
        return pd.NaT
    lap_text = time_str.strip()
    # Only minutes and seconds present -> add the hour component.
    has_only_minutes = lap_text.count(':') == 1
    candidate = ("0:" + lap_text) if has_only_minutes else lap_text
    try:
        parsed = pd.to_timedelta(candidate)
    except Exception:
        parsed = pd.NaT
    return parsed

# 1. Load the dataset, treating "\N" as missing values.
# Forward slashes keep the path valid on both Windows and POSIX systems.
file_path = 'DPL_Datasets/results.csv'
df = pd.read_csv(file_path, na_values=["\\N"])

# 2. Inspect the initial data.
# DataFrame.info() prints its report itself and returns None, so it is
# called directly rather than through print().
print("Initial Data Head:")
print(df.head())
print("\nData Info:")
df.info()
print("\nMissing Values:")
print(df.isnull().sum())

# 3. Convert key numeric columns to proper types; unparseable values become NaN.
numeric_cols = [
    'resultId', 'raceId', 'driverId', 'constructorId', 'number',
    'grid', 'position', 'positionOrder', 'points', 'laps',
    'milliseconds', 'fastestLap', 'rank', 'fastestLapSpeed', 'statusId'
]
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# 4. Process the 'time' column (finishing time or '+gap') into a timedelta.
df['time_timedelta'] = df['time'].apply(parse_result_time)

# 5. Process the 'fastestLapTime' column into a timedelta.
df['fastestLapTime_timedelta'] = df['fastestLapTime'].apply(parse_fastest_lap_time)

# 6. Remove duplicate rows if any.
initial_count = df.shape[0]
df = df.drop_duplicates()
final_count = df.shape[0]
print(f"\nRemoved {initial_count - final_count} duplicate rows.")

# 7. Final inspection after cleaning.
print("\nCleaned Data Head:")
print(df.head())
print("\nCleaned Data Info:")
df.info()
print("\nMissing Values After Cleaning:")
print(df.isnull().sum())

# 8. Save the cleaned dataset into the Cleaned_Dataset directory.
output_path = 'Cleaned_Dataset/results_clean.csv'
df.to_csv(output_path, index=False)
print(f"\nCleaned dataset saved to {output_path}")


Initial Data Head:
   resultId  raceId  driverId  constructorId  number  grid  position  \
0         1      18         1              1    22.0     1       1.0   
1         2      18         2              2     3.0     5       2.0   
2         3      18         3              3     7.0     7       3.0   
3         4      18         4              4     5.0    11       4.0   
4         5      18         5              1    23.0     3       5.0   

  positionText  positionOrder  points  laps         time  milliseconds  \
0            1              1    10.0    58  1:34:50.616     5690616.0   
1            2              2     8.0    58       +5.478     5696094.0   
2            3              3     6.0    58       +8.163     5698779.0   
3            4              4     5.0    58      +17.181     5707797.0   
4            5              5     4.0    58      +18.014     5708630.0   

   fastestLap  rank fastestLapTime  fastestLapSpeed  statusId  
0        39.0   2.0       1:27.452     

In [20]:
import pandas as pd

# 1. Load the dataset from the DPL_Datasets directory, treating "\N" as missing values.
# Forward slashes keep the path valid on both Windows and POSIX systems.
file_path = 'DPL_Datasets/seasons.csv'
df = pd.read_csv(file_path, na_values=["\\N"])

# 2. Inspect the initial data.
# DataFrame.info() prints its report itself and returns None, so it is
# called directly rather than through print().
print("Initial Data Head:")
print(df.head())
print("\nData Info:")
df.info()
print("\nMissing Values:")
print(df.isnull().sum())

# 3. Convert the 'year' column to numeric; unparseable values become NaN.
df['year'] = pd.to_numeric(df['year'], errors='coerce')

# 4. Clean string columns by stripping extra whitespace.
df['url'] = df['url'].str.strip()

# 5. Remove duplicate rows if any.
initial_count = df.shape[0]
df = df.drop_duplicates()
final_count = df.shape[0]
print(f"\nRemoved {initial_count - final_count} duplicate rows.")

# 6. Final inspection after cleaning.
print("\nCleaned Data Head:")
print(df.head())
print("\nCleaned Data Info:")
df.info()
print("\nMissing Values After Cleaning:")
print(df.isnull().sum())

# 7. Save the cleaned dataset to the Cleaned_Dataset directory.
output_path = 'Cleaned_Dataset/seasons_clean.csv'
df.to_csv(output_path, index=False)
print(f"\nCleaned dataset saved to {output_path}")


Initial Data Head:
   year                                                url
0  2009  http://en.wikipedia.org/wiki/2009_Formula_One_...
1  2008  http://en.wikipedia.org/wiki/2008_Formula_One_...
2  2007  http://en.wikipedia.org/wiki/2007_Formula_One_...
3  2006  http://en.wikipedia.org/wiki/2006_Formula_One_...
4  2005  http://en.wikipedia.org/wiki/2005_Formula_One_...

Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75 entries, 0 to 74
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   year    75 non-null     int64 
 1   url     75 non-null     object
dtypes: int64(1), object(1)
memory usage: 1.3+ KB
None

Missing Values:
year    0
url     0
dtype: int64

Removed 0 duplicate rows.

Cleaned Data Head:
   year                                                url
0  2009  http://en.wikipedia.org/wiki/2009_Formula_One_...
1  2008  http://en.wikipedia.org/wiki/2008_Formula_One_...
2  2007  http://en.wikipedia.org/wiki/20

In [21]:
import pandas as pd

# Helper function to parse the 'time' column.
def parse_result_time(time_str):
    """
    Parse a sprint-result 'time' value into a pandas Timedelta.

    Accepted inputs:
      - missing values (None/NaN)              -> pd.NaT
      - gaps with a leading '+', written as
        plain seconds ("+1.430") or clock
        style ("+1:02.345")                    -> Timedelta of the gap
      - durations as "m:ss.xxx" (e.g.
        "25:38.426") or "h:mm:ss.xxx"          -> Timedelta of the duration

    A value with exactly one colon gets a leading "0:" so that it is read as
    "0:m:ss.xxx". Anything that cannot be parsed yields pd.NaT instead of
    raising, so the function is safe to use with Series.apply.

    Note: gap values are returned as the gap itself, not as an absolute
    finishing time.
    """
    if pd.isnull(time_str):
        return pd.NaT
    text = str(time_str).strip()
    is_gap = text.startswith('+')
    if is_gap:
        text = text[1:].strip()
    try:
        if is_gap and ':' not in text:
            # Plain-seconds gap such as "+1.430".
            return pd.Timedelta(seconds=float(text))
        # pd.to_timedelta expects "h:mm:ss.xxx"; pad "m:ss.xxx" with a zero hour.
        if text.count(':') == 1:
            text = "0:" + text
        return pd.to_timedelta(text)
    except (ValueError, TypeError, OverflowError):
        # Unparseable text is treated as missing rather than aborting the apply().
        return pd.NaT

# Helper function to parse the 'fastestLapTime' column.
def parse_fastest_lap_time(time_str):
    """
    Convert a fastest-lap string such as "1:30.013" into a pandas Timedelta.

    The value is expected in "m:ss.xxx" form. A value with exactly one colon
    is prefixed with "0:" so that it is read as "0:m:ss.xxx".

    Returns pd.NaT when the value is missing or cannot be converted.
    """
    if pd.isnull(time_str):
        return pd.NaT

    cleaned = time_str.strip()
    # Supply the hours field when only minutes and seconds are present.
    candidate = "0:" + cleaned if cleaned.count(':') == 1 else cleaned

    try:
        parsed = pd.to_timedelta(candidate)
    except Exception:
        parsed = pd.NaT
    return parsed

# 1. Load the dataset from the DPL_Datasets directory, treating "\N" as missing values.
# Forward slashes are used because a backslash is a path separator only on
# Windows; this also matches the path style of the circuits cell.
file_path = 'DPL_Datasets/sprint_results.csv'
df = pd.read_csv(file_path, na_values=["\\N"])

# 2. Inspect the initial data.
print("Initial Data Head:")
print(df.head())
print("\nData Info:")
# info() writes its report itself and returns None, so wrapping it in print()
# would add a stray "None" line to the output.
df.info()
print("\nMissing Values:")
print(df.isnull().sum())

# 3. Convert key numeric columns.
# errors='coerce' turns any value that is not a number into NaN instead of raising.
numeric_cols = [
    'resultId', 'raceId', 'driverId', 'constructorId', 'number', 'grid',
    'position', 'positionOrder', 'points', 'laps', 'milliseconds',
    'fastestLap', 'statusId'
]
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# 4. Process the 'time' column.
# The 'time' column may be a full race time (e.g., "25:38.426") or a gap (e.g., "+1.430").
# The parsed value goes into a new column; the original string column is kept.
df['time_timedelta'] = df['time'].apply(parse_result_time)

# 5. Process the 'fastestLapTime' column (again into a new column).
df['fastestLapTime_timedelta'] = df['fastestLapTime'].apply(parse_fastest_lap_time)

# 6. Remove duplicate rows if any.
initial_count = df.shape[0]
df = df.drop_duplicates()
final_count = df.shape[0]
print(f"\nRemoved {initial_count - final_count} duplicate rows.")

# 7. Final inspection after cleaning.
print("\nCleaned Data Head:")
print(df.head())
print("\nCleaned Data Info:")
df.info()
print("\nMissing Values After Cleaning:")
print(df.isnull().sum())

# 8. Save the cleaned dataset into the Cleaned_Dataset directory.
# The directory is expected to exist already; to_csv does not create it.
output_path = 'Cleaned_Dataset/sprint_results_clean.csv'
df.to_csv(output_path, index=False)
print(f"\nCleaned dataset saved to {output_path}")


Initial Data Head:
   resultId  raceId  driverId  constructorId  number  grid  position  \
0         1    1061       830              9      33     2       1.0   
1         2    1061         1            131      44     1       2.0   
2         3    1061       822            131      77     3       3.0   
3         4    1061       844              6      16     4       4.0   
4         5    1061       846              1       4     6       5.0   

  positionText  positionOrder  points  laps       time  milliseconds  \
0            1              1       3    17  25:38.426     1538426.0   
1            2              2       2    17     +1.430     1539856.0   
2            3              3       1    17     +7.502     1545928.0   
3            4              4       0    17    +11.278     1549704.0   
4            5              5       0    17    +24.111     1562537.0   

   fastestLap fastestLapTime  statusId  
0        14.0       1:30.013         1  
1        17.0       1:29.937     

In [22]:
import pandas as pd

# 1. Load the dataset, treating any "\N" as missing values (if needed).
# Forward slashes are used because a backslash is a path separator only on
# Windows; this also matches the path style of the circuits cell.
file_path = 'DPL_Datasets/status.csv'
df = pd.read_csv(file_path, na_values=["\\N"])

# 2. Inspect the initial data.
print("Initial Data Head:")
print(df.head())
print("\nData Info:")
# info() writes its report itself and returns None, so wrapping it in print()
# would add a stray "None" line to the output.
df.info()
print("\nMissing Values:")
print(df.isnull().sum())

# 3. Convert 'statusId' to numeric.
# errors='coerce' turns any value that is not a number into NaN instead of raising.
df['statusId'] = pd.to_numeric(df['statusId'], errors='coerce')

# 4. Clean string columns: Remove extra whitespace in 'status'.
df['status'] = df['status'].str.strip()

# 5. Remove duplicate rows if any.
initial_count = df.shape[0]
df = df.drop_duplicates()
final_count = df.shape[0]
print(f"\nRemoved {initial_count - final_count} duplicate rows.")

# 6. Final inspection after cleaning.
print("\nCleaned Data Head:")
print(df.head())
print("\nCleaned Data Info:")
df.info()
print("\nMissing Values After Cleaning:")
print(df.isnull().sum())

# 7. Save the cleaned dataset to the Cleaned_Dataset directory.
# The directory is expected to exist already; to_csv does not create it.
output_path = 'Cleaned_Dataset/status_clean.csv'
df.to_csv(output_path, index=False)
print(f"\nCleaned dataset saved to {output_path}")


Initial Data Head:
   statusId        status
0         1      Finished
1         2  Disqualified
2         3      Accident
3         4     Collision
4         5        Engine

Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139 entries, 0 to 138
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   statusId  139 non-null    int64 
 1   status    139 non-null    object
dtypes: int64(1), object(1)
memory usage: 2.3+ KB
None

Missing Values:
statusId    0
status      0
dtype: int64

Removed 0 duplicate rows.

Cleaned Data Head:
   statusId        status
0         1      Finished
1         2  Disqualified
2         3      Accident
3         4     Collision
4         5        Engine

Cleaned Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139 entries, 0 to 138
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   statusId  139 non-null    int64 
 1 