In [2]:
import pandas as pd

In [None]:
# Read the source CSV into the working DataFrame `df`.
# NOTE(review): the path literal looks like a placeholder - point it at the real file before running.
df = pd.read_csv(filepath_or_buffer=r"csv file location")

In [4]:
# Print basic info about the dataframe (index range, columns, non-null counts, dtypes)
print("=== DataFrame Info ===")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() - that would append a stray "None" line.
df.info()
print("\n")

=== DataFrame Info ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1173 entries, 0 to 1172
Data columns (total 18 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   type_of_document            1172 non-null   object
 1   year                        1158 non-null   object
 2   grand_prix                  1170 non-null   object
 3   description                 1170 non-null   object
 4   session_type                461 non-null    object
 5   track                       654 non-null    object
 6   lap_number                  18 non-null     object
 7   turn_number                 101 non-null    object
 8   safety_car_or_vsc_involved  22 non-null     object
 9   penalty_given               280 non-null    object
 10  type_of_incident            430 non-null    object
 11  was_contact_made            117 non-null    object
 12  immediat_advantage_gained   5 non-null      object
 13  drivers_invovled         

In [5]:
# Preview the leading rows of the frame (head() returns the first 5 by default).
# A single print with sep="\n" emits the banner, the rows, and the trailing blank lines.
print("=== Sample Rows ===", df.head(), "\n", sep="\n")


=== Sample Rows ===
                                    type_of_document  \
0                                         Document 3   
1  {'Yuki Tsunoda': {'Brazil': 12, 'Canada': 8, '...   
2                          Technical delegate report   
3                        Technical Delegate's Report   
4                                  Stewards Decision   

                                                year  \
0                                               2022   
1  {'Oracle Red Bull': {'Brazil': 18, 'Canada': 1...   
2                                               2022   
3                                               2022   
4                                               2022   

                                          grand_prix  \
0                               Abu Dhabi Grand Prix   
1  2022 Abu Dhabi Grand Prix - Championship Point...   
2                               Abu Dhabi Grand Prix   
3                               Abu Dhabi Grand Prix   
4                         

In [6]:
# Count the null entries in every column of `df`
print("=== Missing Values Per Column ===")
# isna() is the same check as isnull(); the extended `end` reproduces the blank lines
# that a separate print("\n") would add.
print(df.isna().sum(), end="\n\n\n")


=== Missing Values Per Column ===
type_of_document                 1
year                            15
grand_prix                       3
description                      3
session_type                   712
track                          519
lap_number                    1155
turn_number                   1072
safety_car_or_vsc_involved    1151
penalty_given                  893
type_of_incident               743
was_contact_made              1056
immediat_advantage_gained     1168
drivers_invovled               370
teams_invovled                 442
rule_violated                  586
decision_notes                 117
source_file                    221
dtype: int64




In [7]:
# Profile every column: dtype, distinct-value count, a few sample values, null count
print("=== Feature Details ===")
for col in df.columns:
    # Look the column up once and reuse it for every statistic below
    column_values = df[col]
    print(f"\n--- {col} ---")
    print(f"Type: {column_values.dtype}")
    # nunique() ignores missing values by default
    num_unique = column_values.nunique()
    print(f"Unique values: {num_unique}")
    # Up to five distinct non-null values as examples
    examples = column_values.dropna().unique()[:5]
    print(f"Examples: {examples}")
    num_missing = column_values.isna().sum()
    print(f"Missing values: {num_missing}")


=== Feature Details ===

--- type_of_document ---
Type: object
Unique values: 82
Examples: ['Document 3'
 "{'Yuki Tsunoda': {'Brazil': 12, 'Canada': 8, 'China': 11, 'France': 15, 'Hungary': 10, 'Italy': 17, 'Japan': 13, 'Mexico': 14, 'Qatar': 16}, 'Guanyu Zhou': {'Brazil': 6, 'Canada': 11, 'China': 15, 'France': 10, 'Hungary': 9, 'Italy': 8, 'Japan': 13, 'Mexico': 12, 'Qatar': 16}, 'Alex Albon': {'Brazil': 4, 'Canada': 14, 'China': 10, 'France': 11, 'Hungary': 8, 'Italy': 12, 'Japan': 5, 'Mexico': 13, 'Qatar': 15}, 'Nick Latifi': {'Brazil': 2, 'Canada': 16, 'China': 16, 'France': 14, 'Hungary': 15, 'Italy': 15, 'Japan': 17, 'Mexico': 18, 'Qatar': 19}, 'Nyck de Vries': {'Brazil': 9, 'Canada': 0, 'China': 12, 'France': 17, 'Hungary': 8, 'Italy': 9, 'Japan': 10, 'Mexico': 11, 'Qatar': 18}, 'Nico Hulkenberg': {'Brazil': 17, 'Canada': 12, 'China': 0, 'France': 0, 'Hungary': 0, 'Italy': 0, 'Japan': 0, 'Mexico': 0, 'Qatar': 0}, 'Yves Menard': {'Brazil': 0, 'Canada': 0, 'China': 0, 'France': 0

In [8]:
# Optional: report the columns in which more than half of the entries are missing
print("\n=== Columns with >50% missing data ===")
# The mean of the boolean null mask is the fraction of missing cells per column
missing_ratio = df.isna().mean()
print(missing_ratio.loc[lambda ratio: ratio > 0.5])



=== Columns with >50% missing data ===
session_type                  0.606991
lap_number                    0.984655
turn_number                   0.913896
safety_car_or_vsc_involved    0.981245
penalty_given                 0.761296
type_of_incident              0.633419
was_contact_made              0.900256
immediat_advantage_gained     0.995737
dtype: float64


In [9]:
# Optional: assemble a one-row-per-column summary table and write it to CSV
summary = pd.DataFrame(
    {
        'Type': df.dtypes,                          # dtype of each column
        'Unique Values': df.nunique(),              # distinct non-null values
        'Missing Values': df.isna().sum(),          # absolute null count
        'Missing %': df.isna().mean().mul(100),     # null share as a percentage
    }
)
# The index (the column names of `df`) is written as the first CSV column
summary.to_csv(path_or_buf="feature_summary.csv")

print("\nFeature summary saved to 'feature_summary.csv'")



Feature summary saved to 'feature_summary.csv'


In [None]:
# Keep only the rows of `df` that have both a penalty and a description recorded.
# .copy() makes penalty_df an independent frame rather than a selection of `df`.
penalty_df = df[
    df['penalty_given'].notnull() &
    df['description'].notnull()
].copy()


print("\n=== Penalty DataFrame Info ===")
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() would add a stray "None" line after the report.
penalty_df.info()
print(penalty_df)
# Persist the filtered rows; index=False leaves the row index out of the file.
penalty_df.to_csv("penalty22_data.csv", index=False)


=== Penalty DataFrame Info ===
<class 'pandas.core.frame.DataFrame'>
Index: 280 entries, 9 to 1156
Data columns (total 18 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   type_of_document            280 non-null    object
 1   year                        272 non-null    object
 2   grand_prix                  278 non-null    object
 3   description                 280 non-null    object
 4   session_type                215 non-null    object
 5   track                       94 non-null     object
 6   lap_number                  10 non-null     object
 7   turn_number                 57 non-null     object
 8   safety_car_or_vsc_involved  6 non-null      object
 9   penalty_given               280 non-null    object
 10  type_of_incident            225 non-null    object
 11  was_contact_made            50 non-null     object
 12  immediat_advantage_gained   5 non-null      object
 13  drivers_invovled      

: 

In [None]:
# Read the second CSV into `df24`.
# NOTE(review): the path literal looks like a placeholder - point it at the real file before running.
df24 = pd.read_csv(filepath_or_buffer=r"file loc")

In [26]:
# Tally the null entries in each column of `df24`.
# One print call: banner, counts, then "\n" - joined by newlines so the output
# ends with the same blank lines as separate print calls would produce.
print("=== Missing Values Per Column ===", df24.isna().sum(), "\n", sep="\n")


=== Missing Values Per Column ===
type_of_document                 0
year                           104
grand_prix                       2
description                     34
session_type                   862
track                          671
lap_number                    1489
turn_number                   1387
safety_car_or_vsc_involved    1429
penalty_given                 1182
type_of_incident               956
was_contact_made              1389
immediate_advantage_gained    1498
drivers_involved               432
teams_involved                 496
rule_violated                  820
decision_notes                 238
source_file                    346
dtype: int64




In [31]:
# Keep only the rows of `df24` that have both a penalty and a description recorded.
# .copy() makes penalty_df24 an independent frame rather than a selection of `df24`.
penalty_df24 = df24[
    df24['penalty_given'].notnull() &
    df24['description'].notnull()
].copy()


print("\n=== Penalty DataFrame Info ===")
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() would add a stray "None" line after the report.
penalty_df24.info()
print(penalty_df24)
# Persist the filtered rows; index=False leaves the row index out of the file.
penalty_df24.to_csv("penalty24_data.csv", index=False)


=== Penalty DataFrame Info ===
<class 'pandas.core.frame.DataFrame'>
Index: 327 entries, 21 to 1482
Data columns (total 18 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   type_of_document            327 non-null    object
 1   year                        301 non-null    object
 2   grand_prix                  327 non-null    object
 3   description                 327 non-null    object
 4   session_type                259 non-null    object
 5   track                       95 non-null     object
 6   lap_number                  11 non-null     object
 7   turn_number                 66 non-null     object
 8   safety_car_or_vsc_involved  24 non-null     object
 9   penalty_given               327 non-null    object
 10  type_of_incident            269 non-null    object
 11  was_contact_made            53 non-null     object
 12  immediate_advantage_gained  6 non-null      object
 13  drivers_involved     

In [None]:
# Read the third CSV into `df25`.
# NOTE(review): the path literal looks like a placeholder - point it at the real file before running.
df25 = pd.read_csv(filepath_or_buffer=r"file location")

In [33]:
# Count the null entries in every column of `df25`
print("=== Missing Values Per Column ===")
# isna() is the same check as isnull(); the extended `end` reproduces the blank lines
# that a separate print("\n") would add.
print(df25.isna().sum(), end="\n\n\n")


=== Missing Values Per Column ===
type_of_document                0
year                           19
grand_prix                      3
description                    50
session_type                  426
track                         431
lap_number                    727
turn_number                   661
safety_car_or_vsc_involved    700
penalty_given                 579
type_of_incident              478
was_contact_made              681
immediate_advantage_gained    728
drivers_involved              249
teams_involved                251
rule_violated                 393
decision_notes                184
source_file                   147
dtype: int64




In [34]:
# Keep only the rows of `df25` that have both a penalty and a description recorded.
# .copy() makes penalty_df25 an independent frame rather than a selection of `df25`.
penalty_df25 = df25[
    df25['penalty_given'].notnull() &
    df25['description'].notnull()
].copy()


print("\n=== Penalty DataFrame Info ===")
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() would add a stray "None" line after the report.
penalty_df25.info()
print(penalty_df25)
# Persist the filtered rows; index=False leaves the row index out of the file.
penalty_df25.to_csv("penalty25_data.csv", index=False)


=== Penalty DataFrame Info ===
<class 'pandas.core.frame.DataFrame'>
Index: 168 entries, 2 to 732
Data columns (total 18 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   type_of_document            168 non-null    object
 1   year                        164 non-null    object
 2   grand_prix                  168 non-null    object
 3   description                 168 non-null    object
 4   session_type                133 non-null    object
 5   track                       42 non-null     object
 6   lap_number                  13 non-null     object
 7   turn_number                 44 non-null     object
 8   safety_car_or_vsc_involved  14 non-null     object
 9   penalty_given               168 non-null    object
 10  type_of_incident            142 non-null    object
 11  was_contact_made            35 non-null     object
 12  immediate_advantage_gained  15 non-null     object
 13  drivers_involved       