# Exercise 02: Preprocessing

In [1]:
import pandas as pd
import numpy as np

In [2]:
# 1. Load data
# Read the raw CSV and promote its 'ID' column to the row index.
df = pd.read_csv(filepath_or_buffer='../data/auto.csv', index_col='ID')
df.head(n=5)

Unnamed: 0_level_0,CarNumber,Make_n_Model,Fines,Refund,Bio
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,E714KP21RUS,BMW 320,3194.0,1.0,
2,E911XK42RUS,Volkswagen Passat,1438.0,10.0,
3,E893AK16RUS,Skoda Octavia,,6.0,
4,P807TT161RUS,,1552.0,5.0,
5,K303XH61RUS,Skoda Octavia,3700.0,7.0,


In [3]:
# 2. Count observations
# df.shape[0] is the row count of the frame at this point, i.e. before the
# de-duplication and missing-value handling performed in the following cells.
print(f"Original count: {df.shape[0]}")

Original count: 1050


In [4]:
# 3. Drop duplicates
# Two rows count as duplicates when CarNumber, Make_n_Model and Fines all match;
# within each duplicate group only the last occurrence is kept.
df = df[~df.duplicated(subset=['CarNumber', 'Make_n_Model', 'Fines'], keep='last')]
print(f"Count after dropping duplicates: {df.shape[0]}")

Count after dropping duplicates: 997


In [5]:
# 4. Missing values
print("Missing values per column:")
print(df.isna().sum())

# Drop cols with > 500 missing.
# dropna's `thresh` is the minimum number of non-null entries a column needs in
# order to survive, so "at most 500 missing" translates to "rows - 500 present".
thresh = len(df) - 500
df = df.dropna(axis='columns', thresh=thresh)
print("\nMissing values after dropping cols with > 500 missing:")
print(df.isna().sum())

Missing values per column:
CarNumber         0
Make_n_Model     92
Fines           113
Refund           86
Bio             997
dtype: int64

Missing values after dropping cols with > 500 missing:
CarNumber         0
Make_n_Model     92
Fines           113
Refund           86
dtype: int64


In [6]:
# Replace missing Refund with previous value
# (forward fill: a gap takes the value of the closest preceding row).
df = df.assign(Refund=df['Refund'].ffill())
print("\nMissing Refund after fill:")
print(df['Refund'].isna().sum())

# Replace missing Fines with mean
# (Series.mean skips NaN by default, so the mean is over the known fines only).
mean_fines = df['Fines'].mean()
df = df.assign(Fines=df['Fines'].fillna(mean_fines))
print("\nMissing Fines after fill:")
print(df['Fines'].isna().sum())


Missing Refund after fill:
0

Missing Fines after fill:
0


In [7]:
# 5. Split Make and Model
def split_make_model(val):
    """Split a combined "Make Model" string into its make and model parts.

    The first whitespace-delimited word is taken as the make; everything after
    it (which may itself contain spaces) is the model. Surrounding whitespace
    is ignored and any run of whitespace between make and model counts as a
    single separator, so values such as " BMW  320 " are parsed cleanly instead
    of yielding an empty make or a model with stray spaces.

    Parameters
    ----------
    val : str or missing value
        Raw cell value from the combined make/model column.

    Returns
    -------
    pandas.Series
        Two elements, ``[make, model]``. Both are NaN when ``val`` is missing
        or contains only whitespace; ``model`` is NaN when ``val`` holds a
        single word.
    """
    if pd.isna(val):
        return pd.Series([np.nan, np.nan])

    # strip() first: split(None, 1) drops leading whitespace but would keep
    # trailing whitespace attached to the final (model) piece.
    parts = val.strip().split(None, 1)

    if not parts:
        # Empty or whitespace-only string: treat it like a missing value.
        return pd.Series([np.nan, np.nan])
    if len(parts) == 2:
        return pd.Series([parts[0], parts[1]])
    return pd.Series([parts[0], np.nan])

# Run the splitter on every Make_n_Model value and store its two results in the
# new Make and Model columns; the combined source column is then discarded.
df[['Make', 'Model']] = df['Make_n_Model'].apply(split_make_model)
df = df.drop('Make_n_Model', axis='columns')
df.head(n=5)

Unnamed: 0_level_0,CarNumber,Fines,Refund,Make,Model
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,E714KP21RUS,3194.0,1.0,BMW,320
2,E911XK42RUS,1438.0,10.0,Volkswagen,Passat
3,E893AK16RUS,2717.885747,6.0,Skoda,Octavia
4,P807TT161RUS,1552.0,5.0,,
5,K303XH61RUS,3700.0,7.0,Skoda,Octavia


In [8]:
# Save to JSON
# orient='records' writes a JSON array with one object per row.
df.to_json(path_or_buf='auto.json', orient='records')