In [17]:
import pandas as pd
import numpy as np
import re

In [11]:
# Read the CSV into the working DataFrame `df`. The file name has no directory
# part, so it is resolved against the kernel's current working directory.
df = pd.read_csv(filepath_or_buffer='turbo_az_december_2024.csv')

In [18]:
# Feature columns selected for clustering; the cells below clean each of them.
cluster_features = [
    'price_azn',
    'year',
    'mileage',
    'engine',
    'horse_power',
    'n_views',
]

# Lower-case every column label (has no effect on labels that are already
# lower case, so running this more than once is harmless).
df.columns = df.columns.str.lower()

# 2. Strip unit suffixes from the text columns and convert them to numbers
def _strip_unit_to_numeric(series, unit):
    """Return `series` as numbers with a literal unit suffix removed.

    Parameters
    ----------
    series : pandas.Series
        Column to clean. A column that is already numeric is returned
        unchanged, which keeps this cell safe to re-run after a first pass.
    unit : str
        Literal text to delete from every value (not a regular expression).

    Returns
    -------
    pandas.Series
        Numeric column. Missing values and values that still cannot be parsed
        after stripping the unit and all spaces become NaN instead of raising.
    """
    if pd.api.types.is_numeric_dtype(series):
        # Already converted by an earlier run; the .str accessor would raise.
        return series
    stripped = (
        series
        .str.replace(unit, '', regex=False)  # Remove the unit suffix
        .str.replace(' ', '', regex=False)   # Remove spaces
    )
    return pd.to_numeric(stripped, errors='coerce')


# Mileage: remove 'km'. Values stay integers when every row parses; a missing
# or malformed row becomes NaN (float column) rather than aborting the cell.
df['mileage'] = _strip_unit_to_numeric(df['mileage'], 'km')

# Engine: remove 'L'; always stored as float.
df['engine'] = _strip_unit_to_numeric(df['engine'], 'L').astype(float)

# Horse power: remove 'a.g.'; always stored as float.
df['horse_power'] = _strip_unit_to_numeric(df['horse_power'], 'a.g.').astype(float)

# Parse the remaining features as numbers in a single column-wise pass;
# any value that cannot be parsed becomes NaN instead of raising.
numeric_cols = ['price_azn', 'year', 'n_views']
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

# 3. Handle missing values
print("Missing values before handling:")
print(df[cluster_features].isna().sum())

# Strategy: drop rows that lack a critical feature (price or year).
# The explicit .copy() makes the filtered result an independent frame, so the
# column assignments below never act on a slice of the original frame (which
# can raise SettingWithCopyWarning on pandas versions without copy-on-write).
df = df.dropna(subset=['price_azn', 'year']).copy()

# For the remaining features, fill gaps with the column median, computed on
# the rows that survived the drop above. The median is robust to outliers,
# but note that imputation concentrates values at a single point.
for col in ['mileage', 'engine', 'horse_power']:
    median_val = df[col].median()
    print(f"Imputing {col} with median: {median_val}")
    df[col] = df[col].fillna(median_val)

# 4. Final verification: print four summaries of the clustering features,
# each preceded by a heading.
cleaned_view = df[cluster_features]
reports = (
    ("\nMissing values after cleaning:", cleaned_view.isna().sum()),
    ("\nData types:", cleaned_view.dtypes),
    ("\nSample cleaned data:", cleaned_view.head()),
    ("\nDescriptive statistics:", cleaned_view.describe()),
)
for heading, summary in reports:
    print(heading)
    print(summary)

Missing values before handling:
price_azn       0
year            0
mileage         0
engine         49
horse_power    49
n_views         0
dtype: int64
Imputing mileage with median: 146000.0
Imputing engine with median: 2.0
Imputing horse_power with median: 173.0

Missing values after cleaning:
price_azn      0
year           0
mileage        0
engine         0
horse_power    0
n_views        0
dtype: int64

Data types:
price_azn        int64
year             int64
mileage          int64
engine         float64
horse_power    float64
n_views          int64
dtype: object

Sample cleaned data:
   price_azn  year  mileage  engine  horse_power  n_views
0      21900  2020    98000     1.5         74.0     3654
1     110160  2017     2000     3.6        300.0      824
2      30583  2024        0     1.5        238.0     8389
3      18800  2018    77000     1.2         98.0     1371
4      23900  2016   103000     1.5        116.0     3505

Descriptive statistics:
           price_azn        

In [19]:
# Show the first five mileage values (5 is also head()'s default).
df['mileage'].head(n=5)

0     98000
1      2000
2         0
3     77000
4    103000
Name: mileage, dtype: int64

In [14]:
# Write the frame to CSV in the current working directory. With to_csv's
# default index=True, the row index is saved as the first, unnamed column.
df.to_csv(path_or_buf="cleaned_turbo_az.csv")