In [None]:
# Importing libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

# Read the raw car listings file into a DataFrame
df = pd.read_csv(filepath_or_buffer='data/cars_en.csv')
# Cell output: (row count, column count)
df.shape

In [None]:
# Preview the first rows of the freshly loaded data (head() defaults to 5 rows)
df.head()

In [None]:
# Preview the last rows of the freshly loaded data (tail() defaults to 5 rows)
df.tail()

In [None]:
# Print the column names, dtypes and non-null counts of the raw frame
df.info()

In [None]:
# Labels of the object-dtype columns (treated here as the categorical ones)
categorical_cols = df.select_dtypes(include='object').columns
# Cell output: number of distinct values in each of those columns
df.loc[:, categorical_cols].nunique()

In [None]:
# Count missing values per column in the raw frame
df.isna().sum()

## Target Selection + Cleaning
### Brand

In [None]:
# Choose the prediction target; every other column currently in df counts as a predictor
target = 'Brand'
predictors = df.columns[df.columns != target].tolist()

# Report how many predictors there are and list them
print(
    f"Number of predictors: {len(predictors)}",
    f"Predictors: {predictors}",
    sep="\n",
)

In [None]:
# Number of listings per brand (value_counts sorts most frequent first)
brand_counts = df['Brand'].value_counts()

# Bar chart of the target distribution across every brand in df
plt.bar(x=brand_counts.index, height=brand_counts.values, color='salmon')
plt.title('Distribution of Car Brands')
plt.ylabel('Number of Cars')
plt.xlabel('Brand')
plt.xticks(rotation=45)
plt.tight_layout()
plt.grid(axis='y', alpha=0.75)

# Text summary: how many brands exist and which ten are the most frequent
print(
    f"Total unique brands: {len(brand_counts)}",
    f"Top 10 brands:\n{brand_counts.head(10)}",
    sep="\n",
)

In [None]:
# Labels of the ten most frequent brands (first ten entries of brand_counts)
top_10_brands = brand_counts.index[:10].tolist()

# Keep only the rows whose Brand is one of those ten
df_filtered = df.loc[df['Brand'].isin(top_10_brands)]

# Report how much of the data survives the filter
print(
    f"Original dataset shape: {df.shape}",
    f"Filtered dataset shape: {df_filtered.shape}",
    f"Percentage of data retained: {df_filtered.shape[0] / df.shape[0] * 100:.2f}%",
    sep="\n",
)

# Continue with the filtered rows under a fresh 0..n-1 index
df = df_filtered.reset_index(drop=True)

In [None]:
# Recount listings per brand on the current df
brand_counts = df['Brand'].value_counts()

# Bar chart of the target distribution for the brands present in df
plt.bar(x=brand_counts.index, height=brand_counts.values, color='salmon')
plt.title('Distribution of Car Brands')
plt.ylabel('Number of Cars')
plt.xlabel('Brand')
plt.xticks(rotation=45)
plt.tight_layout()
plt.grid(axis='y', alpha=0.75)

## Predictor Selection + Feature Cleaning

# Remove the columns that are not kept as predictors.
# errors="ignore" skips any listed column that is not present in df.
df = df.drop(
    columns=[
        "ListingTitle",
        "District",
        "ListingID",
        "Color",
        "PaintAndPartsCondition",
        "TradeInAvailable",
        "SellerType",
        "VehicleTax(TRY)",
        "AccidentHistory",
    ],
    errors="ignore",
)

In [None]:
# Count missing values per column in the current df
df.isna().sum()

In [None]:
# Data cleaning
def _strip_number_text(series, unit):
    """Return a string column with a unit marker and '.' characters removed.

    Parameters
    ----------
    series : pandas.Series of strings
        Column holding numbers written as text.
    unit : str
        Literal marker to delete (e.g. 'TL' or 'km'); matched as plain text, not regex.

    Returns
    -------
    pandas.Series
        Text with `unit` removed, every '.' removed (presumably thousands
        separators — confirm against the data) and surrounding whitespace stripped.
    """
    return (
        series.str.replace(unit, '', regex=False)
              .str.replace('.', '', regex=False)
              .str.strip()
    )


# Price: strip the 'TL' marker and dots, then cast to int.
# astype(int) raises if any price is missing or not purely numeric after cleaning.
df['Price(TRY)'] = _strip_number_text(df['Price(TRY)'], 'TL').astype(int)

# Mileage: clean the text once and reuse it for both the row filter and the conversion.
mileage_text = _strip_number_text(df['Mileage(km)'], 'km')
# isnumeric() can yield NaN for missing entries; .eq(True) maps those to False so the
# mask is strictly boolean and row selection cannot fail on NaN.
has_numeric_mileage = mileage_text.str.isnumeric().eq(True)
# .copy() makes the filtered frame independent of the original, so the column
# assignments below do not raise SettingWithCopyWarning.
df = df[has_numeric_mileage].copy()
df['Mileage(km)'] = mileage_text[has_numeric_mileage].astype(int)

# Engine size / power: keep the first run of digits found in the text.
# expand=False returns a Series (not a one-column DataFrame); values without digits
# become <NA> in the nullable Int64 dtype.
for col in ('EngineSize(cc)', 'EnginePower(HP)'):
    first_number = df[col].str.extract(r'(\d+)', expand=False)
    df[col] = pd.to_numeric(first_number, errors='coerce').astype('Int64')

# Listing date: parse 'day month-name year' text and split it into numeric parts,
# then drop the original column.
listing_date = pd.to_datetime(df['ListingDate'], format='%d %B %Y')
df['ListingYear'] = listing_date.dt.year
df['ListingMonth'] = listing_date.dt.month
df['ListingDay'] = listing_date.dt.day
df = df.drop(columns='ListingDate')

df['Year'] = df['Year'].astype(int)

# Show the resulting dtypes and non-null counts
df.info()

In [None]:
# Label Encoding for categorical variables
categorical_cols = df.select_dtypes(include=['object']).columns

# One fitted encoder per column, keyed by column name, so integer codes can be mapped
# back later with label_encoders[col].inverse_transform(...).
label_encoders = {}
for col in categorical_cols:
    # A fresh encoder per column: re-fitting one shared instance overwrites its
    # classes_, which would leave only the last column's mapping recoverable.
    le = LabelEncoder()
    # NOTE(review): astype(str) presumably turns missing values into a literal
    # string category rather than leaving them missing — confirm this is intended.
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le
# After the loop `le` is still the encoder fitted on the last column, as before.

# NOTE: only the last expression of a cell is rendered, so this preview is not displayed.
df.head()
# Cell output: remaining missing values per column after encoding
df.isna().sum()

In [None]:
# Write the current df to data/cars_cleaned.csv; index=False omits the row index column
df.to_csv('data/cars_cleaned.csv', index=False)