In [1]:
import pandas as pd
import numpy as np
import warnings 
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv("cars.csv")

In [3]:
df.head()

Unnamed: 0,First Name,Last Name,Country,Car Brand,Car Model,Car Color,Year of Manufacture,Credit Card Type
0,Yetty,Arghent,Indonesia,Ford,Club Wagon,Teal,1993,mastercard
1,Crystal,Bosworth,China,Cadillac,Escalade ESV,Fuscia,2007,mastercard
2,Monro,Houdhury,Indonesia,Mazda,Miata MX-5,Orange,2009,maestro
3,Bowie,Clair,China,Audi,A4,Orange,2005,instapayment
4,Myrvyn,McAllister,Czech Republic,Nissan,Maxima,Pink,1994,maestro


In [4]:
df.shape

(30000, 8)

In [5]:
df.isna().sum()

First Name             0
Last Name              0
Country                0
Car Brand              0
Car Model              0
Car Color              0
Year of Manufacture    0
Credit Card Type       0
dtype: int64

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   First Name           30000 non-null  object
 1   Last Name            30000 non-null  object
 2   Country              30000 non-null  object
 3   Car Brand            30000 non-null  object
 4   Car Model            30000 non-null  object
 5   Car Color            30000 non-null  object
 6   Year of Manufacture  30000 non-null  int64 
 7   Credit Card Type     30000 non-null  object
dtypes: int64(1), object(7)
memory usage: 1.8+ MB


In [7]:
df.columns = (
    df.columns
      .str.strip()
      .str.lower()
      .str.replace(" ", "_")
)
df.columns

Index(['first_name', 'last_name', 'country', 'car_brand', 'car_model',
       'car_color', 'year_of_manufacture', 'credit_card_type'],
      dtype='object')

In [8]:
df_ml = df.drop(columns=["first_name", "last_name"])

In [9]:
categorical_cols = df.select_dtypes(include="object").columns

for col in categorical_cols:
    df[col].fillna(df[col].mode()[0], inplace=True)

In [10]:
df["year_of_manufacture"].fillna(
    df["year_of_manufacture"].median(),
    inplace=True
)

In [11]:
df = df.drop_duplicates()

In [12]:
df = df[
    (df["year_of_manufacture"] >= 1980) &
    (df["year_of_manufacture"] <= 2025)
]

In [13]:
for col in categorical_cols:
    df[col] = df[col].str.strip().str.lower()

In [14]:
df_ml.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   country              30000 non-null  object
 1   car_brand            30000 non-null  object
 2   car_model            30000 non-null  object
 3   car_color            30000 non-null  object
 4   year_of_manufacture  30000 non-null  int64 
 5   credit_card_type     30000 non-null  object
dtypes: int64(1), object(5)
memory usage: 1.4+ MB


In [15]:
df_ml.head()

Unnamed: 0,country,car_brand,car_model,car_color,year_of_manufacture,credit_card_type
0,Indonesia,Ford,Club Wagon,Teal,1993,mastercard
1,China,Cadillac,Escalade ESV,Fuscia,2007,mastercard
2,Indonesia,Mazda,Miata MX-5,Orange,2009,maestro
3,China,Audi,A4,Orange,2005,instapayment
4,Czech Republic,Nissan,Maxima,Pink,1994,maestro


In [16]:

TARGET = "car_brand"

FEATURES = [
    "country",
    "car_model",
    "car_color",
    "year_of_manufacture",
    "credit_card_type"
]


In [17]:
# Top 5 car brands
df_ml["car_brand"].value_counts().head(5)

car_brand
Ford         2562
Chevrolet    2487
Toyota       1445
Dodge        1438
GMC          1296
Name: count, dtype: int64

In [18]:
# Country-wise car distribution
df_ml["country"].value_counts().head(10)

country
China            5465
Indonesia        3163
Russia           1671
Philippines      1636
Brazil           1196
Poland           1039
Portugal          987
France            854
Sweden            778
United States     633
Name: count, dtype: int64

In [19]:
# Year-wise distribution
df_ml["year_of_manufacture"].value_counts().sort_index()

year_of_manufacture
1909       1
1926       4
1948       5
1950       1
1953       6
        ... 
2009    1572
2010    1382
2011    1341
2012    1405
2013     242
Name: count, Length: 65, dtype: int64

In [20]:
df_ml.to_csv("cars_cleaned.csv", index=False)

In [21]:
import pandas as pd

df = pd.read_csv("cars_cleaned.csv")

In [22]:
df.head()

Unnamed: 0,country,car_brand,car_model,car_color,year_of_manufacture,credit_card_type
0,Indonesia,Ford,Club Wagon,Teal,1993,mastercard
1,China,Cadillac,Escalade ESV,Fuscia,2007,mastercard
2,Indonesia,Mazda,Miata MX-5,Orange,2009,maestro
3,China,Audi,A4,Orange,2005,instapayment
4,Czech Republic,Nissan,Maxima,Pink,1994,maestro


In [23]:
TARGET = "car_brand"

FEATURES = [
    "country",
    "car_model",
    "car_color",
    "year_of_manufacture",
    "credit_card_type"
]

X = df[FEATURES]
y = df[TARGET]


In [31]:
# Re-encode categorical features
label_encoders = {}
X_encoded = X.copy()

from sklearn.preprocessing import LabelEncoder

for col in X_encoded.select_dtypes(include="object").columns:
    le = LabelEncoder()
    X_encoded[col] = le.fit_transform(X_encoded[col])
    label_encoders[col] = le



In [32]:
# Encode target
target_encoder = LabelEncoder()
y_encoded = target_encoder.fit_transform(y)
label_encoders["car_brand"] = target_encoder

In [33]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y_encoded,
    test_size=0.2,
    random_state=42
)


In [34]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    n_jobs=-1
)

model.fit(X_train, y_train)


In [35]:
from sklearn.metrics import accuracy_score, classification_report

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.7146666666666667
              precision    recall  f1-score   support

           0       0.79      0.65      0.71       105
           1       1.00      0.56      0.71         9
           2       0.00      0.00      0.00         1
           3       0.71      0.59      0.64        49
           4       0.84      0.79      0.81       167
           5       1.00      1.00      1.00         1
           6       0.89      0.91      0.90       192
           7       0.76      0.66      0.70        38
           9       0.70      0.67      0.68       140
          10       0.83      0.76      0.79        99
          11       0.62      0.84      0.72       484
          12       0.51      0.46      0.48       106
          13       1.00      0.40      0.57         5
          14       0.00      0.00      0.00         2
          15       0.17      0.12      0.14         8
          16       0.00      0.00      0.00         2
          17       0.81      0.75      0.78       31

In [36]:
import os

os.makedirs("../model", exist_ok=True)

# Save model
with open("../model/car_brand_model.pkl", "wb") as f:
    pickle.dump(model, f)

# Save encoders
with open("../model/label_encoders.pkl", "wb") as f:
    pickle.dump(label_encoders, f)

print("✅ Model and encoders saved successfully")


✅ Model and encoders saved successfully
