In [1]:
import kagglehub

# Fetch the latest published version of the BMW car-sales dataset through kagglehub.
# `path` is whatever location the download call returns; later cells read files from it.
dataset_handle = "sumedh1507/bmw-car-sales-dataset"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/bmw-car-sales-dataset


In [5]:
import pandas as pd

# Load the classification CSV from the downloaded dataset directory into `df`.
#
# Every later cell depends on `df`, so a failed load must stop the notebook here.
# Printing the error and carrying on would only resurface further down as a
# confusing NameError. A missing file is re-raised with a hint about the expected
# filename; any other read error propagates unchanged.
csv_path = f"{path}/BMW_Car_Sales_Classification.csv"
try:
    df = pd.read_csv(csv_path)
except FileNotFoundError as exc:
    raise FileNotFoundError(
        f"Make sure 'BMW_Car_Sales_Classification.csv' exists in the downloaded path: {path}"
    ) from exc

# Quick look at the first rows and the overall size of the frame.
print("Dataset head:")
display(df.head())
print("\nDataset shape (rows, columns):")
print(df.shape)

Dataset head:


Unnamed: 0,Model,Year,Region,Color,Fuel_Type,Transmission,Engine_Size_L,Mileage_KM,Price_USD,Sales_Volume,Sales_Classification
0,5 Series,2016,Asia,Red,Petrol,Manual,3.5,151748,98740,8300,High
1,i8,2013,North America,Red,Hybrid,Automatic,1.6,121671,79219,3428,Low
2,5 Series,2022,North America,Blue,Petrol,Automatic,4.5,10991,113265,6994,Low
3,X3,2024,Middle East,Blue,Petrol,Automatic,1.7,27255,60971,4047,Low
4,7 Series,2020,South America,Black,Diesel,Manual,2.1,122131,49898,3080,Low



Dataset shape (rows, columns):
(50000, 11)


In [6]:
# Data-quality pass over `df`: nulls per column, fully duplicated rows, then summary stats.

# Count of null entries in each column.
missing_per_column = df.isna().sum()
print("Missing values per column:")
print(missing_per_column)

# Number of rows that are exact repeats of an earlier row.
duplicate_row_count = df.duplicated().sum()
print("\nNumber of duplicate rows:")
print(duplicate_row_count)

# describe() gives min/max/quartiles per numeric column, a first glance for outliers.
print("\nDescriptive statistics for numerical columns (check for potential outliers):")
numeric_summary = df.describe()
display(numeric_summary)

Missing values per column:
Model                   0
Year                    0
Region                  0
Color                   0
Fuel_Type               0
Transmission            0
Engine_Size_L           0
Mileage_KM              0
Price_USD               0
Sales_Volume            0
Sales_Classification    0
dtype: int64

Number of duplicate rows:
0

Descriptive statistics for numerical columns (check for potential outliers):


Unnamed: 0,Year,Engine_Size_L,Mileage_KM,Price_USD,Sales_Volume
count,50000.0,50000.0,50000.0,50000.0,50000.0
mean,2017.0157,3.24718,100307.20314,75034.6009,5067.51468
std,4.324459,1.009078,57941.509344,25998.248882,2856.767125
min,2010.0,1.5,3.0,30000.0,100.0
25%,2013.0,2.4,50178.0,52434.75,2588.0
50%,2017.0,3.2,100388.5,75011.5,5087.0
75%,2021.0,4.1,150630.25,97628.25,7537.25
max,2024.0,5.0,199996.0,119998.0,9999.0


In [7]:
from sklearn.model_selection import train_test_split

# Separate the predictors (X) from the label column (y).
target_column = 'Sales_Classification'
X = df.drop(columns=target_column)
y = df[target_column]

# Hold out 20% of the rows, stratifying on `y` so both parts keep its class proportions.
# NOTE(review): the PyCaret cells in this notebook call setup(data=df) on the full frame,
# so this manual split is not consumed by them. Confirm whether it is still needed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Report the size of each part of the split.
for split_name, split_part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"Shape of {split_name}:", split_part.shape)

Shape of X_train: (40000, 10)
Shape of X_test: (10000, 10)
Shape of y_train: (40000,)
Shape of y_test: (10000,)


In [3]:
from pycaret.classification import setup

# Register the full frame with PyCaret's classification experiment, predicting
# 'Sales_Classification'. The fixed session_id is passed so repeated runs use the
# same seed (see the PyCaret setup documentation).
clf_setup = setup(
    data=df,
    target='Sales_Classification',
    session_id=123,
)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Sales_Classification
2,Target type,Binary
3,Target mapping,"High: 0, Low: 1"
4,Original data shape,"(50000, 11)"
5,Transformed data shape,"(50000, 34)"
6,Transformed train set shape,"(35000, 34)"
7,Transformed test set shape,"(15000, 34)"
8,Numeric features,5
9,Categorical features,5


In [5]:
from pycaret.classification import setup

# Initialize the setup
# NOTE(review): this cell is a verbatim repeat of the earlier setup(...) cell in this
# notebook (same data, target and session_id), and its recorded output table is identical.
# It looks like a leftover from out-of-order execution. Confirm it can be removed so the
# notebook runs cleanly top to bottom with a single setup call.
clf_setup = setup(data=df, target='Sales_Classification', session_id=123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Sales_Classification
2,Target type,Binary
3,Target mapping,"High: 0, Low: 1"
4,Original data shape,"(50000, 11)"
5,Transformed data shape,"(50000, 34)"
6,Transformed train set shape,"(35000, 34)"
7,Transformed test set shape,"(15000, 34)"
8,Numeric features,5
9,Categorical features,5


In [4]:
from pycaret.classification import compare_models

# Compare different classification models
# The call produces the leaderboard table recorded below and its return value is kept
# in `best_model`.
# NOTE(review): the recorded leaderboard shows several estimators at 1.0 on every metric.
# Scores this high usually point to target leakage. Check whether 'Sales_Classification'
# is derived from a feature such as 'Sales_Volume' before trusting these numbers.
# NOTE(review): `best_model` is not referenced by the later cells visible in this notebook,
# which train the 'svm' estimator instead. Confirm that is intentional.
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
dt,Decision Tree Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.417
rf,Random Forest Classifier,1.0,1.0,1.0,1.0,1.0,0.9999,0.9999,2.484
ada,Ada Boost Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.425
gbc,Gradient Boosting Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.13
xgboost,Extreme Gradient Boosting,0.9999,1.0,0.9999,0.9999,0.9999,0.9997,0.9997,0.691
lightgbm,Light Gradient Boosting Machine,0.9994,1.0,0.9994,0.9994,0.9994,0.9985,0.9985,3.283
nb,Naive Bayes,0.9993,1.0,0.9993,0.9993,0.9993,0.9985,0.9985,0.548
lr,Logistic Regression,0.9987,1.0,0.9987,0.9987,0.9987,0.997,0.997,1.606
et,Extra Trees Classifier,0.9871,0.9994,0.9871,0.9871,0.987,0.9693,0.9694,3.672
svm,SVM - Linear Kernel,0.9611,0.9976,0.9611,0.964,0.96,0.9042,0.9096,1.127


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

In [6]:
from pycaret.classification import create_model, tune_model, predict_model, evaluate_model

# Train, tune, inspect and score the 'svm' estimator. Each call below renders its own
# output, so the order of the statements determines the order of the cell's output.

# Create an SVM model
# ('svm' is the id the leaderboard above labels "SVM - Linear Kernel"; the call shows a
# per-fold metrics table.)
svm_model = create_model('svm')

# Tune the SVM model (optional, but recommended for better performance)
# (the recorded log reports 10 candidates evaluated over 10 folds; `tuned_svm` is the
# object the later save_model cell persists.)
tuned_svm = tune_model(svm_model)

# Evaluate the tuned model
# (renders an interactive plot-selection widget rather than returning metrics.)
evaluate_model(tuned_svm)

# Make predictions on the test set
# (no `data` argument is passed, so this presumably scores PyCaret's own hold-out split
# from setup(). Confirm against the PyCaret docs. The result gains a
# 'prediction_label' column.)
predictions = predict_model(tuned_svm)

# Display the predictions
print("\nPredictions on the test set:")
display(predictions.head())

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9794,0.9995,0.9794,0.98,0.9792,0.9506,0.9517
1,0.9869,0.9997,0.9869,0.9871,0.9868,0.9686,0.9691
2,0.9791,0.9987,0.9791,0.9791,0.9791,0.9506,0.9507
3,0.9826,0.9996,0.9826,0.9834,0.9827,0.9595,0.9601
4,0.9654,0.9999,0.9654,0.9689,0.9659,0.9209,0.9238
5,0.9689,0.9986,0.9689,0.9701,0.9684,0.9244,0.9269
6,0.9394,0.9967,0.9394,0.944,0.9374,0.8488,0.8583
7,0.9109,0.9911,0.9109,0.921,0.9059,0.7709,0.7919
8,0.9217,0.9929,0.9217,0.9286,0.9182,0.8018,0.8166
9,0.9766,0.9993,0.9766,0.9779,0.9768,0.9457,0.9468


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9997,1.0,0.9997,0.9997,0.9997,0.9993,0.9993
1,0.9989,1.0,0.9989,0.9989,0.9989,0.9973,0.9973
2,0.9997,1.0,0.9997,0.9997,0.9997,0.9993,0.9993
3,0.998,1.0,0.998,0.998,0.998,0.9953,0.9953
4,0.9971,1.0,0.9971,0.9971,0.9971,0.9933,0.9933
5,0.9989,1.0,0.9989,0.9989,0.9989,0.9973,0.9973
6,0.9997,1.0,0.9997,0.9997,0.9997,0.9993,0.9993
7,0.9991,1.0,0.9991,0.9991,0.9991,0.998,0.998
8,0.9991,1.0,0.9991,0.9991,0.9991,0.998,0.998
9,0.9977,1.0,0.9977,0.9977,0.9977,0.9946,0.9946


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,SVM - Linear Kernel,0.9985,0,0.9985,0.9985,0.9985,0.9965,0.9965



Predictions on the test set:


Unnamed: 0,Model,Year,Region,Color,Fuel_Type,Transmission,Engine_Size_L,Mileage_KM,Price_USD,Sales_Volume,Sales_Classification,prediction_label
18916,i8,2013,South America,Grey,Petrol,Manual,4.2,128115,85133,8949,High,High
24406,7 Series,2023,North America,Blue,Diesel,Automatic,4.0,4341,116817,5893,Low,Low
33484,3 Series,2019,North America,Red,Petrol,Automatic,3.6,39213,99751,9426,High,High
41755,M3,2012,North America,Blue,Petrol,Automatic,2.1,72482,70407,3751,Low,Low
1168,X5,2023,Asia,Red,Electric,Manual,3.5,115949,31097,4379,Low,Low




In [8]:
from pycaret.classification import save_model

# Save the tuned SVM model
# The name is given without an extension; the next cell and the Streamlit app expect the
# result as 'tuned_svm_model.pkl' in the current working directory.
# The return value is deliberately left as the cell's last expression, so the notebook
# displays the saved pipeline (see the recorded output).
# NOTE(review): the saved artifact is a pickle. Only load it from a trusted source.
save_model(tuned_svm, 'tuned_svm_model')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('label_encoding',
                  TransformerWrapperWithInverse(exclude=None, include=None,
                                                transformer=LabelEncoder())),
                 ('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['Year', 'Engine_Size_L',
                                              'Mileage_KM', 'Price_USD',
                                              'Sales_Volume'],
                                     transformer=SimpleImputer(add_indicator=False,
                                                               copy=True,
                                                               fill_value=None,
                                                               keep...
                  SGDClassifier(alpha=1e-07, average=False, class_weight=None,
                                early_stopping=False, epsilon=0.1, eta0=0.5,
        

In [16]:
import os

# Work out where the pickled model ended up: save_model was called with a bare name,
# so the file is expected next to the notebook's working directory as '<name>.pkl'.
current_directory, model_filename = os.getcwd(), 'tuned_svm_model.pkl'
model_path = os.path.join(current_directory, model_filename)

print("Path to the saved model: {}".format(model_path))

Path to the saved model: /content/tuned_svm_model.pkl


In [None]:
import streamlit as st
import pandas as pd
from pycaret.classification import load_model, predict_model

# Streamlit app: collects car details (manual form or CSV upload), scores them with the
# saved PyCaret pipeline and shows the predicted sales classification.
# NOTE(review): this is a standalone Streamlit script (the comments below refer to app.py);
# it is meant to be run with `streamlit run`, not executed as a notebook cell.

# Set page title and icon
st.set_page_config(page_title="BMW Car Sales Prediction App", page_icon=":car:")

# Feature columns the model was trained on (every dataset column except the target).
# Used to validate uploaded CSV files and to pass a consistent frame to the model.
FEATURE_COLUMNS = [
    'Model', 'Year', 'Region', 'Color', 'Fuel_Type', 'Transmission',
    'Engine_Size_L', 'Mileage_KM', 'Price_USD', 'Sales_Volume',
]

# Load the trained PyCaret model
# Make sure 'tuned_svm_model.pkl' is in the same directory as your app.py file,
# or provide the full path to the model file.
# NOTE(review): Streamlit re-runs the script on every interaction, so the model is
# reloaded each time. Consider caching the load if the installed Streamlit supports it.
try:
    model = load_model('tuned_svm_model')
except Exception as e:
    st.error(f"Error loading model: {e}")
    st.stop() # Stop the app if model loading fails

st.title("Aplikasi Prediksi Penjualan Mobil BMW")
st.write("Masukkan detail mobil untuk memprediksi Klasifikasi Penjualannya (High/Low). Anda bisa memasukkan data untuk beberapa mobil sekaligus.")

# --- Input Section ---
st.header("Masukkan Data Mobil")

# Option to upload a CSV file or input data manually
input_method = st.radio("Pilih metode input data:", ("Input Manual", "Upload File CSV"))

# Stays None until a valid input frame is available; the prediction section checks it.
data_input = None

if input_method == "Input Manual":
    st.subheader("Input Data Secara Manual")
    # One widget per feature column.
    # NOTE(review): the option lists are hard-coded; verify them against the categories
    # actually present in the training data. '3 Series' was added because it appears in
    # the training data but was missing from the original list.
    model_name = st.selectbox("Model", ['5 Series', 'i8', 'X3', '7 Series', 'X5', 'M3', 'i3', 'X1', '4 Series', '6 Series', '3 Series'])
    year = st.number_input("Tahun", min_value=2010, max_value=2024, value=2020)
    region = st.selectbox("Wilayah", ['Asia', 'North America', 'Middle East', 'South America', 'Europe', 'Africa'])
    color = st.selectbox("Warna", ['Red', 'Blue', 'Black', 'Silver', 'White', 'Grey'])
    fuel_type = st.selectbox("Jenis Bahan Bakar", ['Petrol', 'Hybrid', 'Diesel', 'Electric'])
    transmission = st.selectbox("Transmisi", ['Manual', 'Automatic'])
    engine_size = st.number_input("Ukuran Mesin (L)", min_value=1.0, max_value=6.0, value=3.0, step=0.1)
    mileage = st.number_input("Jarak Tempuh (KM)", min_value=0, value=50000)
    price_usd = st.number_input("Harga (USD)", min_value=10000, value=50000)
    sales_volume = st.number_input("Volume Penjualan", min_value=100, value=5000)

    # Build a single-row frame whose columns match FEATURE_COLUMNS.
    input_data_dict = {
        'Model': [model_name],
        'Year': [year],
        'Region': [region],
        'Color': [color],
        'Fuel_Type': [fuel_type],
        'Transmission': [transmission],
        'Engine_Size_L': [engine_size],
        'Mileage_KM': [mileage],
        'Price_USD': [price_usd],
        'Sales_Volume': [sales_volume]
    }
    data_input = pd.DataFrame(input_data_dict)

elif input_method == "Upload File CSV":
    st.subheader("Upload File CSV")
    uploaded_file = st.file_uploader("Pilih file CSV", type="csv")
    if uploaded_file is not None:
        try:
            uploaded_df = pd.read_csv(uploaded_file)
        except Exception as e:
            st.error(f"Error membaca file CSV: {e}")
        else:
            # Validate before predicting so the user gets a specific message instead of
            # an opaque pipeline error.
            missing_columns = [col for col in FEATURE_COLUMNS if col not in uploaded_df.columns]
            if missing_columns:
                st.error(f"Kolom berikut tidak ditemukan di file CSV: {', '.join(missing_columns)}")
            elif uploaded_df.empty:
                st.error("File CSV tidak berisi baris data.")
            else:
                st.write("Data dari file CSV:")
                st.dataframe(uploaded_df.head())
                # Keep only the trained feature columns; extra columns (an exported
                # index, the target, ...) are not passed to the model.
                data_input = uploaded_df[FEATURE_COLUMNS]


# --- Prediction Section ---
if data_input is not None:
    if st.button("Prediksi"):
        try:
            # Score the input rows with the loaded pipeline.
            predictions = predict_model(model, data=data_input)

            # Display the predictions
            st.subheader("Hasil Prediksi")
            # Rename the prediction column for clarity
            predictions = predictions.rename(columns={'prediction_label': 'Predicted_Sales_Classification'})
            st.dataframe(predictions[['Model', 'Year', 'Predicted_Sales_Classification']])

        except Exception as e:
            st.error(f"Terjadi kesalahan saat melakukan prediksi: {e}")