In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
import pickle
import os
import shutil
import json  # Untuk menyimpan dan membaca mapping

Membaca data

In [2]:
motor= pd.read_csv('olx.csv')

**Pembersihan data**<br>
Konversi kolom year menjadi integer

In [3]:
motor['year'] = motor['year'].astype(int)

In [4]:
motor.info

<bound method DataFrame.info of     company       name  year kms_driven       price fuel_type  unnamed: 6
0     Honda      Vario  2011   5000 kms   7.500.000    bensin         NaN
1     Honda       Beat  2018  45000 kms  15.300.000    bensin         NaN
2    Yamaha        Mio  2022   5000 kms  13.300.000    bensin         NaN
3     Honda     Scoopy  2015  25000 kms  12.400.000    bensin         NaN
4     Honda     Scoopy  2015  25000 kms  12.400.000    bensin         NaN
..      ...        ...   ...        ...         ...       ...         ...
845   Smoot  Lain-lain  2020   5000 kms   2.500.000   listrik         NaN
846   Smoot  Lain-lain  2023   5000 kms  10.300.000   listrik         NaN
847   Honda     Scoopy  2018      25000    14500000    bensin         NaN
848   Honda       Revo  2020      10000    10000000    bensin         NaN
849   Honda     Scoopy  2025       1000    24000000    bensin         NaN

[850 rows x 7 columns]>

Menghapus pemisah ribuan pada Price dan mengubahnya ke integer

In [5]:
# Pastikan harga diformat dengan benar dan hapus '.0' jika ada
motor['price'] = motor['price'].astype(str)  # Pastikan tipe data string
motor['price'] = motor['price'].str.replace('.', '', regex=False)  # Hapus semua titik
motor['price'] = motor['price'].str.replace(',', '', regex=False)  # Hapus semua koma
motor['price'] = motor['price'].astype(int).astype(str)  # Konversi ke integer


Menghapus karakter non-numerik dari kolom kms_driven

In [6]:
motor['kms_driven'] = motor['kms_driven'].str.split(" ").str.get(0).str.replace(',', '')
motor = motor[motor['kms_driven'].str.isnumeric()]
motor['kms_driven'] = motor['kms_driven'].astype(int)

Menghapus nilai missing pada kolom fuel_type

In [7]:
motor = motor[~motor['fuel_type'].isna()]

**Proses nama menjadi angka berdasarkan frekuensi**

In [8]:
columns_to_encode = ['company', 'name', 'fuel_type']
mappings = {}

In [9]:
for col in columns_to_encode:
    mappings[col] = {k: i for i, k in enumerate(motor[col].value_counts().index, 1)}
    motor[col] = motor[col].map(mappings[col])

Menyimpan mapping ke file JSON

In [10]:
with open('mappings.json', 'w') as f:
    json.dump(mappings, f)

Menyimpan dataset yang sudah dibersihkan

In [11]:
motor.to_csv('olx_cleaned.csv', index=False)

**Melatih model regresi linier**<br>
Pastikan folder 'file_pkl' ada

In [12]:
if not os.path.exists('file_pkl'):
    os.makedirs('file_pkl')

Function untuk melatih model

In [13]:
def train_model(company, motor_model, fuel_type):
    # Filter data untuk kombinasi tertentu
    filtered_motor = motor[
        (motor['company'] == company) & 
        (motor['name'] == motor_model) & 
        (motor['fuel_type'] == fuel_type)
    ]

    # Pastikan data cukup untuk melatih model
    if len(filtered_motor) < 2:
        print(f"Not enough data for company: {company}, model: {motor_model}, fuel: {fuel_type}")
        return None

    # Define fitur dan target
    X = filtered_motor[['year', 'kms_driven', 'company', 'name', 'fuel_type']]
    y = filtered_motor['price']

    # Melatih model
    model = LinearRegression()
    model.fit(X, y)

    # Mendapatkan nama asli dari encoding
    company_name = [k for k, v in mappings['company'].items() if v == company][0]
    model_name = [k for k, v in mappings['name'].items() if v == motor_model][0]
    fuel_name = [k for k, v in mappings['fuel_type'].items() if v == fuel_type][0]

    # Menyimpan model
    filename = f"MotorPriceModel_{company_name}_{model_name}_{fuel_name}.pkl"
    pickle.dump(model, open(filename, 'wb'))
    shutil.move(filename, os.path.join('file_pkl', filename))
    print(f"Model saved: file_pkl/{filename}")
    return filename

Melatih model untuk setiap kombinasi unik

In [14]:
for company in motor['company'].unique():
    for motor_model in motor[motor['company'] == company]['name'].unique():
        for fuel_type in motor[
            (motor['company'] == company) & (motor['name'] == motor_model)
        ]['fuel_type'].unique():
            train_model(company, motor_model, fuel_type)

Model saved: file_pkl/MotorPriceModel_Honda_Vario_bensin.pkl
Model saved: file_pkl/MotorPriceModel_Honda_Beat_bensin.pkl
Model saved: file_pkl/MotorPriceModel_Honda_Scoopy_bensin.pkl
Model saved: file_pkl/MotorPriceModel_Honda_CBR_bensin.pkl
Model saved: file_pkl/MotorPriceModel_Honda_PCX_bensin.pkl
Model saved: file_pkl/MotorPriceModel_Honda_Supra_bensin.pkl
Not enough data for company: 1, model: 58, fuel: 1
Model saved: file_pkl/MotorPriceModel_Honda_CB 150R_bensin.pkl
Model saved: file_pkl/MotorPriceModel_Honda_Forza_bensin.pkl
Model saved: file_pkl/MotorPriceModel_Honda_CB_bensin.pkl
Model saved: file_pkl/MotorPriceModel_Honda_Sonic_bensin.pkl
Not enough data for company: 1, model: 69, fuel: 1
Not enough data for company: 1, model: 68, fuel: 1
Not enough data for company: 1, model: 67, fuel: 1
Not enough data for company: 1, model: 61, fuel: 1
Not enough data for company: 1, model: 59, fuel: 1
Model saved: file_pkl/MotorPriceModel_Honda_CRF250Rally_bensin.pkl
Model saved: file_pkl/

Model saved: file_pkl/MotorPriceModel_Yamaha_Aerox_bensin.pkl
Model saved: file_pkl/MotorPriceModel_Yamaha_Freego_bensin.pkl
Model saved: file_pkl/MotorPriceModel_Yamaha_Soul GT_bensin.pkl
Model saved: file_pkl/MotorPriceModel_Yamaha_YZF R15_bensin.pkl
Model saved: file_pkl/MotorPriceModel_Yamaha_MT 25_bensin.pkl
Model saved: file_pkl/MotorPriceModel_Yamaha_Xabre_bensin.pkl
Model saved: file_pkl/MotorPriceModel_Yamaha_Byson_bensin.pkl
Not enough data for company: 2, model: 65, fuel: 1
Model saved: file_pkl/MotorPriceModel_Yamaha_Scorpio_bensin.pkl
Model saved: file_pkl/MotorPriceModel_Yamaha_F 1 ZR_bensin.pkl
Not enough data for company: 2, model: 63, fuel: 1
Not enough data for company: 2, model: 60, fuel: 1
Model saved: file_pkl/MotorPriceModel_Yamaha_YZF R25_bensin.pkl
Model saved: file_pkl/MotorPriceModel_Yamaha_WR 250_bensin.pkl
Not enough data for company: 2, model: 54, fuel: 1
Not enough data for company: 2, model: 56, fuel: 1
Model saved: file_pkl/MotorPriceModel_Kawasaki_Ninja

Model saved: file_pkl/MotorPriceModel_Suzuki_GSX_bensin.pkl
Not enough data for company: 7, model: 71, fuel: 1
Model saved: file_pkl/MotorPriceModel_Suzuki_Shogun_bensin.pkl
Not enough data for company: 7, model: 50, fuel: 1
Not enough data for company: 18, model: 53, fuel: 1
Not enough data for company: 18, model: 52, fuel: 1
Model saved: file_pkl/MotorPriceModel_Yadea_Lain-lain_listrik.pkl
Model saved: file_pkl/MotorPriceModel_Uwinifly_Lain-lain_listrik.pkl
Model saved: file_pkl/MotorPriceModel_Alva_Lain-lain_listrik.pkl
Model saved: file_pkl/MotorPriceModel_Smoot_Lain-lain_listrik.pkl
Model saved: file_pkl/MotorPriceModel_Goda_Lain-lain_listrik.pkl
Model saved: file_pkl/MotorPriceModel_Gesits_Lain-lain_listrik.pkl
Not enough data for company: 19, model: 1, fuel: 2
Model saved: file_pkl/MotorPriceModel_Genio_Lain-lain_listrik.pkl
Model saved: file_pkl/MotorPriceModel_Exsotik_Lain-lain_listrik.pkl
Model saved: file_pkl/MotorPriceModel_Volta_Lain-lain_listrik.pkl
Not enough data for co

**Mengembalikan nama asli untuk dataset**<br>
Membaca kembali mapping dari file JSON

In [15]:
with open('mappings.json', 'r') as f:
    mappings = json.load(f)

In [16]:
for col in columns_to_encode:
    reverse_mapping = {v: k for k, v in mappings[col].items()}
    motor[col] = motor[col].map(reverse_mapping)

Menyimpan dataset yang sudah diubah kembali

In [17]:
motor.to_csv('olx_cleaned2.csv', index=False)

In [18]:
print("Proses selesai.")

Proses selesai.
