In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

data = pd.read_csv('Automobile.csv')

data.head()

Unnamed: 0,name,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,chevrolet chevelle malibu,18.0,8,307.0,130.0,3504,12.0,70,usa
1,buick skylark 320,15.0,8,350.0,165.0,3693,11.5,70,usa
2,plymouth satellite,18.0,8,318.0,150.0,3436,11.0,70,usa
3,amc rebel sst,16.0,8,304.0,150.0,3433,12.0,70,usa
4,ford torino,17.0,8,302.0,140.0,3449,10.5,70,usa


# 1. Data Cleaning
### Mengganti value yang hilang di kolom horsepower dengan median

In [27]:
data['horsepower'].fillna(data['horsepower'].median(), inplace=True)

### Handling duplicate value

In [28]:
print(data.duplicated().sum())
data = data.drop_duplicates()

0


### Handling outlier

In [30]:
kolom_num = data.select_dtypes(include=[np.number]) # Memilih kolom numerik

Q1 = kolom_num.quantile(0.25)# Menggunakan IQR untuk mendeteksi outlier
Q3 = kolom_num.quantile(0.75)
IQR = Q3 - Q1

# Menghapus outlier
outlier_condition = (kolom_num < (Q1 - 1.5 * IQR)) | (kolom_num > (Q3 + 1.5 * IQR))
data_cleaned = data[~outlier_condition.any(axis=1)]

print(data_cleaned) # Menampilkan data yang sudah di clean

                          name   mpg  cylinders  displacement  horsepower  \
0    chevrolet chevelle malibu  18.0          8         307.0       130.0   
1            buick skylark 320  15.0          8         350.0       165.0   
2           plymouth satellite  18.0          8         318.0       150.0   
3                amc rebel sst  16.0          8         304.0       150.0   
4                  ford torino  17.0          8         302.0       140.0   
..                         ...   ...        ...           ...         ...   
392           chevrolet camaro  27.0          4         151.0        90.0   
393            ford mustang gl  27.0          4         140.0        86.0   
395              dodge rampage  32.0          4         135.0        84.0   
396                ford ranger  28.0          4         120.0        79.0   
397                 chevy s-10  31.0          4         119.0        82.0   

     weight  acceleration  model_year origin  
0      3504          12.0   

# 2. Menampilkan data yang sudah di clean

In [31]:
kolom_num = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration'] # Menentukan kolom numerik yang kita punya

# Normalisasi menggunakan MinMaxScaler
scaler = MinMaxScaler()
data[kolom_num] = scaler.fit_transform(data[kolom_num])

print(data.head()) # Menampilkan beberapa data

                        name       mpg  cylinders  displacement  horsepower  \
0  chevrolet chevelle malibu  0.239362        1.0      0.617571    0.456522   
1          buick skylark 320  0.159574        1.0      0.728682    0.646739   
2         plymouth satellite  0.239362        1.0      0.645995    0.565217   
3              amc rebel sst  0.186170        1.0      0.609819    0.565217   
4                ford torino  0.212766        1.0      0.604651    0.510870   

     weight  acceleration  model_year origin  
0  0.536150      0.238095          70    usa  
1  0.589736      0.208333          70    usa  
2  0.516870      0.178571          70    usa  
3  0.516019      0.238095          70    usa  
4  0.520556      0.148810          70    usa  


# 3. Encoding kolom kategorikal

In [32]:
data = pd.get_dummies(data, columns=['origin'], drop_first=True) # Menggunakan One Hot Encoding untuk kolom 'origin' kita
print(data.head()) # Menampilkan beberapa data kita setelah One hot

                        name       mpg  cylinders  displacement  horsepower  \
0  chevrolet chevelle malibu  0.239362        1.0      0.617571    0.456522   
1          buick skylark 320  0.159574        1.0      0.728682    0.646739   
2         plymouth satellite  0.239362        1.0      0.645995    0.565217   
3              amc rebel sst  0.186170        1.0      0.609819    0.565217   
4                ford torino  0.212766        1.0      0.604651    0.510870   

     weight  acceleration  model_year  origin_japan  origin_usa  
0  0.536150      0.238095          70         False        True  
1  0.589736      0.208333          70         False        True  
2  0.516870      0.178571          70         False        True  
3  0.516019      0.238095          70         False        True  
4  0.520556      0.148810          70         False        True  


# 4. Feature engineering (minimal 1 feature baru)
###  fitur baru kita adalah power to weight ratio yang membandingkan horsepower dan berat mobil yang kita miliki.

In [33]:
data['power_to_weight_ratio'] = data['horsepower'] / data['weight'] # Fitur baru 'power_to_weight_ratio'
print(data[['horsepower', 'weight', 'power_to_weight_ratio']].head()) # Menampilkan datanya

   horsepower    weight  power_to_weight_ratio
0    0.456522  0.536150               0.851482
1    0.646739  0.589736               1.096658
2    0.565217  0.516870               1.093539
3    0.565217  0.516019               1.095342
4    0.510870  0.520556               0.981393


# 5. Splitting data ke dalam data training & data testing

In [34]:
# Memisahkan fitur dan targetnya
X = data.drop(columns=['name'])
y = data['mpg']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42) # Membagi data yang kita punya

# Menampilkan training
print("Data Training (X_train):")
print(X_train.head())

print("\nTarget Training (y_train):")
print(y_train.head())

# Menampilkan testing
print("\nData Testing (X_test):")
print(X_test.head())

print("\nTarget Testing (y_test):")
print(y_test.head())

Data Training (X_train):
          mpg  cylinders  displacement  horsepower    weight  acceleration  \
299  0.484043        0.2      0.188630    0.135870  0.447122      1.000000   
211  0.199468        0.6      0.258398    0.402174  0.625744      0.517857   
167  0.531915        0.2      0.074935    0.157609  0.158208      0.476190   
165  0.292553        1.0      0.501292    0.347826  0.455912      0.327381   
89   0.159574        1.0      0.645995    0.565217  0.613553      0.267857   

     model_year  origin_japan  origin_usa  power_to_weight_ratio  
299          79         False       False               0.303876  
211          76         False       False               0.642713  
167          75          True       False               0.996211  
165          75         False        True               0.762925  
89           73         False        True               0.921221  

Target Training (y_train):
299    0.484043
211    0.199468
167    0.531915
165    0.292553
89     0.159