# Regresi Harga Diamond

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# Import Data

In [None]:
# Read the diamonds CSV (path is relative to the notebook's working directory)
# into the DataFrame used by every later cell.
path_dataset = "diamonds.csv"
df = pd.read_csv(path_dataset)

# EDA

In [None]:
# Print the column names, non-null counts, dtypes and memory usage of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53940 non-null  float64
 1   cut      53940 non-null  object 
 2   color    53940 non-null  object 
 3   clarity  53940 non-null  object 
 4   depth    53940 non-null  float64
 5   table    53940 non-null  float64
 6   price    53940 non-null  int64  
 7   x        53940 non-null  float64
 8   y        53940 non-null  float64
 9   z        53940 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 4.1+ MB


In [None]:
# Move 'price' to the last column position: detach it from the frame, then
# re-insert it after all remaining columns (in place).
price_column = df.pop('price')
df.insert(len(df.columns), 'price', price_column)

In [None]:
# Preview the first rows to confirm 'price' is now the last column.
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43,326
1,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31,326
2,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31,327
3,0.29,Premium,I,VS2,62.4,58.0,4.2,4.23,2.63,334
4,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75,335


In [None]:
# Summary statistics for the numeric columns.
# NOTE(review): the recorded output shows a minimum of 0.0 for x, y and z —
# presumably invalid dimension measurements; confirm and consider filtering
# those rows before training.
df.describe()

Unnamed: 0,carat,depth,table,x,y,z,price
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,0.79794,61.749405,57.457184,5.731157,5.734526,3.538734,3932.799722
std,0.474011,1.432621,2.234491,1.121761,1.142135,0.705699,3989.439738
min,0.2,43.0,43.0,0.0,0.0,0.0,326.0
25%,0.4,61.0,56.0,4.71,4.72,2.91,950.0
50%,0.7,61.8,57.0,5.7,5.71,3.53,2401.0
75%,1.04,62.5,59.0,6.54,6.54,4.04,5324.25
max,5.01,79.0,95.0,10.74,58.9,31.8,18823.0


In [None]:
# Count missing values per column.
df.isnull().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
x          0
y          0
z          0
price      0
dtype: int64

In [None]:
# LabelEncoder instance for turning string categories into integer codes.
label_encoder = LabelEncoder()

In [None]:
categorical_columns = ['cut', 'color', 'clarity']

# Fit ONE LabelEncoder PER column and keep it. A single shared encoder would be
# re-fitted on every iteration, so only the last column's classes would survive
# and the label -> code mapping of the other columns could not be reproduced
# when encoding new samples for prediction.
#
# The integer codes written into `df` are the same as with a shared encoder;
# the only difference is that each fitted encoder stays available:
#   label_encoders['cut'].classes_            -> labels in code order
#   label_encoders['cut'].transform([...])    -> codes for new data
#
# NOTE: run this cell once per freshly loaded frame. Re-running it on the
# already-encoded frame would re-fit the encoders on integer codes and lose
# the original string labels.
label_encoders = {}
for column in categorical_columns:
    encoder = LabelEncoder()
    df[column] = encoder.fit_transform(df[column])
    label_encoders[column] = encoder

In [None]:
# Preview the frame again: the categorical columns now hold integer codes.
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,0.23,2,1,3,61.5,55.0,3.95,3.98,2.43,326
1,0.21,3,1,2,59.8,61.0,3.89,3.84,2.31,326
2,0.23,1,1,4,56.9,65.0,4.05,4.07,2.31,327
3,0.29,3,5,5,62.4,58.0,4.2,4.23,2.63,334
4,0.31,1,6,3,63.3,58.0,4.34,4.35,2.75,335


# Training Model

In [None]:
# Separate features and target by column position: every column except the
# last one is a feature, and the last column is the target.
feature_columns = df.columns[:-1]
target_column = df.columns[-1]

X = df[feature_columns]
y = df[target_column]

In [None]:
# Split the data into training and test sets: the test set takes 20% of the
# rows and the training set the remaining 80%. random_state is fixed so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Create the LightGBM regressor (library-default hyperparameters)
model = LGBMRegressor()
# Train the model on the training split
model.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001188 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1274
[LightGBM] [Info] Number of data points in the train set: 43152, number of used features: 9
[LightGBM] [Info] Start training from score 3939.490707


# Evaluasi Model

In [None]:
# Predict on both splits first.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Pair each split's true values with its predictions, then score every pair
# with MSE and R2 (train first, test second).
split_predictions = ((y_train, y_train_pred), (y_test, y_test_pred))
mse_train, mse_test = [
    mean_squared_error(actual, predicted) for actual, predicted in split_predictions
]
r2_train, r2_test = [
    r2_score(actual, predicted) for actual, predicted in split_predictions
]

In [None]:
# MSE is expressed in squared target units (price squared), not as a ratio, so
# it is printed raw: scaling it by 100 and appending "%" would be misleading.
print(f"MSE (Train) : {np.round(mse_train, 2)}")
print(f"MSE (Test)  : {np.round(mse_test, 2)}")

# RMSE is in the same units as the target (price), which makes the error
# magnitude directly comparable to the prices themselves.
print(f"RMSE (Train) : {np.round(np.sqrt(mse_train), 2)}")
print(f"RMSE (Test)  : {np.round(np.sqrt(mse_test), 2)}")

# R2 is a unitless score (1.0 is a perfect fit), so it is shown as a percentage.
print(f"R2 (Train) : {np.round(r2_train * 100,2)} %")
print(f"R2 (Test)  : {np.round(r2_test * 100,2)} %")

MSE (Train) : 23053945.65 %
MSE (Test)  : 28797183.63 %
R2 (Train) : 98.55 %
R2 (Test)  : 98.19 %


# Menyimpan Model

In [None]:
# Persist the fitted model to disk; the value returned by the call (the list
# of written file names) is displayed as the cell output.
joblib.dump(model, "model_diaval.model")

['model_diaval.model']

# Prediksi

In [None]:
# Single hand-written sample to run through the trained model.
df_test = pd.DataFrame(data={
    'carat': [0.62],
    'cut': ['Premium'],
    'color': ['E'],
    'clarity': ['VS2'],
    'depth': [60.00],
    'table': [59.00],
    'x': [5.58],
    'y': [5.56],
    'z': [3.34]
})

# The model was trained on LabelEncoder codes, and LabelEncoder numbers the
# classes in sorted (alphabetical) label order. The mappings below reproduce
# exactly that order; any other numbering (e.g. a quality ranking) would feed
# the model category codes it has never seen with that meaning.
category_codes = {
    'cut': {'Fair': 0, 'Good': 1, 'Ideal': 2, 'Premium': 3, 'Very Good': 4},
    'color': {'D': 0, 'E': 1, 'F': 2, 'G': 3, 'H': 4, 'I': 5, 'J': 6},
    'clarity': {'I1': 0, 'IF': 1, 'SI1': 2, 'SI2': 3, 'VS1': 4, 'VS2': 5, 'VVS1': 6, 'VVS2': 7},
}

for column, codes in category_codes.items():
    encoded = df_test[column].map(codes)
    # Series.map yields NaN for labels missing from the dict; fail loudly
    # instead of silently passing NaN to the model.
    if encoded.isnull().any():
        unknown = sorted(set(df_test.loc[encoded.isnull(), column]))
        raise ValueError(f"Unknown {column} label(s): {unknown}")
    df_test[column] = encoded.astype(int)

df_test[0:1]

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.62,1,1,4,60.0,59.0,5.58,5.56,3.34


In [None]:
# Run the model on the first row of the sample frame and display the
# predicted value as a scalar.
first_row = df_test[:1]
pred_test = model.predict(first_row)
pred_test[0]

2238.825786625577