In [6]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

import seaborn as sns
import matplotlib.pyplot as plt

# Read the raw dataset from the CSV file sitting next to the notebook and
# preview its first five rows (5 is the default row count for head()).
data = pd.read_csv(filepath_or_buffer='datasets.csv')
data.head(n=5)


Unnamed: 0,Customer_ID,Product_ID,Transaction_ID,Purchase_Frequency,Average_Order_Value,Most_Frequent_Category,Time_Between_Purchases,Region,Churn_Probability,Lifetime_Value,Launch_Date,Peak_Sales_Date,Season,Preferred_Purchase_Times,Retention_Strategy
0,CUST_9HOS83,PROD_IK97D1,TRANS_II1DZG,17,172.57,Electronics,45,South America,0.98,2993.56,2020-03-14,2023-09-11,Winter,Afternoon,Loyalty Program
1,CUST_AJU17N,PROD_UNN7KP,TRANS_9HJF7I,10,64.89,Clothing,6,South America,0.66,983.18,2022-10-15,2023-01-02,Spring,Afternoon,Discount
2,CUST_11XNYF,PROD_0XEW2W,TRANS_OT96OM,3,120.38,Sports,23,Asia,0.6,601.9,2021-11-30,2023-04-06,Winter,Evening,Loyalty Program
3,CUST_IGH8G3,PROD_3IIAJN,TRANS_45V00G,12,70.34,Clothing,5,North America,0.78,1082.15,2022-03-20,2023-03-23,Spring,Evening,Discount
4,CUST_OK6PUM,PROD_VMIWD2,TRANS_ZAK760,18,42.39,Electronics,10,North America,0.52,1467.35,2022-11-09,2023-10-28,Spring,Morning,Discount


In [8]:
features = ['Purchase_Frequency', 'Average_Order_Value', 'Churn_Probability', 'Time_Between_Purchases']
target = 'Lifetime_Value'

# Select columns without modifying `data`, so re-running this cell always
# starts from the same raw values.
X_raw = data[features]
y = data[target]

# Split BEFORE estimating any preprocessing statistics. Computing the
# imputation means and the IQR bounds on the full dataset would leak
# information from the test rows into the training data and make the
# reported test metrics optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_raw, y, test_size=0.2, random_state=42
)

# Per-feature means estimated on the training rows only; used to fill
# missing values in every split.
train_means = X_train_raw.mean()
train_filled = X_train_raw.fillna(train_means)

# Outlier bounds via the IQR rule (1.5 * IQR beyond the quartiles), computed
# on the imputed training rows only. Each of these is a Series indexed by
# feature name.
Q1 = train_filled.quantile(0.25)
Q3 = train_filled.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR


def _impute_and_clip(frame):
    """Fill missing values with the training means, then clip each feature
    to the training-derived IQR bounds. Returns a new DataFrame."""
    # axis=1 aligns the per-feature bounds with the frame's columns.
    return frame.fillna(train_means).clip(lower=lower_bound, upper=upper_bound, axis=1)


X_train = _impute_and_clip(X_train_raw)
X_test = _impute_and_clip(X_test_raw)

# Cleaned copy of the full feature matrix (using the same training-derived
# statistics) so `X` remains free of missing values and outliers.
X = _impute_and_clip(X_raw)

In [5]:
# Linear regression: fit on the training split, predict on the test split.
# fit() returns the estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Keep the scores under model-specific names so later cells can still read
# them after `mae` / `r2` are overwritten by the next model.
mae_linear = mean_absolute_error(y_test, y_pred)
r2_linear = r2_score(y_test, y_pred)
mae, r2 = mae_linear, r2_linear

print('Mean Absolute Error (MAE):', mae)
print('R-squared (R2):', r2)

Mean Absolute Error (MAE): 5.430626250866519e-13
R-squared (R2): 1.0


In [None]:
# Random forest with 120 trees and a fixed seed.
model = RandomForestRegressor(
    n_estimators=120,
    random_state=42,
)
model.fit(X_train, y_train)

# Score the held-out test split.
y_pred = model.predict(X_test)
mae, r2 = mean_absolute_error(y_test, y_pred), r2_score(y_test, y_pred)

print('Mean Absolute Error (MAE):', mae)
print('R-squared (R2):', r2)

# Model-specific copies of the scores; `mae` / `r2` get reused by other cells.
mae_rf, r2_rf = mae, r2

Mean Absolute Error (MAE): 414.61275525000013
R-squared (R2): 0.9821731219732335


In [None]:
# Single decision tree, depth capped at 9, fixed seed.
model = DecisionTreeRegressor(max_depth=9, random_state=1)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluate on the test split and keep model-specific copies of both scores.
mae_dt = mae = mean_absolute_error(y_test, y_pred)
r2_dt = r2 = r2_score(y_test, y_pred)

print('Mean Absolute Error (MAE):', mae)
print('R-squared (R2):', r2)

Mean Absolute Error (MAE): 901.3833401605414
R-squared (R2): 0.9651834467882999


In [126]:
# Gradient boosting: 2000 boosting stages of depth-4 trees, learning rate 0.1,
# fixed seed.
model = GradientBoostingRegressor(
    n_estimators=2000,
    learning_rate=0.1,
    max_depth=4,
    random_state=1,
)
model.fit(X_train, y_train)

# Predict on the test split and compute both evaluation metrics.
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Model-specific copies of the scores.
mae_gb, r2_gb = mae, r2

print('Mean Absolute Error (MAE):', mae)
print('R-squared (R2):', r2)

Mean Absolute Error (MAE): 394.28616790005844
R-squared (R2): 0.985479928315043


In [128]:
# Collect the MAE and R2 scores of every model into column-oriented data:
# one list of model names plus one list per metric, in matching order.
result_data = dict(
    Model=['Decision Tree', 'Linear Regression', 'Random Forest', 'Gradient Boosting'],
    MAE=[mae_dt, mae_linear, mae_rf, mae_gb],
    R2=[r2_dt, r2_linear, r2_rf, r2_gb],
)

# One row per model, one column per metric.
comparison_table = pd.DataFrame.from_dict(result_data)

print(comparison_table)
print('\nModel dengan nilai R2 terbesar dan MAE terkecil akan diambil sebagai final model yang akan di deploy menggunakan streamlit dan ngrok')

               Model          MAE        R2
0      Decision Tree   901.383340  0.965183
1  Linear Regression  6023.641140  0.198279
2      Random Forest   414.612755  0.982173
3  Gradient Boosting   394.286168  0.985480

Model dengan nilai R2 terbesar dan MAE terkecil akan diambil sebagai final model yang akan di deploy menggunakan streamlit dan ngrok
