## CUSTOMER CONVERSION ANALYSIS FOR ONLINE SHOPPING USING CLICKSTREAM DATA

## 1) DATA PREPROCESSING

### IMPORT REQUIRED PACKAGES

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.subplots as sp
import plotly.graph_objects as go
import plotly.figure_factory as ff
import xgboost as xgb
import warnings
warnings.filterwarnings("ignore")

from collections import Counter
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingRegressor
from sklearn.metrics import accuracy_score, classification_report, average_precision_score, precision_recall_curve, recall_score, f1_score, roc_auc_score
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler, NearMiss, TomekLinks
from imblearn.pipeline import Pipeline
from collections import Counter
from imblearn.combine import SMOTETomek
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.neural_network import MLPClassifier
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score

### 1.1) LOAD THE DATASET

In [None]:
# Read the pre-split training and test sets from CSV files in the working directory.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ("train_data.csv", "test_data.csv")
)

In [None]:
# Preview the first rows of the training data.
train_df.head()

In [None]:
# Preview the first rows of the test data.
test_df.head()

In [None]:
# (rows, columns) of the training data.
train_df.shape

In [None]:
# (rows, columns) of the test data.
test_df.shape

In [None]:
# Print the column labels of both frames so they can be compared by eye.
print("Columns in train_df:", train_df.columns)
print("Columns in test_df:", test_df.columns)

### 1.2) HANDLING MISSING VALUES

In [None]:
# Number of missing values in each column of the training data.
train_df.isnull().sum()

In [None]:
# Number of missing values in each column of the test data.
test_df.isnull().sum()

### IDENTIFY NON-NUMERIC COLUMNS

In [None]:
# Columns whose dtype is not numeric; these need encoding before modelling.
non_numeric_frame = train_df.select_dtypes(exclude=['number'])
non_numeric_cols = list(non_numeric_frame.columns)
print("Non-Numeric Columns:", non_numeric_cols)

### CONVERT CATEGORICAL COLUMNS TO NUMERICAL

In [None]:
# Integer-encode every object-typed column using codes learned from train_df.
cat_cols = list(train_df.select_dtypes(include=['object']).columns)

for col in cat_cols:
    # Codes follow the order in which values first appear in the training column.
    seen_values = train_df[col].unique()
    category_map = dict(zip(seen_values, range(len(seen_values))))

    train_df[col] = train_df[col].map(category_map)

    # Values that never occur in training have no code and are flagged as -1.
    test_codes = test_df[col].map(category_map)
    test_df[col] = test_codes.fillna(-1).astype(int)

print("✅ All categorical values successfully converted to numeric!")

In [None]:
# Report every column that is still stored as object dtype after the encoding step.
for col in train_df.columns:
    if train_df[col].dtype != 'object':
        continue
    print(f"Column '{col}' has non-numeric values:\n", train_df[col].unique(), "\n")

In [None]:
# Fallback: label-encode any column that is still object-typed.
for col in train_df.columns:
    if train_df[col].dtype != 'object':
        continue
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col].astype(str))
    # LabelEncoder.transform raises ValueError for labels it was not fitted on.
    # Look the codes up in a dict instead and flag unseen test labels as -1,
    # the same convention the mapping-based encoding cell uses.
    code_of = {label: code for code, label in enumerate(le.classes_)}
    test_df[col] = test_df[col].astype(str).map(code_of).fillna(-1).astype(int)

print("✅ Fixed remaining string columns using Label Encoding!")

In [None]:
for col in train_df.columns:
    if train_df[col].dtype == 'object': 
        unique_values = train_df[col].unique()
        print(f"Column '{col}' has non-numeric values: {unique_values[:10]}") 


In [None]:
train_df.columns

### 1.3) FEATURE ENCODING

#### IDENTIFY CATEGORICAL COLUMNS

In [None]:
# Object-typed columns that remain in the training frame at this point.
object_frame = train_df.select_dtypes(include=['object'])
cat_cols = list(object_frame.columns)
print("Categorical Columns:", cat_cols)

### APPLY LABEL ENCODING

In [None]:
# Re-encode 'page2_clothing_model' with a LabelEncoder fitted on the training data.
problematic_col = "page2_clothing_model"

le = LabelEncoder()
train_df[problematic_col] = le.fit_transform(train_df[problematic_col].astype(str))

# Build the label -> code table once. Calling le.transform([x]) for every test
# row repeats the encoder's validation and class search on each call; a dict
# lookup returns the same codes (the position of the label in le.classes_).
label_to_code = {label: code for code, label in enumerate(le.classes_)}

# Labels never seen in training have no code and are flagged as -1.
test_df[problematic_col] = (
    test_df[problematic_col].astype(str).map(label_to_code).fillna(-1).astype(int)
)

print(f"✅ Successfully handled unseen labels in '{problematic_col}'!")

### 1.4) SCALING & NORMALIZATION - APPLY MIN-MAX SCALER

In [None]:
# Scale every numeric column that exists in both frames to the [0, 1] range.
# The list is built in train_df column order: intersecting two sets of strings
# gives an ordering that can differ from one interpreter run to the next.
test_numeric_cols = set(test_df.select_dtypes(include=['number']).columns)
num_cols = [
    col_name
    for col_name in train_df.select_dtypes(include=['number']).columns
    if col_name in test_numeric_cols
]

# Fit on the training data only, then reuse the learned min/max for the test data.
scaler = MinMaxScaler()
train_df[num_cols] = scaler.fit_transform(train_df[num_cols])
test_df[num_cols] = scaler.transform(test_df[num_cols])

print("✅ Feature Scaling Applied Successfully!")


## 2) EXPLORATORY DATA ANALYSIS

### 2.1) VISUALIZATIONS

### HISTOGRAM FOR NUMERICAL FEATURES

In [None]:
# One histogram per numeric feature, laid out on a 2 x 3 grid of subplots.
colors = ['orange', 'green', 'red', 'purple', 'blue', 'cyan']
numerical_cols = ['year', 'month', 'day', 'price', 'price_2', 'page']

fig = sp.make_subplots(rows=2, cols=3, subplot_titles=numerical_cols)

for i, col in enumerate(numerical_cols):
    # Row-major position in the grid; plotly rows and columns are 1-based.
    grid_row, grid_col = divmod(i, 3)
    fig.add_trace(
        go.Histogram(
            x=train_df[col].dropna(),
            nbinsx=30,
            marker_color=colors[i],
            opacity=0.75,
        ),
        row=grid_row + 1,
        col=grid_col + 1,
    )

layout_options = dict(
    title_text="📊 Feature Distributions (Histogram)",
    height=700,
    width=1000,
    showlegend=False,
    margin=dict(l=50, r=50, t=50, b=50),
    plot_bgcolor="white",
    bargap=0.2,
)
fig.update_layout(**layout_options)

fig.show()

### BAR CHART FOR CATEGORICAL FEATURE DISTRIBUTIONS

In [None]:
# Bar chart of the ten most frequent values of each categorical feature (2 x 3 grid).
colors = ['blue', 'green', 'red', 'purple', 'orange', 'cyan']
categorical_cols = ['country', 'page1_main_category', 'page2_clothing_model', 'colour', 'location', 'model_photography']

fig = sp.make_subplots(rows=2, cols=3, subplot_titles=categorical_cols)

for i, col in enumerate(categorical_cols):
    # Missing values are counted under the label "Unknown".
    filled_column = train_df[col].fillna("Unknown")
    value_counts = filled_column.value_counts().nlargest(10)

    # Row-major position in the grid; plotly rows and columns are 1-based.
    grid_row, grid_col = divmod(i, 3)
    fig.add_trace(
        go.Bar(
            x=value_counts.index,
            y=value_counts.values,
            marker_color=colors[i % len(colors)],
        ),
        row=grid_row + 1,
        col=grid_col + 1,
    )

layout_options = dict(
    title_text="📊 Categorical Feature Distributions (Top 10)",
    height=750,
    width=1100,
    showlegend=False,
    margin=dict(l=50, r=50, t=50, b=50),
    plot_bgcolor="white",
)
fig.update_layout(**layout_options)

fig.show()

### INTERACTIVE PAIR PLOTS FOR NUMERICAL FEATURES

In [None]:
# Scatter-matrix of the numeric features, with points coloured by the 'order' column.
numerical_cols = ['year', 'month', 'day', 'price', 'price_2', 'page']

pair_plot_options = dict(
    dimensions=numerical_cols,
    color="order",
    title="🔍 Pair Plot of Numerical Features",
    height=900,
    width=1100,
    color_continuous_scale=px.colors.sequential.Viridis,
)
fig = px.scatter_matrix(train_df, **pair_plot_options)

fig.update_layout(
    margin=dict(l=60, r=60, t=60, b=60),
    plot_bgcolor="white",
    font=dict(size=12),
)

fig.show()

### 2.2) SESSION ANALYSIS

### PAGE VIEWS & BOUNCE RATES

In [None]:
# Rows recorded per session, repeated on every row of that session.
rows_per_session = train_df.groupby("session_id")["page"].transform("count")
train_df["session_duration"] = rows_per_session

# Figure 1: distribution of the per-row session size.
fig1 = px.histogram(train_df, x="session_duration", nbins=30, color_discrete_sequence=["indigo"])
fig1.update_layout(
    title="📊 Session Duration Distribution",
    xaxis_title="Pages Viewed Per Session",
    yaxis_title="Count",
)

# Figure 2: one row per session with its row count; only the first 50 rows of
# that table are plotted.
page_views = train_df.groupby("session_id")["page"].count().reset_index()
first_sessions = page_views.head(50)
fig2 = px.bar(first_sessions, x="session_id", y="page", color="page",
              color_continuous_scale="Blues", title="📌 Page Views Per Session")
fig2.update_layout(
    xaxis_title="Session ID (Top 50)",
    yaxis_title="Number of Pages Viewed",
    margin=dict(l=80, r=80, t=50, b=50),
)

# Figure 3: a session with exactly one recorded row counts as a bounce.
bounce_sessions = page_views[page_views["page"] == 1]
n_bounced = len(bounce_sessions)
n_sessions = len(page_views)
bounce_rate = (n_bounced / n_sessions) * 100
fig3 = px.pie(
    names=["Bounced Sessions", "Non-Bounced Sessions"],
    values=[n_bounced, n_sessions - n_bounced],
    title=f"💡 Bounce Rate: {bounce_rate:.2f}%",
    color_discrete_sequence=["red", "green"],
)

for figure in (fig1, fig2, fig3):
    figure.show()

### 2.3) CORRELATION ANALYSIS

In [None]:
# Pairwise Pearson correlations between all numeric columns.
corr_matrix = train_df.select_dtypes(include=np.number).corr()

# Rank feature pairs by absolute correlation. Self-pairs are removed by label
# instead of with a "< 1" filter, which would also discard two distinct
# features that happen to be perfectly correlated. NaN entries (produced by
# constant columns) are dropped explicitly.
top_corr_features = corr_matrix.unstack().abs().dropna()
pair_index = top_corr_features.index
is_self_pair = pair_index.get_level_values(0) == pair_index.get_level_values(1)
top_corr_features = top_corr_features[~is_self_pair].sort_values(ascending=False, kind="stable")
top_50_features = top_corr_features.index[:50]

# Features that take part in the strongest pairs, in order of first appearance.
# dict.fromkeys de-duplicates with a stable order; iterating a set of strings
# can order the heatmap axes differently on every run.
selected_features = list(dict.fromkeys(name for pair in top_50_features for name in pair))
filtered_corr_matrix = corr_matrix.loc[selected_features, selected_features]

fig = ff.create_annotated_heatmap(
    z=filtered_corr_matrix.values,
    x=list(filtered_corr_matrix.columns),
    y=list(filtered_corr_matrix.index),
    colorscale='Blues',
    showscale=True,
    # Show each coefficient rounded to two decimals inside its cell.
    annotation_text=np.round(filtered_corr_matrix.values, 2)
)

fig.update_layout(
    margin=dict(l=100, r=100, t=50, b=50),
    width=900, height=800
)

fig.show()

### 2.4) TIME-BASED ANALYSIS - HOUR & DAY TRENDS

In [None]:
# NOTE: the hour and day-of-week values are generated at random here rather
# than taken from the data, so the chart below does not show real user
# behaviour. A seeded generator keeps the placeholder values, and therefore
# the chart, identical from run to run.
rng = np.random.default_rng(42)
train_df['hour'] = rng.integers(0, 24, size=len(train_df))
train_df['day_of_week'] = rng.integers(0, 7, size=len(train_df))

# Number of rows that fall into each (placeholder) hour.
hourly_counts = train_df.groupby('hour').size().reset_index(name="session_count")

# Line chart for sessions per hour; the title states that the data is synthetic.
fig = px.line(hourly_counts,
              x="hour", y="session_count",
              title="User Activity Over Different Hours (synthetic placeholder data)",
              markers=True, line_shape="spline",
              template="plotly_white")

fig.update_traces(line=dict(color="firebrick", width=3))
fig.show()

## 3) FEATURE ENGINEERING

### TARGET VARIABLE & SESSION-LEVEL FEATURES

In [None]:
train_df['converted'] = train_df['page'].apply(lambda x: 1 if x >= 0.75 else 0)
test_df['converted'] = test_df['page'].apply(lambda x: 1 if x >= 0.75 else 0)

In [None]:
train_df['converted']

In [None]:
test_df['converted']

In [None]:
train_df['session_length'] = train_df.groupby('session_id')['order'].transform('count')
test_df['session_length'] = test_df.groupby('session_id')['order'].transform('count')

In [None]:
train_df['session_length']

In [None]:
test_df['session_length']

In [None]:
train_df['avg_price_viewed'] = train_df.groupby('session_id')['price'].transform('mean')
test_df['avg_price_viewed'] = test_df.groupby('session_id')['price'].transform('mean')

In [None]:
train_df['avg_price_viewed']

In [None]:
test_df['avg_price_viewed']

In [None]:
train_df['unique_categories'] = train_df.groupby('session_id')['page1_main_category'].transform('nunique')
test_df['unique_categories'] = test_df.groupby('session_id')['page1_main_category'].transform('nunique')

In [None]:
train_df['unique_categories']

In [None]:
test_df['unique_categories']

In [None]:
train_df['session_price_interaction'] = train_df['session_length'] * train_df['avg_price_viewed']
test_df['session_price_interaction'] = test_df['session_length'] * test_df['avg_price_viewed']

In [None]:
train_df['session_price_interaction']

In [None]:
test_df['session_price_interaction']

In [None]:
print("Class Distribution in Training Data:")
print(train_df['converted'].value_counts(normalize=True))

### SELECTING FEATURES AND TARGET

In [None]:
features = ['session_length', 'avg_price_viewed', 'unique_categories', 'session_price_interaction']
target = 'converted'
X_train, y_train = train_df[features], train_df[target]
X_test, y_test = test_df[features], test_df[target]

In [None]:
X_train

In [None]:
X_test

In [None]:
y_train

In [None]:
y_test

## 4) BALANCING TECHNIQUES

### 4.1) IDENTIFY IMBALANCE

In [None]:
# Count how many training rows belong to each target class.
print("Original class distribution:", Counter(y_train))

### 4.2) TECHNIQUES FOR BALANCING

### OVERSAMPLING WITH SMOTE

In [None]:
# Keep at most len(X_train) labels. X_train and y_train are both taken from
# train_df, so they already have the same length and this keeps every label.
y_train = y_train[:len(X_train)]

In [None]:
y_train

### APPLY SMOTE FOR OVERSAMPLING

In [None]:
smote = SMOTE(sampling_strategy=0.4, random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print("After SMOTE:", Counter(y_train_smote))

### UNDERSAMPLING THE MAJOR CLASS

In [None]:
undersample = RandomUnderSampler(sampling_strategy=0.6, random_state=42)
X_train_balanced, y_train_balanced = undersample.fit_resample(X_train_smote, y_train_smote)

print("Final balanced class distribution:", Counter(y_train_balanced))

In [None]:
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)


### FEATURE SCALING

In [None]:
scaler = StandardScaler()

In [None]:
scaler

In [None]:
X_train_scaled = scaler.fit_transform(X_train_balanced)

In [None]:
X_train_scaled

In [None]:
X_test_scaled = scaler.transform(X_test)

In [None]:
X_test_scaled

## 5) MODEL BUILDING - SUPERVISED LEARNING MODELS

## CLASSIFICATION

### LOGISTIC REGRESSION

### TRAIN THE MODEL

In [None]:
print("Training data class distribution:", Counter(y_train_balanced))
print("Testing data class distribution:", Counter(y_test))

In [None]:
log_reg = LogisticRegression(random_state=77, class_weight = "balanced")

In [None]:
log_reg

In [None]:
log_reg.fit(X_train_scaled, y_train_balanced) 

### PREDICTION

In [None]:
y_prob_lr = log_reg.predict_proba(X_test_scaled)[:, 1]  

In [None]:
y_prob_lr

### ADJUST THRESHOLD

In [None]:
y_pred_lr = (y_prob_lr > 0.5).astype(int)  

In [None]:
y_pred_lr

In [None]:
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))

### DECISION TREE CLASSIFIER

In [None]:
param_grid = {
    'max_depth': [5, 10, 15, 20],  
    'min_samples_split': [10, 20, 50],  
    'class_weight': ["balanced"]
}

In [None]:
dt_model = DecisionTreeClassifier(random_state=42)

In [None]:
dt_model

### GRID SEARCH

In [None]:
grid_search_dt = GridSearchCV(dt_model, param_grid, scoring='f1', cv=3, verbose=2, n_jobs=-1)

In [None]:
grid_search_dt

In [None]:
grid_search_dt.fit(X_train_balanced, y_train_balanced)

In [None]:
best_dt = grid_search_dt.best_estimator_

In [None]:
best_dt

### PREDICTION

In [None]:
y_pred_dt_tuned = best_dt.predict(X_test_scaled)

In [None]:
y_pred_dt_tuned

### EVALUATION

In [None]:
print("Best Parameters:", grid_search_dt.best_params_)
print(classification_report(y_test, y_pred_dt_tuned))

### RANDOM FOREST CLASSIFIER

In [None]:
rf_model = RandomForestClassifier(
    n_estimators=500, 
    max_depth=20, 
    min_samples_split=10, 
    min_samples_leaf=5,  
    class_weight={0: 1, 1: 5},  
    random_state=42, 
    n_jobs=-1
)

In [None]:
rf_model

### TRAIN THE MODEL

In [None]:
rf_model.fit(X_train_balanced, y_train_balanced)

### PREDICTION

In [None]:
y_pred_rf = rf_model.predict(X_test_scaled)

In [None]:
y_pred_rf

### EVALUATION

In [None]:
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))

### XGBOOST CLASSIFIER

In [None]:
xgb_model = XGBClassifier(
    n_estimators=300,
    max_depth=5,
    learning_rate=0.02,
    scale_pos_weight=1.0,  
    min_child_weight=20,  
    gamma=5,  
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='aucpr',
    random_state=42
)

In [None]:
xgb_model

### TRAIN THE MODEL

In [None]:
xgb_model.fit(X_train_balanced, y_train_balanced)

### PREDICTION

In [None]:
y_pred_xgb = xgb_model.predict(X_test_scaled)

In [None]:
y_pred_xgb

### EVALUATION

In [None]:
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("Classification Report:\n", classification_report(y_test, y_pred_xgb))

### NEURAL NETWORK CLASSIFIER

In [None]:
smote = SMOTE(sampling_strategy=1.00, random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [None]:
mlp_model = MLPClassifier(hidden_layer_sizes=(256, 128, 64, 32), activation='relu', solver='adam', 
                    alpha=0.01, batch_size=64, max_iter=1000, random_state=42, early_stopping=True)

In [None]:
mlp_model

### TRAIN THE MODEL

In [None]:
print(f"X_train_balanced shape: {X_train_balanced.shape}")
print(f"y_train_balanced shape: {y_train_balanced.shape}")

In [None]:
if len(X_train_balanced) != len(y_train_balanced):
    min_length = min(len(X_train_balanced), len(y_train_balanced))
    X_train_balanced = X_train_balanced[:min_length]
    y_train_balanced = y_train_balanced[:min_length]

In [None]:
if hasattr(mlp_model, "n_layers_"):
    y_pred_mlp = mlp_model.predict(X_test_scaled)
else:
    print("MLP Model has not been trained properly.")


In [None]:
try:
    mlp_model.fit(X_train_balanced, y_train_balanced)
except ValueError as e:
    print(f"Training failed: {e}")


In [None]:
mlp_model.fit(X_train_balanced, y_train_balanced)

### PREDICTION

In [None]:
y_pred_mlp = mlp_model.predict(X_test_scaled)

In [None]:
y_pred_mlp

### THRESHOLD TUNING

In [None]:
# Probability of class 1 for every test row.
y_probs = mlp_model.predict_proba(X_test_scaled)[:, 1]

precisions, recalls, thresholds = precision_recall_curve(y_test, y_probs)
# precision_recall_curve returns one more precision/recall value than
# thresholds (the final point has no threshold). Drop that point before the
# argmax, otherwise the chosen index can fall outside `thresholds`.
f1_scores = 2 * (precisions[:-1] * recalls[:-1]) / (precisions[:-1] + recalls[:-1] + 1e-6)
optimal_threshold = thresholds[np.argmax(f1_scores)]

# The curve treats "score >= threshold" as a positive prediction, so the same
# comparison is used when applying the chosen threshold.
# NOTE(review): the threshold is selected using the test labels, and y_pred is
# not the array scored by the evaluation cells (they use y_pred_mlp) — confirm
# whether a validation split and y_pred were intended.
y_pred = (y_probs >= optimal_threshold).astype(int)

### EVALUATION

In [None]:
accuracy = accuracy_score(y_test, y_pred_mlp)
precision = average_precision_score(y_test, y_pred_mlp)
recall = recall_score(y_test, y_pred_mlp)
f1 = f1_score(y_test, y_pred_mlp)
roc_auc = roc_auc_score(y_test, y_pred_mlp)

### FINAL RESULTS

In [None]:
print("Classification Report:\n", classification_report(y_test, y_pred_mlp))

## REGRESSION

### INITIALIZE LINEAR REGRESSION - TRAIN THE MODEL

In [None]:
lr_model = LinearRegression()

In [None]:
lr_model

### FIT THE MODEL

In [None]:
lr_model.fit(X_train_balanced, y_train_balanced)

### PREDICTION

In [None]:
y_pred_lr = lr_model.predict(X_test_scaled)

In [None]:
y_pred_lr

### EVALUATION

In [None]:
mae = mean_absolute_error(y_test, y_pred_lr)
mse = mean_squared_error(y_test, y_pred_lr)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred_lr)

### FINAL RESULTS

In [None]:
print("Linear Regression Results:")
print(f"MAE: {mae:.4f}, MSE: {mse:.4f}, RMSE: {rmse:.4f}, R^2: {r2:.4f}\n")

print("Linear Regression model training and evaluation completed.")

### RIDGE REGRESSION MODEL - TRAINING

In [None]:
param_grid_ridge = {'alpha': [0.1, 1.0, 10.0, 100.0]}

In [None]:
param_grid_ridge

In [None]:
ridge_model = Ridge()

In [None]:
ridge_model

### FIT THE MODEL

In [None]:
grid_search_ridge = GridSearchCV(ridge_model, param_grid_ridge, cv=5, scoring='r2')

In [None]:
grid_search_ridge.fit(X_train_balanced, y_train_balanced)

In [None]:
best_ridge_model = grid_search_ridge.best_estimator_

In [None]:
best_ridge_model

### PREDICTION

In [None]:
y_pred_ridge = best_ridge_model.predict(X_test_scaled)

In [None]:
y_pred_ridge

### EVALUATION

In [None]:
mae = mean_absolute_error(y_test, y_pred_ridge)
mse = mean_squared_error(y_test, y_pred_ridge)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred_ridge)

### FINAL RESULTS

In [None]:
print("Ridge Regression Results:")
print(f"MAE: {mae:.4f}, MSE: {mse:.4f}, RMSE: {rmse:.4f}, R^2: {r2:.4f}\n")

print("Ridge Regression model training and evaluation completed.")

### LASSO REGRESSION

In [None]:
param_grid_lasso = {'alpha': [0.1, 1.0, 10.0, 100.0]}

In [None]:
param_grid_lasso

In [None]:
lasso_model = Lasso()

In [None]:
lasso_model

### FIT THE MODEL

In [None]:
grid_search_lasso = GridSearchCV(lasso_model, param_grid_lasso, cv=5, scoring='r2')

In [None]:
grid_search_lasso

In [None]:
grid_search_lasso.fit(X_train_balanced, y_train_balanced)

In [None]:
best_lasso_model = grid_search_lasso.best_estimator_

In [None]:
best_lasso_model

### PREDICTION

In [None]:
y_pred_lasso = best_lasso_model.predict(X_test_scaled)

In [None]:
y_pred_lasso

### EVALUATION

In [None]:
mae = mean_absolute_error(y_test, y_pred_lasso)
mse = mean_squared_error(y_test, y_pred_lasso)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred_lasso)

In [None]:
print("Lasso Regression Results:")
print(f"MAE: {mae:.4f}, MSE: {mse:.4f}, RMSE: {rmse:.4f}, R^2: {r2:.4f}\n")

print("Lasso Regression model training and evaluation completed.")

### GRADIENT BOOSTING REGRESSOR

### HYPERPARAMETER TUNING

In [None]:
param_grid_gbr = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}

In [None]:
param_grid_gbr

### TRAIN THE MODEL

In [None]:
gbr_model = GradientBoostingRegressor()

In [None]:
gbr_model

In [None]:
grid_search_gbr = GridSearchCV(gbr_model, param_grid_gbr, cv=5, scoring='r2')

In [None]:
grid_search_gbr

### FIT THE MODEL

In [None]:
grid_search_gbr.fit(X_train_balanced, y_train_balanced)

In [None]:
best_gbr_model = grid_search_gbr.best_estimator_

### PREDICTION

In [None]:
y_pred_gbr = best_gbr_model.predict(X_test_scaled)

In [None]:
y_pred_gbr

### EVALUATION

In [None]:
mae = mean_absolute_error(y_test, y_pred_gbr)
mse = mean_squared_error(y_test, y_pred_gbr)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred_gbr)

### FINAL RESULTS

In [None]:
print(f"MAE: {mae:.4f}, MSE: {mse:.4f}, RMSE: {rmse:.4f}, R^2: {r2:.4f}\n")

print("Gradient Boosting Regressor model training and evaluation completed.")

## UNSUPERVISED LEARNING MODELS

### K-MEANS CLUSTERING

### ENCODING CATEGORICAL VARIABLE

In [None]:
categorical_columns = X_train_balanced.select_dtypes(include=['object']).columns.tolist()
print("Categorical Columns:", categorical_columns)

In [None]:
if categorical_columns:
    X_encoded = encoder.fit_transform(X_train_balanced[categorical_columns])
    X_encoded = pd.DataFrame(X_encoded, columns=encoder.get_feature_names_out(categorical_columns))

    X_train_balanced = X_train_balanced.drop(columns=categorical_columns)
    X_train_balanced = pd.concat([X_train_balanced, X_encoded], axis=1)

In [None]:
X_train_balanced.fillna(X_train_balanced.median(), inplace=True)

### STANDARDIZING THE NUMERICAL FEATURES

In [None]:
scaler = StandardScaler()

In [None]:
scaler

In [None]:
X_train_balanced_scaled = scaler.fit_transform(X_train_balanced)

In [None]:
X_train_balanced_scaled

In [None]:
kmeans_model = KMeans(n_clusters=3, random_state=42, n_init=10)

In [None]:
kmeans_model

### FIT THE MODEL

In [None]:
kmeans_labels = kmeans_model.fit_predict(X_train_balanced_scaled)

In [None]:
kmeans_labels

### PREDICTION

In [None]:
kmeans_silhouette = silhouette_score(X_train_balanced_scaled, kmeans_labels)
kmeans_davies_bouldin = davies_bouldin_score(X_train_balanced_scaled, kmeans_labels)
# kmeans_wcss = kmeans_labels.inertia_  

### EVALUATION

In [None]:
print("K-Means Clustering Results:")
print(f"Silhouette Score: {kmeans_silhouette:.4f}")
print(f"Davies-Bouldin Index: {kmeans_davies_bouldin:.4f}")
# print(f"Within-Cluster Sum of Squares (WCSS): {kmeans_wcss:.4f}")

### DBSCAN CLUSTERING

In [None]:
dbscan_model = DBSCAN(eps=0.5, min_samples=5)

In [None]:
dbscan_model

### FIT THE MODEL

In [None]:
dbscan_labels = dbscan_model.fit_predict(X_train_balanced_scaled)

In [None]:
dbscan_labels

### PREDICTION

In [None]:
dbscan_valid_labels = dbscan_labels[dbscan_labels != -1]
if len(set(dbscan_valid_labels)) > 1:
    dbscan_silhouette = silhouette_score(X_train_balanced_scaled[dbscan_labels != -1], dbscan_valid_labels)
    dbscan_davies_bouldin = davies_bouldin_score(X_train_balanced_scaled[dbscan_labels != -1], dbscan_valid_labels)
else:
    dbscan_silhouette = -1
    dbscan_davies_bouldin = -1

### EVALUATION

In [None]:
print("\nDBSCAN Clustering Results:")
print(f"Silhouette Score: {dbscan_silhouette:.4f}")
print(f"Davies-Bouldin Index: {dbscan_davies_bouldin:.4f}")

### HIERARCHICAL CLUSTERING

In [None]:
# Cluster a random subset of at most 5000 rows. The cap avoids the ValueError
# that DataFrame.sample raises when asked for more rows than the frame holds.
sample_size = min(5000, len(X_train_balanced))
X_train_sampled = X_train_balanced.sample(n=sample_size, random_state=42)
# Reuse the already-fitted scaler so the subset is on the same scale as before.
X_train_sampled_scaled = scaler.transform(X_train_sampled)

In [None]:
hierarchical_model = AgglomerativeClustering(n_clusters=3)

In [None]:
hierarchical_model

### FIT THE MODEL

In [None]:
hierarchical_labels = hierarchical_model.fit_predict(X_train_sampled_scaled)

In [None]:
hierarchical_labels

### PREDICTION

In [None]:
hierarchical_silhouette = silhouette_score(X_train_sampled_scaled, hierarchical_labels)
hierarchical_davies_bouldin = davies_bouldin_score(X_train_sampled_scaled, hierarchical_labels)

### EVALUATION

In [None]:
print("\nHierarchical Clustering Results:")
print(f"Silhouette Score: {hierarchical_silhouette:.4f}")
print(f"Davies-Bouldin Index: {hierarchical_davies_bouldin:.4f}")

## PIPELINE DEVELOPMENT

### CLASSIFICATION & REGRESSION PIPELINE

In [None]:
classification_pipeline = Pipeline([
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

In [None]:
classification_pipeline

In [None]:
regression_pipeline = Pipeline([
    ('regressor', GradientBoostingRegressor(n_estimators=100, random_state=42))
])

In [None]:
regression_pipeline

### HYPERPARAMETER TUNING

In [None]:
param_grid_classification = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [None, 10, 20]
}

classification_grid = GridSearchCV(classification_pipeline, param_grid_classification, cv=5, scoring='accuracy')

In [None]:
classification_grid

In [None]:
param_grid_regression = {
    'regressor__n_estimators': [50, 100, 200],
    'regressor__learning_rate': [0.01, 0.1, 0.2]
}

regression_grid = GridSearchCV(regression_pipeline, param_grid_regression, cv=5, scoring='neg_mean_squared_error')

In [None]:
regression_grid

### MODEL TRAINING AND EVALUATION

In [None]:
classification_grid.fit(X_train_balanced, y_train_balanced)

In [None]:
y_pred_classification = classification_grid.best_estimator_.predict(X_test_scaled)

In [None]:
y_pred_classification

In [None]:
print("Best Classification Model:", classification_grid.best_estimator_)
print("Best Classification Score:", classification_grid.best_score_)

In [None]:
regression_grid.fit(X_train_balanced, y_train_balanced)

In [None]:
y_pred_regression = regression_grid.best_estimator_.predict(X_test_scaled)

In [None]:
y_pred_regression

In [None]:
print("Best Regression Model:", regression_grid.best_estimator_)
print("Best Regression Score:", regression_grid.best_score_)