## Package imports

In [None]:
import configparser
import pandas as pd
import os
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import silhouette_score, adjusted_rand_score
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

## Data exploration

### Preparing datasets for analysis

In [None]:
# Initialize the configuration from the project's INI file.
config = configparser.ConfigParser()
config_path = '../cfg/config.ini'
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed; an empty list means nothing was loaded, which
# would otherwise only surface later as a KeyError on config['FILES'].
if not config.read(config_path):
    raise FileNotFoundError(f"Configuration file not found or unreadable: {config_path}")

In [None]:
# Load every CSV listed in the [FILES] section into a dict of DataFrames keyed
# by option name. The 'data_dir' option is the base folder, not a dataset.
data_dir = config['FILES']['data_dir']
dfs = {}
for dataset_key, csv_name in config['FILES'].items():
    if dataset_key == 'data_dir':
        continue
    csv_path = os.path.join(data_dir, csv_name)
    # errors='ignore' makes the drop a no-op when there is no 'index' column.
    dfs[dataset_key] = pd.read_csv(csv_path).drop(columns=['index'], errors='ignore')

In [None]:
# Print a short summary (shape, column names, first rows) of each loaded dataset.
for name, df in dfs.items():
    print(
        f"Dataset '{name}':",
        f"- Shape: {df.shape}",
        f"- Columns: {df.columns.tolist()}",
        "- First rows:",
        sep='\n',
    )
    # The extra '\n' argument leaves a blank line between datasets.
    print(df.head(), '\n')

### Merging datasets

In [None]:
# Build one customer-centric table: start from customers, then left-join
# orders (on customer_id), order items, payments and review scores (on order_id).
# NOTE(review): each left join on 'order_id' repeats a row once per matching
# record, so an order with several items and/or payments would span several
# rows -- confirm this is intended for the downstream clustering.
customer_orders_df = (
    dfs['customers']
    .merge(dfs['orders'], on='customer_id', how='left')
    .merge(dfs['order_items'], on='order_id', how='left')
    .merge(dfs['order_pymts'], on='order_id', how='left')
    .merge(dfs['order_reviews'][['order_id', 'review_score']], on='order_id', how='left')
)

# Keep only the columns relevant to the analysis (order is preserved).
selected_columns = [
    # customer
    'customer_id', 'customer_unique_id', 'customer_zip_code_prefix',
    'customer_city', 'customer_state',
    # order
    'order_id', 'order_status', 'order_purchase_timestamp',
    'order_delivered_customer_date', 'order_estimated_delivery_date',
    # item
    'product_id', 'price', 'freight_value',
    # payment
    'payment_type', 'payment_installments', 'payment_value',
    # review
    'review_score',
]
customer_orders_df = customer_orders_df[selected_columns]

In [None]:
# Summarize the merged customer-centric dataset: shape, columns and first rows.
print('\n'.join((
    "Customer-centric Merged Dataset:",
    f"- Shape: {customer_orders_df.shape}",
    f"- Columns: {customer_orders_df.columns.tolist()}",
    "- First rows:",
)))
print(customer_orders_df.head())

### Handling missing values

In [None]:
# Impute missing values, then discard rows that are still incomplete.
#
# Each column is assigned back explicitly. The previous form,
# `df[col].fillna(..., inplace=True)`, operates on an intermediate Series:
# pandas warns about this chained in-place call, and with Copy-on-Write
# enabled it leaves the DataFrame untouched, so nothing would be imputed.

# Numeric columns: fill gaps with the column median.
for col in ['price', 'freight_value', 'payment_installments', 'payment_value', 'review_score']:
    customer_orders_df[col] = customer_orders_df[col].fillna(customer_orders_df[col].median())

# Categorical columns: fill gaps with the most frequent value.
for col in ['customer_state', 'payment_type']:
    customer_orders_df[col] = customer_orders_df[col].fillna(customer_orders_df[col].mode()[0])

# Drop rows that still have a NaN in any column.
# NOTE(review): this also removes rows missing non-imputed fields such as
# 'order_delivered_customer_date' -- confirm that is the intended filter.
customer_orders_df.dropna(inplace=True)

## Applying clustering

In [None]:
# Columns fed to the clustering pipeline, grouped by how they are preprocessed.
numerical_features = [
    'price',
    'freight_value',
    'payment_installments',
    'payment_value',
    'review_score',
]
categorical_features = [
    'customer_state',
    'payment_type',
]

In [None]:
# Preprocessing: standardize the numeric columns and one-hot encode the
# categorical ones (categories unseen at fit time are encoded as all zeros).
#
# sparse_threshold=0 forces a dense array. With the default threshold the
# one-hot block can make ColumnTransformer return a scipy sparse matrix, which
# GaussianMixture and AgglomerativeClustering do not accept as input.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ],
    sparse_threshold=0,
)

In [None]:
# Seed shared by the clustering cells that accept a random_state.
random_state = 42

# Fit the preprocessing pipeline on the merged table and build the feature matrix.
X = preprocessor.fit_transform(customer_orders_df)

In [None]:
#function for clustering evaluation and visualization
def evaluate_and_visualize(X, labels, title, random_state=42):
    """Print the silhouette score of a clustering and plot it with t-SNE.

    Args:
        X: Feature matrix the labels were computed on.
        labels: Cluster label assigned to each row of ``X``.
        title: Name of the clustering, used in the printed line and the
            plot title.
        random_state: Seed for the t-SNE embedding. Defaults to 42, the
            value that was previously hard-coded.

    Returns:
        None. The score is printed and the scatter plot is shown.
    """
    # silhouette_score requires 2 <= n_labels <= n_samples - 1 and raises
    # ValueError otherwise (e.g. when a density-based method returns a single
    # label), so report that case instead of aborting before the plot.
    n_labels = len(set(labels))
    if 2 <= n_labels < len(labels):
        silhouette_avg = silhouette_score(X, labels)
        print(f"{title} Silhouette Score: {silhouette_avg:.3f}")
    else:
        print(f"{title} Silhouette Score: undefined ({n_labels} distinct label(s))")

    # Project the samples to 2-D for plotting; points are colored by label.
    X_embedded = TSNE(n_components=2, random_state=random_state).fit_transform(X)
    plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=labels, cmap='viridis', s=50)
    plt.title(f"T-SNE visualization of {title}")
    plt.xlabel("TSNE Component 1")
    plt.ylabel("TSNE Component 2")
    plt.show()

### KMeans

In [None]:
# K-Means with 5 clusters, seeded for reproducibility; then score and plot.
kmeans = KMeans(
    n_clusters=5,
    random_state=random_state,
)
labels_kmeans = kmeans.fit_predict(X)
evaluate_and_visualize(X, labels_kmeans, title="KMeans clusters")

### GMM

In [None]:
# Gaussian mixture with 5 components, seeded for reproducibility; each sample
# is labeled by fit_predict, then the result is scored and plotted.
gmm = GaussianMixture(
    n_components=5,
    random_state=random_state,
)
labels_gmm = gmm.fit_predict(X)
evaluate_and_visualize(X, labels_gmm, title="GMM clusters")

### Hierarchical clustering

In [None]:
# Agglomerative (hierarchical) clustering into 5 clusters, with the
# estimator's default settings otherwise; then score and plot.
agglo = AgglomerativeClustering(
    n_clusters=5,
)
labels_agglo = agglo.fit_predict(X)
evaluate_and_visualize(X, labels_agglo, title="Hierarchical clusters")

### DBSCAN

In [None]:
# DBSCAN with a neighborhood radius of 0.5 and at least 5 samples per core
# point. The number of clusters is not fixed in advance, and samples DBSCAN
# treats as noise receive the label -1.
dbscan = DBSCAN(
    eps=0.5,
    min_samples=5,
)
labels_dbscan = dbscan.fit_predict(X)
evaluate_and_visualize(X, labels_dbscan, title="DBSCAN clusters")
