In [None]:
import os
from typing import Optional, Tuple
from collections import Counter
import datetime
import matplotlib.pyplot as plt
import pandas as pd
import calendar

In [None]:
# Read the three "inf_2000" extracts with pandas.read_csv(). Its default
# delimiter is ",", so only the semicolon-delimited client file really needs
# an explicit separator.
products_df = pd.read_csv('product_inf_2000.csv', sep=',')
client_df = pd.read_csv('client_inf_2000.csv', delimiter=';')
transactions_df = pd.read_csv('transac_inf_2000.csv')

In [None]:
# Second batch of extracts (the "(1)" files). The product file is
# comma-delimited; the client and transaction files use semicolons.
products = pd.read_csv('product(1).csv', sep=',')
client = pd.read_csv('client(1).csv', delimiter=';')
transactions = pd.read_csv('transac(1).csv', delimiter=';')

In [None]:
# Stack both product extracts into a single frame with a fresh 0..n-1 index
combined_products = pd.concat([products, products_df], ignore_index=True)

# Align every column with the dtype it has in products_df (one astype call
# with a {column: dtype} mapping instead of a per-column loop)
combined_products = combined_products.astype(products_df.dtypes.to_dict())

# Inspect the combined frame. info() writes its report to stdout itself and
# returns None, so it must not be wrapped in print() (that appended a stray
# "None" line to the cell output).
combined_products.info()

# Optional: persist the combined frame as CSV
combined_products.to_csv('combined_products.csv', index=False, sep=',', na_rep='')

In [None]:
# Stack both client extracts into a single frame with a fresh 0..n-1 index
combined_client = pd.concat([client, client_df], ignore_index=True)

# Align every column with the dtype it has in client_df (one astype call
# with a {column: dtype} mapping instead of a per-column loop)
combined_client = combined_client.astype(client_df.dtypes.to_dict())

# Inspect the combined frame. info() writes its report to stdout itself and
# returns None, so it must not be wrapped in print() (that appended a stray
# "None" line to the cell output).
combined_client.info()

# Optional: persist the combined frame as CSV
combined_client.to_csv('combined_client.csv', index=False, sep=',', na_rep='')

In [None]:
# Rename 'website_version' to 'country' so both transaction extracts share
# the same column name before they are stacked
transactions_df = transactions_df.rename(columns={'website_version': 'country'})

# Stack both transaction extracts into a single frame with a fresh index
combined_transactions = pd.concat([transactions, transactions_df], ignore_index=True)

# Align every column with the dtype it has in transactions_df (one astype
# call with a {column: dtype} mapping instead of a per-column loop)
combined_transactions = combined_transactions.astype(transactions_df.dtypes.to_dict())

# Inspect the combined frame. info() writes its report to stdout itself and
# returns None, so it must not be wrapped in print() (that appended a stray
# "None" line to the cell output).
combined_transactions.info()

# Optional: persist the combined frame as CSV
combined_transactions.to_csv('combined_transactions.csv', index=False, sep=',', na_rep='')

In [None]:
def get_month_and_year(week_string):
    """Convert a week code such as ``"W202148"`` into a ``"Mon-YYYY"`` label.

    The code is read positionally: one leading character, a four-digit year,
    then the week number.

    Args:
        week_string: Week code, e.g. ``"W202148"``.

    Returns:
        The first three letters of the month name, a dash and the year,
        e.g. ``"Dec-2021"``. The month/year is the one that holds the
        majority of the seven days of that ISO week.

    Raises:
        ValueError: If the year or week part is not an integer.
    """
    year = int(week_string[1:5])
    week_num = int(week_string[5:])

    if week_num == 0:
        # Week 0 is mapped to January of the given year. The previous code used
        # fromisocalendar(year, 1, 1), i.e. the Monday of ISO week 1, which can
        # fall in December of the previous year (e.g. 2019-12-30 for 2020).
        reference_day = datetime.date(year, 1, 1)
    else:
        try:
            # A week touches at most two months, so whichever month (and year)
            # owns at least 4 of its 7 days necessarily owns day 4, Thursday.
            # Looking at Thursday alone therefore gives the majority month/year.
            reference_day = datetime.date.fromisocalendar(year, week_num, 4)
        except ValueError:
            # Week number that does not exist in this year (e.g. a week 53 in
            # a 52-week year): fall back to the last day of the year.
            reference_day = datetime.date(year, 12, 31)

    month_label = calendar.month_name[reference_day.month][:3]
    return month_label + '-' + str(reference_day.year)

# Example: print the label produced for week code "W202148"
week_string = "W202148"
print(get_month_and_year(week_string=week_string))

In [None]:
# Replace each week code in the client frame with its "Mon-YYYY" label.
# NOTE: this overwrites the column in place, so running the cell a second time
# fails: the labels it wrote are no longer parseable week codes.
combined_client['week'] = combined_client['week'].map(get_month_and_year)
combined_client.head(2)

In [None]:
# Replace each week code in the transaction frame with its "Mon-YYYY" label.
# NOTE: this overwrites the column in place, so running the cell a second time
# fails: the labels it wrote are no longer parseable week codes.
combined_transactions['week'] = combined_transactions['week'].map(get_month_and_year)
combined_transactions.head(2)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
from lifetimes import BetaGeoFitter, GammaGammaFitter
from lifetimes.utils import summary_data_from_transaction_data
import seaborn as sns
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans



In [None]:
colors = ['#D4C4B7',    # beige
          '#8B4513',    # saddle brown
          '#2F1810']    # dark brown/black

# Total quantity sold per period label and store type
sales_by_store = (
    combined_transactions
    .groupby(['week', 'store_type_label'])['product_quantity']
    .sum()
    .reset_index()
)

plt.figure(figsize=(14, 6))
sns.lineplot(
    # 'week' holds "Mon-YYYY" text labels at this point. Plotted as text they
    # are ordered alphabetically ("Apr-…", "Aug-…", "Dec-…"), which scrambles
    # the timeline, so they are parsed into timestamps for the x axis.
    # NOTE(review): '%b' expects English month abbreviations — confirm the
    # notebook runs under the default locale.
    data=sales_by_store.assign(
        week=pd.to_datetime(sales_by_store['week'], format='%b-%Y')
    ),
    x='week',
    y='product_quantity',
    hue='store_type_label',
    palette=colors,
)

plt.title('Évolution des ventes Web vs Magasins Physiques', fontsize=12, pad=15)
plt.xlabel('Semaine', fontsize=10)
plt.ylabel('Quantité de produits vendus', fontsize=10)

# Rotate x-axis labels 45 degrees; ha='right' keeps the rotated labels aligned
# with their ticks
plt.xticks(rotation=45, ha='right')

# Legend is anchored outside the axes, to the right of the plot
plt.legend(title='Type de magasin', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True, alpha=0.2)

# Adjust layout to prevent label cutoff
plt.tight_layout()
plt.show()

In [None]:
# Keep only the rows whose store type is 'Web'
web_sales = combined_transactions[combined_transactions['store_type_label'] == 'Web']

# Total quantity sold on the Web per period label
sales_by_store = web_sales.groupby('week')['product_quantity'].sum().reset_index()

# Single brown line for Web sales
colors = ['#8B4513']

plt.figure(figsize=(14, 6))
sns.lineplot(
    # 'week' holds "Mon-YYYY" text labels at this point. Plotted as text they
    # are ordered alphabetically ("Apr-…", "Aug-…", "Dec-…"), which scrambles
    # the timeline, so they are parsed into timestamps for the x axis.
    # NOTE(review): '%b' expects English month abbreviations — confirm the
    # notebook runs under the default locale.
    data=sales_by_store.assign(
        week=pd.to_datetime(sales_by_store['week'], format='%b-%Y')
    ),
    x='week',
    y='product_quantity',
    color=colors[0],
    linewidth=2,
)

plt.title('Évolution des ventes Web', fontsize=12, pad=15)
plt.xlabel('Semaine', fontsize=10)
plt.ylabel('Quantité de produits vendus', fontsize=10)
plt.xticks(rotation=45, ha='right')

plt.grid(True, alpha=0.2)
plt.gca().set_facecolor('#FAF9F6')

plt.tight_layout()
plt.show()

In [None]:
# clustering_models.py
class ClusteringModel:
    """Shared feature preparation and plotting for the clustering models."""

    def __init__(self):
        # Scaler used (and fitted) by prepare_clustering_data
        self.scaler = StandardScaler()

    def prepare_clustering_data(self, combined_client, combined_transactions, combined_products):
        """Build the scaled per-client feature matrix used for clustering.

        Args:
            combined_client: Frame with 'clients', 'items_bought' and 'week'
                columns.
            combined_transactions: Accepted for interface compatibility; not
                used. (An earlier per-week aggregation of this frame was
                computed and then discarded; it has been removed.)
            combined_products: Accepted for interface compatibility; not used.

        Returns:
            DataFrame with columns 'total_transactions' (sum of 'items_bought'
            per client) and 'frequency' (number of distinct 'week' values per
            client), transformed by ``self.scaler.fit_transform``. Rows follow
            the order of ``groupby('clients')`` but carry a fresh 0..n-1
            index; the client ids themselves are not kept.
        """
        # One pass over the client groups produces both features
        client_features = (
            combined_client
            .groupby('clients')
            .agg(
                total_transactions=('items_bought', 'sum'),
                frequency=('week', 'nunique'),
            )
            .fillna(0)  # missing values are treated as zero
        )

        # Scale the features
        features_scaled = self.scaler.fit_transform(client_features)
        return pd.DataFrame(features_scaled, columns=client_features.columns)

    def visualize_clusters(self, data, clusters, method='kmeans'):
        """Scatter-plot the first two columns of ``data``, coloured by cluster.

        Args:
            data: DataFrame with at least two columns (accessed by position).
            clusters: Cluster label for each row of ``data``.
            method: Name shown in the plot title.
        """
        plt.figure(figsize=(10, 6))
        scatter = plt.scatter(data.iloc[:, 0], data.iloc[:, 1], c=clusters, cmap='viridis')
        plt.title(f'Visualisation des clusters ({method})')
        # Label the axes with the plotted columns so the figure stands alone
        plt.xlabel(str(data.columns[0]))
        plt.ylabel(str(data.columns[1]))
        plt.colorbar(scatter)
        plt.show()

In [None]:
# kmeans_model.py
class KMeansModel(ClusteringModel):
    """K-means clustering on top of the shared ClusteringModel helpers."""

    def __init__(self):
        super().__init__()
        # Fitted estimator of the most recent cluster() call (None before that)
        self.kmeans = None

    def cluster(self, data, n_clusters=3):
        """Fit K-means on ``data`` and return one cluster label per row.

        Args:
            data: Feature matrix passed to ``KMeans.fit_predict``.
            n_clusters: Number of clusters to form.

        Returns:
            Array of cluster labels, one per row of ``data``.
        """
        self.kmeans = KMeans(n_clusters=n_clusters, random_state=1234)
        clusters = self.kmeans.fit_predict(data)

        # The silhouette score is only defined for 2 <= n_labels <= n_samples - 1;
        # outside that range silhouette_score raises, which used to discard the
        # labels that had just been computed.
        n_labels = len(set(clusters))
        if 2 <= n_labels <= len(data) - 1:
            silhouette_avg = silhouette_score(data, clusters)
            print(f"Score silhouette K-means: {silhouette_avg}")
        else:
            print("Not enough clusters found for silhouette score calculation")

        return clusters

In [None]:
class DBSCANModel(ClusteringModel):
    """DBSCAN clustering on top of the shared ClusteringModel helpers."""

    def __init__(self):
        super().__init__()
        # Fitted estimator of the most recent DBSCAN run (None before that)
        self.dbscan = None

    def cluster(self, data, eps=0.3, min_samples=3):
        """Run DBSCAN, halving ``eps`` while a single cluster swallows the data.

        Args:
            data: Feature matrix passed to ``DBSCAN.fit_predict``; must
                support boolean-mask row selection.
            eps: Initial neighbourhood radius.
            min_samples: Neighbours required for a core point.

        Returns:
            Array of labels from the last run; noise points are labelled -1.
        """
        while True:
            self.dbscan = DBSCAN(eps=eps, min_samples=min_samples)
            clusters = self.dbscan.fit_predict(data)

            mask = clusters != -1  # True for points assigned to a cluster
            n_clusters = len(set(clusters[mask]))

            print(f"Number of clusters found: {n_clusters}")
            print(f"Number of noise points: {int((~mask).sum())}")

            if n_clusters >= 2:
                try:
                    # Noise points are excluded from the score
                    silhouette_avg = silhouette_score(data[mask], clusters[mask])
                    print(f"Score silhouette DBSCAN: {silhouette_avg}")
                except Exception as e:
                    print(f"Could not calculate silhouette score: {str(e)}")
                return clusters

            print("Not enough clusters found for silhouette score calculation")

            # A smaller eps can only split an existing cluster. When no cluster
            # was found at all there are no core points, and shrinking the
            # neighbourhood cannot create any, so retrying would only refit
            # the same all-noise result.
            if n_clusters == 0 or eps <= 0.1:
                return clusters

            print("Attempting with smaller eps...")
            eps = eps / 2

In [None]:
# Show the first two rows of the transaction frame
combined_transactions.iloc[:2]

In [None]:
# Preview the per-customer summary that lifetimes builds from the transactions.
# Its positional parameters are (transactions, customer_id_col, datetime_col, ...);
# the previous call passed 'week' as the customer id and 'product_quantity' as
# the date column.
# NOTE(review): assumes combined_transactions has a 'clients' column, as
# CLVModel.prepare_clv_data in this notebook does — confirm.
summary_data_from_transaction_data(
    combined_transactions,
    'clients',
    'week',
    monetary_value_col='product_quantity',
    datetime_format='%b-%Y',  # 'week' holds "Mon-YYYY" labels at this point
).head(2)

In [None]:
# clv_models.py
class CLVModel:
    """Shared data preparation and plotting for the CLV models."""

    def prepare_clv_data(self, combined_transactions, datetime_format=None):
        """Build the lifetimes summary frame from the transaction log.

        Args:
            combined_transactions: Frame with 'clients', 'week' and
                'product_quantity' columns.
            datetime_format: Optional strptime-style format of the 'week'
                column, forwarded to lifetimes. With None the parsing is left
                to lifetimes/pandas.
                NOTE(review): raw week codes such as "W202148" are unlikely to
                be parsed without a format — confirm what the caller passes.

        Returns:
            Whatever ``summary_data_from_transaction_data`` returns for these
            columns (the models in this notebook read its 'frequency',
            'recency', 'T' and 'monetary_value' columns).
        """
        # Keyword arguments on purpose: the fifth positional parameter of
        # summary_data_from_transaction_data is `datetime_format`, so the
        # previous positional 'count_distinct_transaction' was being used as a
        # date format string rather than as a column.
        return summary_data_from_transaction_data(
            combined_transactions,
            customer_id_col='clients',
            datetime_col='week',
            monetary_value_col='product_quantity',
            datetime_format=datetime_format,
        )

    def visualize_clv_distribution(self, predicted_clv):
        """Plot a 50-bin histogram of the predicted CLV values."""
        plt.figure(figsize=(10, 6))
        sns.histplot(predicted_clv, bins=50)
        plt.title('Distribution des CLV prédits')
        plt.xlabel('CLV prédit')
        plt.ylabel('Fréquence')
        plt.show()

In [None]:
# bgnbd_model.py
class BGNBDModel(CLVModel):
    """BG/NBD purchase-frequency model (lifetimes BetaGeoFitter)."""

    def __init__(self):
        # Fitted BetaGeoFitter of the most recent fit() call (None before that)
        self.bgf = None

    def fit(self, summary_data, t=30):
        """Fit BG/NBD and predict each customer's purchases over ``t`` periods.

        Args:
            summary_data: Frame with 'frequency', 'recency' and 'T' columns.
            t: Prediction horizon, in the same time unit as 'T'. Defaults to
                30, the horizon the previous code intended.

        Returns:
            Expected number of purchases per customer over the next ``t``
            periods.
        """
        self.bgf = BetaGeoFitter(penalizer_coef=0.0)
        self.bgf.fit(
            summary_data['frequency'],
            summary_data['recency'],
            summary_data['T']
        )

        # predict() takes the horizon FIRST: (t, frequency, recency, T).
        # The previous call passed (frequency, recency, T, 30), so every
        # argument was shifted by one position.
        predicted_purchases = self.bgf.predict(
            t,
            summary_data['frequency'],
            summary_data['recency'],
            summary_data['T']
        )

        # NOTE(review): this compares purchases predicted over `t` periods with
        # the frequency observed over each customer's whole history, so it is
        # a rough in-sample figure rather than a validation metric.
        mse = np.mean((predicted_purchases - summary_data['frequency']) ** 2)
        print(f"MSE du modèle BG/NBD: {mse}")

        return predicted_purchases

In [None]:
import numpy as np
# gamma_gamma_model.py
class GammaGammaModel(CLVModel):
    """Gamma-Gamma monetary-value model (lifetimes GammaGammaFitter)."""

    def __init__(self):
        # Fitted GammaGammaFitter of the most recent fit() call (None before that)
        self.ggf = None

    def fit(self, summary_data):
        """Fit Gamma-Gamma on repeat customers and predict average value for all.

        Args:
            summary_data: Frame with 'frequency' and 'monetary_value' columns.

        Returns:
            Expected average value per customer, one entry per row of
            ``summary_data``.
        """
        # The Gamma-Gamma fitter rejects non-positive monetary values, which is
        # what customers without repeat purchases carry in a lifetimes summary
        # frame, so it is fitted on repeat customers only.
        is_repeat = (summary_data['frequency'] > 0) & (summary_data['monetary_value'] > 0)
        repeat_customers = summary_data[is_repeat]

        self.ggf = GammaGammaFitter(penalizer_coef=0.0)
        self.ggf.fit(
            repeat_customers['frequency'],
            repeat_customers['monetary_value']
        )

        # Predictions are still produced for every customer so the returned
        # object keeps one entry per input row.
        predicted_clv = self.ggf.conditional_expected_average_profit(
            summary_data['frequency'],
            summary_data['monetary_value']
        )

        # Error is measured on the customers the model was fitted on
        mse = np.mean((predicted_clv[is_repeat] - repeat_customers['monetary_value']) ** 2)
        print(f"MSE du modèle Gamma-Gamma: {mse}")

        return predicted_clv

In [None]:
# main.py
def main():
    """Run the full analysis from the combined CSV exports.

    Steps, in order: load the three CSV files, cluster clients with K-means
    and DBSCAN (plotting each result), fit the BG/NBD model, then fit the
    Gamma-Gamma model and plot the predicted CLV distribution.
    """
    # Load the data
    client_data, transaction_data, product_data = (
        pd.read_csv(csv_path)
        for csv_path in (
            'combined_client.csv',
            'combined_transactions.csv',
            'combined_products.csv',
        )
    )

    # K-means clustering (also builds the feature matrix reused by DBSCAN)
    kmeans_model = KMeansModel()
    clustering_data = kmeans_model.prepare_clustering_data(
        client_data, transaction_data, product_data
    )
    kmeans_model.visualize_clusters(
        clustering_data, kmeans_model.cluster(clustering_data), 'kmeans'
    )

    # DBSCAN clustering on the same features
    dbscan_model = DBSCANModel()
    dbscan_model.visualize_clusters(
        clustering_data, dbscan_model.cluster(clustering_data), 'dbscan'
    )

    # CLV analysis with BG/NBD; the returned predictions are not used further
    bgnbd_model = BGNBDModel()
    clv_data = bgnbd_model.prepare_clv_data(transaction_data)
    bgnbd_model.fit(clv_data)

    # CLV analysis with Gamma-Gamma
    gamma_model = GammaGammaModel()
    gamma_model.visualize_clv_distribution(gamma_model.fit(clv_data))


if __name__ == "__main__":
    main()