In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
#from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.mixture import GaussianMixture
from umap import UMAP
import pickle
import joblib


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from scripts.data_cleaner import filter_top_cpv_categories
from scripts.preprocess_pipeline import create_pipeline_cat


In [3]:
# Notebook configuration: CPV filtering settings and input/output locations.
cpv_column = 'codeCPV_2'  # CPV column used for filtering, stratification and profiling
top_n = 40  # intended number of top CPV categories to keep
data_path = '../data/data_clean.csv'  # CSV file loaded into `df`
model_save_path = '../data'  # directory referenced by the commented-out artifact dump

In [4]:
# Load the cleaned dataset from `data_path`, decoding the file as UTF-8.
df = pd.read_csv(data_path, encoding='utf-8')

In [5]:
# Restrict the data to the top `top_n` CPV categories of `cpv_column`.
# The configured `top_n` is passed through instead of repeating the literal 40,
# so changing the configuration cell actually changes the filter.
df_cpv = filter_top_cpv_categories(df, top_n=top_n, cpv_column=cpv_column)

Filtered from 73 to 40 CPV categories, keeping 283975 rows out of 286850


In [6]:
# Inspect column dtypes and non-null counts of the filtered frame.
df_cpv.info()

<class 'pandas.core.frame.DataFrame'>
Index: 283975 entries, 0 to 286849
Data columns (total 38 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   uid                        283975 non-null  object 
 1   id                         283975 non-null  object 
 2   nature                     283975 non-null  object 
 3   acheteur_id                283975 non-null  int64  
 4   acheteur_nom               283788 non-null  object 
 5   acheteur_siren             283810 non-null  float64
 6   titulaire_id               283975 non-null  object 
 7   titulaire_typeIdentifiant  283975 non-null  object 
 8   titulaire_nom              276865 non-null  object 
 9   titulaire_siren            282918 non-null  float64
 10  objet                      283975 non-null  object 
 11  montant                    283975 non-null  float64
 12  codeCPV                    283975 non-null  object 
 13  procedure                  283975 

In [7]:
# Hold out 20% of the filtered rows, stratified on the CPV column so both splits
# keep the same category mix; the seed is fixed for reproducibility.
X_train, X_test = train_test_split(
    df_cpv,
    test_size=0.2,
    stratify=df_cpv[cpv_column],
    random_state=0,
)

In [8]:
# Build the 'marche_sim' preprocessing pipeline, fit it on the training split only,
# then apply the already-fitted pipeline to the test split.
pipeline = create_pipeline_cat('marche_sim')
X_train_preproc = pipeline.fit_transform(X_train)
X_test_preproc = pipeline.transform(X_test)



Utiliser data_cleaner

In [None]:
def create_market_similarity_model(n_clusters=20, pca_variance=0.9, random_state=0):
    """
    Create a market similarity model based on GMM clustering without saving it

    The function reads the notebook-level variables `X_train`, `X_train_preproc`,
    `pipeline` and `cpv_column`, so the split and preprocessing cells must have
    been executed before it is called.

    Parameters:
    -----------
    n_clusters : int, default=20
        Number of Gaussian mixture components (clusters)
    pca_variance : float, default=0.9
        Value passed to PCA as `n_components`; 0.9 keeps 90% of the variance
    random_state : int, default=0
        Seed given to both the UMAP and the GMM estimators

    Returns:
    --------
    dict
        Model artifacts keyed by 'preproc_pipeline', 'pca_model', 'umap_model',
        'gmm_model', 'cluster_profiles' and 'cpv_column'
    """
    # Reduce dimensionality; only the training split is needed to fit the model
    pca = PCA(n_components=pca_variance)
    X_train_pca = pca.fit_transform(X_train_preproc)

    # Fit the UMAP embedding for visualization (optional for production model).
    # Only the fitted estimator is kept, so the embedding itself is not stored.
    umap_model = UMAP(n_neighbors=15, n_components=2, min_dist=0.01, random_state=random_state)
    umap_model.fit(X_train_pca)

    # Apply Gaussian Mixture Model clustering (best from exploratory analysis)
    gmm = GaussianMixture(
        n_components=n_clusters,
        covariance_type='full',
        random_state=random_state,
        max_iter=100
    )
    gmm.fit(X_train_pca)

    # Attach the cluster label of every training row to the raw (unprocessed) data
    X_train_with_clusters = X_train.copy()
    X_train_with_clusters['cluster'] = gmm.predict(X_train_pca)

    # Generate profiles for each cluster
    profiles = create_cluster_profiles(X_train_with_clusters, cpv_column)

    # Nothing is written to disk here; persisting the artifacts is left to the caller
    return {
        'preproc_pipeline': pipeline,
        'pca_model': pca,
        'umap_model': umap_model,  # For visualization
        'gmm_model': gmm,
        'cluster_profiles': profiles,
        'cpv_column': cpv_column  # Save which CPV column was used
    }

In [10]:

def create_cluster_profiles(data_with_clusters, cpv_column='codeCPV_2', min_cluster_size=20):
    """
    Create detailed profiles for each cluster

    Parameters:
    -----------
    data_with_clusters : DataFrame
        Dataset with cluster assignments in a 'cluster' column
    cpv_column : str, default='codeCPV_2'
        CPV code column to use for analysis
    min_cluster_size : int, default=20
        Clusters with fewer rows than this are left out of the result

    Returns:
    --------
    DataFrame
        One row per retained cluster, sorted by 'size' in descending order.
        When no cluster reaches `min_cluster_size`, an empty DataFrame with the
        profile columns is returned.
    """
    profile_columns = [
        'cluster_id', 'size', 'pct_total',
        'top_cpv', 'top_cpv_pct', 'cpv_diversity', 'cpv_column',
        'mean_amount', 'median_amount', 'amount_std',
        'mean_duration', 'median_duration', 'euro_per_month',
    ]

    total_rows = len(data_with_clusters)
    has_cpv = cpv_column in data_with_clusters
    has_amount = 'montant' in data_with_clusters
    has_duration = 'dureeMois' in data_with_clusters

    profiles = []

    # groupby iterates the clusters in ascending label order, one pass over the data
    for cluster_id, cluster_data in data_with_clusters.groupby('cluster', sort=True):
        cluster_size = len(cluster_data)

        # Skip very small clusters
        if cluster_size < min_cluster_size:
            continue

        # CPV characteristics; value_counts is computed once and may be empty
        # when every CPV value in the cluster is missing
        top_cpv = top_cpv_pct = cpv_diversity = None
        if has_cpv:
            cpv_counts = cluster_data[cpv_column].value_counts()
            if len(cpv_counts) > 0:
                top_cpv = cpv_counts.index[0]
                top_cpv_pct = cpv_counts.iloc[0] / cluster_size * 100
            cpv_diversity = len(cluster_data[cpv_column].unique())

        profile = {
            'cluster_id': cluster_id,
            'size': cluster_size,
            'pct_total': cluster_size / total_rows * 100,

            # CPV characteristics using the specified CPV column
            'top_cpv': top_cpv,
            'top_cpv_pct': top_cpv_pct,
            'cpv_diversity': cpv_diversity,
            'cpv_column': cpv_column,  # Store which CPV column was used

            # Financial characteristics
            'mean_amount': cluster_data['montant'].mean() if has_amount else None,
            'median_amount': cluster_data['montant'].median() if has_amount else None,
            'amount_std': cluster_data['montant'].std() if has_amount else None,

            # Duration characteristics
            'mean_duration': cluster_data['dureeMois'].mean() if has_duration else None,
            'median_duration': cluster_data['dureeMois'].median() if has_duration else None,
        }

        # Derived metric: median amount per month; falls back to the median amount
        # when the median duration is not strictly positive
        if has_amount and has_duration:
            if profile['median_duration'] > 0:
                profile['euro_per_month'] = profile['median_amount'] / profile['median_duration']
            else:
                profile['euro_per_month'] = profile['median_amount']

        profiles.append(profile)

    # Without this guard, sorting an empty frame on 'size' raises KeyError
    if not profiles:
        return pd.DataFrame(columns=profile_columns)

    return pd.DataFrame(profiles).sort_values('size', ascending=False)


In [None]:

def find_similar_markets(new_market_data, model_artifacts, n_similar=5):
    """
    Find similar markets to a new market description

    Parameters:
    -----------
    new_market_data : dict, Series or DataFrame
        Data describing the new market. When a DataFrame with several rows is
        given, only the first row is used for the cluster assignment.
    model_artifacts : dict
        Dictionary containing model components ('preproc_pipeline', 'pca_model',
        'gmm_model', 'cluster_profiles' and optionally 'cpv_column')
    n_similar : int
        Number of similar markets to return

    Returns:
    --------
    dict
        'cluster_id': predicted cluster of the new market,
        'cluster_profile': profile of that cluster as a dict, or None when the
        cluster has no entry in the profile table,
        'cluster_probability': highest posterior probability among the clusters,
        'similar_markets': sample markets returned by get_markets_from_cluster

    Raises:
    -------
    ValueError
        If the input is empty or lacks the CPV column used in training
    """
    # Get the CPV column that was used in training
    cpv_column = model_artifacts.get('cpv_column', 'codeCPV_2')

    # 1. Convert input to DataFrame if needed
    if isinstance(new_market_data, pd.Series):
        new_market_data = new_market_data.to_dict()
    if isinstance(new_market_data, dict):
        new_market_data = pd.DataFrame([new_market_data])

    # Ensure the expected CPV column is present
    if cpv_column not in new_market_data.columns:
        raise ValueError(f"Input data must contain '{cpv_column}' column that was used in training")
    if len(new_market_data) == 0:
        raise ValueError("Input data must contain at least one market")

    # 2. Preprocess the new data, then 3. project it with the fitted PCA
    new_market_preproc = model_artifacts['preproc_pipeline'].transform(new_market_data)
    new_market_pca = model_artifacts['pca_model'].transform(new_market_preproc)

    # 4. Predict cluster
    gmm = model_artifacts['gmm_model']
    cluster_probs = gmm.predict_proba(new_market_pca)
    cluster_id = gmm.predict(new_market_pca)[0]

    # 5. Get profile for this cluster; the profile table may not contain every
    #    cluster, so a missing entry yields None instead of an IndexError
    profiles = model_artifacts['cluster_profiles']
    matching_profiles = profiles[profiles['cluster_id'] == cluster_id]
    if len(matching_profiles) > 0:
        cluster_profile = matching_profiles.iloc[0].to_dict()
    else:
        cluster_profile = None

    # 6. Return profile information and similar markets
    return {
        'cluster_id': cluster_id,
        'cluster_profile': cluster_profile,
        'cluster_probability': np.max(cluster_probs[0]),
        'similar_markets': get_markets_from_cluster(cluster_id, n_similar, cpv_column)
    }


In [None]:
def get_markets_from_cluster(cluster_id, n_samples=5, cpv_column='codeCPV_2', random_state=None):
    """
    Get sample markets from a specific cluster

    The function reads the notebook-level variables `model_artifacts`, `X_train`
    and `X_train_preproc`. The labelled training data is cached on the function
    object and recomputed whenever `model_artifacts['gmm_model']` is a different
    object than the one the cache was built with.

    Parameters:
    -----------
    cluster_id : int
        Cluster ID to retrieve markets from
    n_samples : int
        Number of sample markets to return
    cpv_column : str
        CPV column used in clustering
    random_state : int or None, default=None
        Seed forwarded to DataFrame.sample; None keeps the sampling random

    Returns:
    --------
    list of dict
        One dict per sampled market with the keys 'id', `cpv_column`, 'montant',
        'dureeMois', 'categorieAcheteur' and 'typeContrat'; an empty list when
        the cluster has no rows
    """
    gmm = model_artifacts['gmm_model']

    # Rebuild the cache when it is missing or was computed with another model,
    # otherwise labels from a previous training run would be served
    cached_model = getattr(get_markets_from_cluster, 'clustered_model', None)
    if cached_model is not gmm or not hasattr(get_markets_from_cluster, 'clustered_data'):
        # Apply same preprocessing as in model creation
        X_train_pca = model_artifacts['pca_model'].transform(X_train_preproc)

        # Store the data with cluster assignments
        clustered_data = X_train.copy()
        clustered_data['cluster'] = gmm.predict(X_train_pca)
        get_markets_from_cluster.clustered_data = clustered_data
        get_markets_from_cluster.clustered_model = gmm

    # Get data for the requested cluster
    all_clustered = get_markets_from_cluster.clustered_data
    cluster_data = all_clustered[all_clustered['cluster'] == cluster_id]
    available = len(cluster_data)

    # If cluster is empty or too small
    if available < n_samples:
        print(f"Warning: Cluster {cluster_id} has only {len(cluster_data)} samples.")
        n_samples = available

    if available == 0 or n_samples <= 0:
        return []

    # Sample n_samples from this cluster
    samples = cluster_data.sample(n_samples, random_state=random_state)

    # Format the result for better display
    return [
        {
            'id': row.get('id', 'N/A'),  # Assuming there's an ID column
            cpv_column: row[cpv_column],
            'montant': row['montant'],
            'dureeMois': row['dureeMois'],
            'categorieAcheteur': row.get('categorieAcheteur', 'N/A'),
            'typeContrat': row.get('typeContrat', 'N/A')
        }
        for _, row in samples.iterrows()
    ]

In [15]:
# Fit PCA, UMAP and the GMM on the preprocessed training split and collect the artifacts.
model_artifacts = create_market_similarity_model()

  warn(


In [18]:
# Per-cluster summary table produced by create_cluster_profiles.
cluster_profiles = model_artifacts['cluster_profiles']

In [19]:
# Display the cluster profiles (sorted by cluster size, largest first).
cluster_profiles

Unnamed: 0,cluster_id,size,pct_total,top_cpv,top_cpv_pct,cpv_diversity,cpv_column,mean_amount,median_amount,amount_std,mean_duration,median_duration,euro_per_month
13,13,33528,14.758341,45000000,96.432832,2,codeCPV_2,229251.2,105833.615,692818.9,12.431192,12.0,8819.467917
5,5,27075,11.917862,90000000,8.217913,39,codeCPV_2,777990.9,160000.0,2830488.0,38.102936,48.0,3333.333333
9,9,23789,10.471432,15000000,9.861701,39,codeCPV_2,1197815.0,252014.0,4075112.0,41.959393,48.0,5250.291667
15,15,17897,7.877894,71000000,11.342683,39,codeCPV_2,509945.1,97125.0,2370086.0,33.826507,36.0,2697.916667
17,17,15122,6.656396,45000000,96.422431,2,codeCPV_2,1215287.0,240000.0,3388323.0,20.545893,14.0,17142.857143
11,11,15100,6.646712,45000000,100.0,1,codeCPV_2,2670983.0,600000.0,5743707.0,28.252384,24.0,25000.0
12,12,14823,6.524782,71000000,84.530797,10,codeCPV_2,214389.8,90337.5,541796.9,27.571005,24.0,3764.0625
4,4,12757,5.615371,71000000,20.412323,39,codeCPV_2,659875.2,137899.39,2753037.0,35.116407,39.0,3535.881795
2,2,12045,5.301963,45000000,72.4533,30,codeCPV_2,299691.6,118000.0,970076.6,17.33956,12.0,9833.333333
7,7,8896,3.915838,45000000,100.0,1,codeCPV_2,858753.5,354749.9,2092921.0,15.483138,11.0,32249.990909


In [20]:
# Look up and print the profile row of a single cluster.
cluster_id = 0  # Or any other cluster ID
cluster_info = cluster_profiles.loc[cluster_profiles['cluster_id'].eq(cluster_id)]
print(f"Cluster {cluster_id} profile:")
print(cluster_info)

Cluster 0 profile:
   cluster_id  size  pct_total   top_cpv  top_cpv_pct  cpv_diversity  \
0           0  2525   1.111453  45000000    23.762376             30   

  cpv_column   mean_amount  median_amount    amount_std  mean_duration  \
0  codeCPV_2  1.759914e+06       400000.0  4.593366e+06      39.537426   

   median_duration  euro_per_month  
0             48.0     8333.333333  


In [25]:
# Check dtypes and non-null counts of the cluster profile table.
cluster_profiles.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20 entries, 13 to 16
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   cluster_id       20 non-null     int64  
 1   size             20 non-null     int64  
 2   pct_total        20 non-null     float64
 3   top_cpv          20 non-null     int64  
 4   top_cpv_pct      20 non-null     float64
 5   cpv_diversity    20 non-null     int64  
 6   cpv_column       20 non-null     object 
 7   mean_amount      20 non-null     float64
 8   median_amount    20 non-null     float64
 9   amount_std       20 non-null     float64
 10  mean_duration    20 non-null     float64
 11  median_duration  20 non-null     float64
 12  euro_per_month   20 non-null     float64
dtypes: float64(8), int64(4), object(1)
memory usage: 2.2+ KB


In [35]:
# Get the first observation from your test dataset
test_market = X_test.iloc[3].to_dict()

# You can modify specific values if needed
#test_market['montant'] = 250000  # Customize the amount
#test_market['dureeMois'] = 12    # Customize the duration

# Test the model with this observation
similar = find_similar_markets(test_market, model_artifacts)
print(f"\nTest Market - CPV: {test_market['codeCPV_2']}")
print(f"Assigned to cluster: {similar['cluster_id']}")
print(f"Probability: {similar['cluster_probability']:.2f}")
print(f"Most similar contracts are in category: {similar['cluster_profile']['top_cpv']}")


Test Market - CPV: 32000000
Assigned to cluster: 15
Probability: 1.00
Most similar contracts are in category: 71000000




In [36]:
# Show the raw test row (positional row 3) that was used as the query market.
X_test.iloc[3]

uid                                                200073427002012025213202503
id                                                               2025213202503
nature                                                                  Marché
acheteur_id                                                     20007342700201
acheteur_nom                 COMMUNAUTE DE COMMUNES MARCHES DU VELAY-ROCHEB...
acheteur_siren                                                     200073427.0
titulaire_id                                                    33652023400012
titulaire_typeIdentifiant                                                SIRET
titulaire_nom                                                    BOUCHARDON TP
titulaire_siren                                                    336520234.0
objet                                                    OUEST BOIS DE LA FAYE
montant                                                                88965.0
codeCPV                                             