# Installation

In [47]:
!pip install nltk
!pip install pandas
!pip install matplotlib
!pip install numpy
!pip install wordcloud
!pip install scikit-learn



# Importations

In [48]:
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud
from nltk import pos_tag
import pickle
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity


In [49]:
nltk.download("all")

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /home/overwatch2009/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /home/overwatch2009/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /home/overwatch2009/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /home/overwatch2009/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /home/overwatch2009/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru

True

# Functions

In [80]:
def reader(filepath):
    df = pd.read_csv(filepath, encoding = 'utf8')    
    return df    

def tokenizer(df):
    try:
        tokenized_book = df
        tokenized_book["summary"] = df["summary"].map(lambda x: word_tokenize(x.lower() if isinstance(x, str) else str(x) ))
    except Exception as e:
        print("An error occurs when tokenizing")
    return tokenized_book

def stopwords_remover(df):    
    # with open("../utils/stopwords.txt") as file:
    #     custom_stopwords = file.read().split(",")
    try:   
        stop_words = set(stopwords.words('french'))    
        without_stopwords = df
        without_stopwords["summary"] = df["summary"].map(lambda x: [word for word in x if (word not in stop_words)]) 
    except Exception as e:
        print("An error occurs when removing stopwords")
    return without_stopwords

def punctuation_remover(df):
    try:
        punctuation = string.punctuation + "``" + "''" + "--" + "_" + "(" + ")" + '""' + "|" + "“" + "”" + "’" + "‘" + "___"
        without_punc = df
        without_punc["summary"] = df["summary"].map(lambda x: [word for word in x if word not in punctuation])
    except Exception as e:
        print("An error occurs when removing punctuations")
    return without_punc

def pos_tagger(df):
    try:
        pos_tagged = df
        pos_tagged["summary"] = df["summary"].map(lambda x: [tagged for tagged in pos_tag(x,tagset='universal') if tagged[1] not in ["NUM"] ])
    except Exception as e:
        print("An error occurs when tagging")
    return pos_tagged

def lemmatizer(df):
    try:
        lem = WordNetLemmatizer()
        lemmans = df
        lemmans["summary"] = df["summary"].map(lambda row: [ lem.lemmatize(word[0], pos = get_pos_tag(word[1])) for word in row ])
    except Exception as e:
        print("An error occurs when lemmatizing")
    return lemmans    

def get_pos_tag(pos):    
    match pos:
        case "NOUN":
            result = "n"
        case "VERB":
            result = "v"
        case "ADJ":
            result = "a"
        case "ADV":
            result = "r"
        case _:
            result = "s"
    return result

def search_processing(search, _vector, _lsa):
    try:
        search_df = pd.DataFrame(data=[search], columns=["summary"])
        tokenized_book = tokenizer(search_df)
        without_stopwords = stopwords_remover(tokenized_book)
        without_punctuation = punctuation_remover(without_stopwords)
        tagged_words = pos_tagger(without_punctuation)
        lemmatized_words = lemmatizer(tagged_words)

        vec = _vector.transform(lemmatized_words)
        return vec
        # return _lsa.transform(vec)

    except Exception as e:
        print(f"An error occurs when processing the search:{e}")
    
def best_recommended_pme(search, _vector, _lsa, topic_df, nb_recommandations):
    try:
        
        # Récupération du vecteur du livre cible
        search_vector = search_processing(search, _vector, _lsa)

        # Calcul des similarités cosinus
        similarities = topic_df.apply(
            lambda x: cosine_similarity(search_vector, x.to_numpy().reshape(1, -1))[0][0], axis=1
        )

        # Conversion en DataFrame
        recommandations = similarities.to_frame(name="Similarity")

        # Tri décroissant des similarités
        recommandations.sort_values(by="Similarity", ascending=False, inplace=True)

        # Sélection des meilleurs résultats
        result = recommandations.head(min(nb_recommandations, len(recommandations)))

        if result.empty:
            print("Aucune recommandation disponible.")
            return []

        print(f"Recommandations générées avec succès pour la recherche '{search}'.\n")
        return list(result.itertuples(index=True, name=None))
    
    except Exception as e:
        print(f"Une erreur est survenue lors de la génération des recommandations : {e}")
        return []


def words_frequency(df):    
    all_words = []    
    df["summary"].map(lambda row: all_words.extend(row))
    df["summary"] = pd.DataFrame({"words": all_words}).value_counts().reset_index()
    return df

def word_cloud(words_df, title):
    words_dict = dict(zip(words_df["words"], words_df["count"]))
    wc = WordCloud(width=800, height=400, background_color="white", max_words=100).generate_from_frequencies(words_dict)    
    plt.figure(figsize=(10,6))
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.title(title)
    plt.savefig(title + ".png")
    plt.show()

def tf_idf_calculator(category, content):    
    try:
        vectorizer = TfidfVectorizer(strip_accents = "ascii", max_df = 0.6)
        m = vectorizer.fit_transform(content).transpose()
                
    except Exception as e:
       print("An error occurs when calculating terms frequency")
    return  (pd.DataFrame( data = m.toarray(), index = vectorizer.vocabulary_, columns = category), vectorizer)

In [51]:
try:
    dataset = reader("../data/processed/processed_data.csv")
except Exception as e:
    print(f"Une erreur est survenue lors de la lecture du dataset:\n{e}")

dataset

Unnamed: 0.1,Unnamed: 0,name,category,address,tel,description,summary
0,1,GO AFRICA ONLINE - GAO,,"Lot 938 - 939, non loin de l'Etoile Rouge 04 B...",Tel:+2290169690406,Création de sites internet pour les profession...,Création de sites internet pour les profession...
1,2,JAWUNTAA,,"Rue avant l'hôtel Bénin Royal, immeuble Golden...",+2290194710000,Création; Production audiovisuelle; Média; Mar...,Création; Production audiovisuelle; Média; Mar...
2,3,JK COMMUNICATION,,"Bénin, Cotonou, Zogbo von Tornade ; deuxième r...",+2290153714272,JK COMMUNICATION est spécialisé dans l’impress...,JK COMMUNICATION est spécialisé dans l’impress...
3,4,229 AdsSquare,Agences de communication,BP 1541 Atlantique Department Abomey Calavi - ...,+2290196117373,,Agences de communication
4,5,2BME SERVICE,Agences de communication,2323 Atlantique Department Ouidah - Bénin,+2290160090917,,Agences de communication
...,...,...,...,...,...,...,...
8469,8517,SOLUTIONS DRIVE VTC,Transports VTC - Taxis,Cotonou Cotonou - 229 - Bénin,+2290194613777,Chauffeur Privé Fiable - Service de réservatio...,Transports VTC - TaxisChauffeur Privé Fiable -...
8470,8518,THIERRY BENIN TAXI,Transports VTC - Taxis,- Cotonou - Bénin,+2290167814858,Nous mettons à votre disposition notre véhicul...,Transports VTC - TaxisNous mettons à votre dis...
8471,8519,AMBULANCES DES AMAZONES,Ambulances,- Saint Michel Cotonou - Bénin,+2290141970704,,Ambulances
8472,8520,SERVICE D'AIDE MEDICALE D'URGENCE DU BENIN - S...,Ambulances,Ex-HIA Cotonou - 04 BP 1298 COTONOU - Bénin,+2290168300202,Etablissement intervenant dans le pré-hospital...,AmbulancesEtablissement intervenant dans le pr...


# Word Cloud

In [52]:
# books_words_list = []
try:
    tokenized_book = tokenizer(dataset)
    without_stopwords = stopwords_remover(tokenized_book)
    without_punctuation = punctuation_remover(without_stopwords)
    tagged_words = pos_tagger(without_punctuation)
    lemmatized_words = lemmatizer(tagged_words)
    
except Exception as e:
    print(f"Une erreur est survenue lors du traitement du corpus : {e}")


# Topic Modeling

In [53]:
try:
    # Calcul du TF-IDF
    text = [" ".join(str(elm) for elm in desc) for desc in dataset["summary"].tolist()]
    categories = dataset["name"].to_list()
    document_term_matrix, vector = tf_idf_calculator(categories, text)
    print("TF-IDF calculé avec succès")

except Exception as e:
    print(f"Échec du calcul TF-IDF : {e}")

TF-IDF calculé avec succès


In [54]:
document_term_matrix

Unnamed: 0,GO AFRICA ONLINE - GAO,JAWUNTAA,JK COMMUNICATION,229 AdsSquare,2BME SERVICE,ADHERE Digital,ADJINANKOU GROUP - Agence de Communication à Cotonou,AFRIKAFUN,AFRIQIYA GROUP,AGENCE LES JUMEAUX,...,GROUPE EYON,OCTAVIC-BENIN,OLINEBENIN,SOFT TAXIS,SOLA DRIVE COMPANY,SOLUTIONS DRIVE VTC,THIERRY BENIN TAXI,AMBULANCES DES AMAZONES,SERVICE D'AIDE MEDICALE D'URGENCE DU BENIN - SAMU BENIN,POLICE REPUBLICAINE
creation,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
site,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
internet,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
professionnels,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
africains,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
avises,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
declenchement,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
relative,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
informatise,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# LSA

In [55]:

try:
    # Vérification de la matrice TF-IDF
    if document_term_matrix is None:
        raise ValueError("La matrice TF-IDF est vide ou non définie.")
    if not hasattr(document_term_matrix, "shape"):
        raise TypeError("document_term_matrix n'est pas une matrice valide.")
    if document_term_matrix.shape[0] == 0 or document_term_matrix.shape[1] == 0:
        raise ValueError(f"Matrice TF-IDF vide : {document_term_matrix.shape}")

    # Application du modèle LSA
    nb_cat = len(dataset["category"].value_counts())
    lsa = TruncatedSVD(n_components=nb_cat)
    document_vectors = lsa.fit_transform(document_term_matrix.transpose())

    # Vérification des tailles
    
    # Création du DataFrame
    topic_columns = [f"Topic {i+1}" for i in range(nb_cat)]
    topic_df = pd.DataFrame(
        document_vectors,
        columns=topic_columns,
        index=dataset["name"].to_list()
    )

    print(f"Réduction LSA effectuée avec succès.")
    display(topic_df)

except Exception as e:
    print(f"Impossible d’effectuer la réduction LSA : {e}")

Réduction LSA effectuée avec succès.


Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,Topic 10,...,Topic 478,Topic 479,Topic 480,Topic 481,Topic 482,Topic 483,Topic 484,Topic 485,Topic 486,Topic 487
GO AFRICA ONLINE - GAO,1.411474e-02,2.388549e-02,8.703230e-04,2.131065e-02,7.508942e-03,1.319377e-03,8.367960e-03,2.314446e-03,2.006409e-02,-2.382562e-03,...,0.015225,-0.024237,0.001782,-0.041367,0.031307,0.022510,0.011764,-0.021601,0.007331,0.037518
JAWUNTAA,1.932045e-02,7.649345e-02,2.649363e-02,9.169401e-02,3.955903e-02,1.444336e-03,1.450203e-02,3.521223e-03,4.893770e-02,-1.361145e-02,...,0.004960,-0.010098,-0.047287,-0.018033,0.030064,-0.038227,0.045433,-0.015848,0.005798,0.011731
JK COMMUNICATION,1.659502e-02,3.221667e-02,1.846314e-02,-2.215699e-03,6.442853e-04,3.755663e-03,1.712572e-02,5.774856e-03,2.928747e-02,1.907503e-02,...,-0.011591,0.008284,0.010196,-0.012363,0.082537,0.033535,-0.015658,-0.019605,0.045848,0.019303
229 AdsSquare,1.342433e-02,1.603019e-01,7.037879e-01,-1.477010e-02,3.201594e-03,-8.451168e-03,-3.354396e-02,-1.168100e-02,-1.335074e-02,-1.341146e-02,...,0.001013,0.004224,-0.003416,0.000453,0.000071,-0.001991,0.001398,0.000491,-0.003438,-0.001463
2BME SERVICE,1.342433e-02,1.603019e-01,7.037879e-01,-1.477010e-02,3.201594e-03,-8.451168e-03,-3.354396e-02,-1.168100e-02,-1.335074e-02,-1.341146e-02,...,0.001013,0.004224,-0.003416,0.000453,0.000071,-0.001991,0.001398,0.000491,-0.003438,-0.001463
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SOLUTIONS DRIVE VTC,9.195162e-03,2.789911e-02,1.260094e-03,-8.727688e-03,-3.209487e-03,6.995927e-03,9.377572e-02,-4.528213e-02,4.747088e-03,4.533682e-03,...,0.059693,-0.014140,0.010406,-0.019980,-0.007331,-0.003782,0.022299,0.001423,-0.019534,0.016497
THIERRY BENIN TAXI,8.448542e-03,5.489918e-02,-5.329188e-03,-1.950527e-02,-3.761070e-02,9.174938e-03,1.308505e-01,-6.934980e-02,2.154695e-02,-2.651148e-02,...,-0.009578,-0.012949,0.011276,0.005039,-0.004266,0.010128,0.037970,-0.027633,0.033686,0.036886
AMBULANCES DES AMAZONES,2.170064e-12,3.651136e-12,-3.368628e-12,-2.994819e-11,1.182620e-11,9.689750e-11,-6.919266e-11,-4.108257e-11,1.464580e-10,-1.395920e-11,...,0.000855,-0.000679,0.001642,0.000145,0.000133,0.000189,-0.000683,0.000195,-0.000545,0.000144
SERVICE D'AIDE MEDICALE D'URGENCE DU BENIN - SAMU BENIN,1.401778e-02,6.837444e-02,-1.040394e-02,6.701282e-02,-1.630825e-03,3.209599e-04,1.284052e-02,1.200216e-02,3.613893e-02,-4.133316e-02,...,-0.007178,0.018396,-0.016508,-0.040568,0.047354,0.044268,0.050063,-0.057093,-0.008377,0.044730


# Models exportation

In [56]:
# lsa model export
with open('../pickles/lsa_model.pkl', 'wb') as lsa_model:
    pickle.dump(lsa, lsa_model)                    


# vector export
with open('../pickles/vector.pkl', 'wb') as vector_pkl:    
    pickle.dump(vector, vector_pkl)                    



In [57]:
try:
    # Vérification des objets nécessaires
    if 'lsa' not in locals() or lsa is None:
        raise ValueError("L'objet LSA (TruncatedSVD) n'existe pas ou n'a pas été entraîné.")
    if 'vector' not in locals() or vector is None:
        raise ValueError("Le vecteur TF-IDF (vectorizer) n'est pas défini.")
    if not hasattr(lsa, "components_"):
        raise AttributeError("L'objet LSA ne contient pas d'attribut 'components_'. Avez-vous bien appelé fit_transform() ?")

    # Extraction du vocabulaire
    vocab = vector.get_feature_names_out()
    if len(vocab) == 0:
        raise ValueError("Le vocabulaire TF-IDF est vide. Vérifiez vos données d'entrée.")

    # Affichage des mots les plus représentatifs par thème
    print("=== Mots les plus représentatifs par thème (LSA) ===\n")

    for i, comp in enumerate(lsa.components_):
        vocab_comp = list(zip(vocab, comp))

        if not vocab_comp:
            print(f"Aucun mot trouvé pour le Topic {i+1}.")
            continue

        # Tri des mots les plus importants
        sorted_words = sorted(vocab_comp, key=lambda x: x[1], reverse=True)[:10]

        print(f"Topic {i+1} :")
        print(" ".join([word for word, _ in sorted_words]))
        print()  # saut de ligne

    print("Interprétation des topics terminée avec succès.")

except Exception as e:
    print(f"Une erreur est survenue lors de l'interprétation des topics : {e}")

=== Mots les plus représentatifs par thème (LSA) ===

Topic 1 :
informatique formation securite maintenance materiel ingenierie centre bureautique materiels equipements

Topic 2 :
produits materiels centre equipements medicaux agences agricoles location cosmetiques reception

Topic 3 :
agences immobilieres voyage communication recrutement developpement maritimes immigration douane appui

Topic 4 :
centre medicaux commerciaux sociaux formation culturels cabinet recherche loisirs coworking

Topic 5 :
produits centre cosmetiques pharmaceutiques congeles surgeles commerciaux phytosanitaires dietetiques formation

Topic 6 :
ecoles secondaires auto primaires technique transport bureau fournitures enseignement general

Topic 7 :
transport bureau routier maritime fournitures shipping change mobilier logistiques conseils

Topic 8 :
bureau fournitures change mobilier consultant etudes conseils scolaires vente construction

Topic 9 :
construction vente materiaux centre voitures location fabricati

In [58]:
try:
    # Vérification de la DataFrame
    if 'topic_df' not in locals() or topic_df is None:
        raise ValueError("Le DataFrame 'topic_df' n'est pas défini.")
    if topic_df.empty:
        raise ValueError("Le DataFrame 'topic_df' est vide. Vérifiez vos données.")
    
    # Vérification des types numériques
    if not all(pd.api.types.is_numeric_dtype(topic_df[col]) for col in topic_df.columns):
        raise TypeError("Certaines colonnes de 'topic_df' ne sont pas numériques.")

    # Conversion en valeurs absolues (pour éliminer les valeurs négatives)
    to_positive = topic_df.applymap(lambda num: abs(num) if pd.notnull(num) else 0)

    # Identifier le topic dominant pour chaque livre
    dominant_topics = to_positive.idxmax(axis=1).to_frame().reset_index()

    # Regrouper les livres par topic
    books_by_topic = dominant_topics.groupby(0)["index"].apply(list).to_dict()

    # Vérification du résultat
    if not books_by_topic:
        print("Aucun regroupement n’a été trouvé.")
    else:
        print("=== Regroupement des livres par thème dominant ===\n")
        for key, value in books_by_topic.items():
            print(f"{key}:")
            print(", ".join(value))
            print()

    print("Regroupement terminé avec succès.")

except Exception as e:
    print(f"Une erreur est survenue lors du regroupement par topics : {e}")

  to_positive = topic_df.applymap(lambda num: abs(num) if pd.notnull(num) else 0)


=== Regroupement des livres par thème dominant ===

Topic 1:
CFPJ - MAN, A DIEU LE DERNIER MOT, ATARA SERVICE, AIRTECH-BENIN, FRED SERVICES, ABS TECHNOLOGIE GROUP, DGA STORE, PAPETERIE MODERNE CALAVI, AKONTA, TEC SARL, ALLMIGHTY EXPERTISES, ADJINOS CENTER, CIBLE INFORMATIQUE, 3J IT SERVICES, ABP TECHNOLOGIE, BENSOFT SERVICE, CHABO, DIGI SPHERE TECH GROUP, IFE AFRICA, M-TECHNOLOGIES, LA PASSION DE LA TECHNOLOGIE, 3D TELECOM BUSINESS CENTER, A.D. INFORMATIQUE SERVICE ET FILS, ABP SERVICE, ADEWALE CENTER, ADIS COMMUNICATION, AGEMA BENIN, AGRO GROUP DEVELOPPEMENT, ALL IN ONE BENIN, ARTHUR INFORMATIQUE, BEAUNACH BUSINESS CENTER CONSULTING, BELANCE BENIN SARL, BON SAMARITAIN FORMATION, CENTRE DE HAUTE TECHNOLOGIE AWOLOU ET FILS, CENTRE INFORMATIQUE LE LEADER, CENTRE MULTIMEDIA SANDY, CFAO Infrastructure, CHALLENGE CONSULTING, CHEZ LE MAINTENANCIER, CHRIST ROI MULTIMEDIA DECOR, CTRFII BENIN, DGM INFORMATIQUE, DIAMOND ELECTRONIC, ETS AODS ET FILS, ETS BENIE INFO CENTER, SYSCOM AFRIQUE, OPEN SO

# Document similarity

In [81]:
best_recommended_pme("Je recherche une agence de communication digitale", vector, lsa, topic_df, 3)

Une erreur est survenue lors de la génération des recommandations : Incompatible dimension for X and Y matrices: X.shape[1] == 10696 while Y.shape[1] == 487


[]

In [86]:
search_df = pd.DataFrame(data=["Je recherche une agence de communication digitale"], columns=["summary"])
tokenized_book = tokenizer(search_df)
without_stopwords = stopwords_remover(tokenized_book)
without_punctuation = punctuation_remover(without_stopwords)
tagged_words = pos_tagger(without_punctuation)
lemmatized_words = lemmatizer(tagged_words)
vec = vector.transform(lemmatized_words)
vec
lsa.transform(vec)
vec



<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 0 stored elements and shape (1, 10696)>