# Installation

In [1]:
!pip install nltk
!pip install pandas
!pip install matplotlib
!pip install numpy
!pip install wordcloud
!pip install scikit-learn



# Importations

In [2]:
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud
from nltk import pos_tag
import pickle
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity


In [3]:
# nltk.download("all")

In [4]:
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('universal_tagset')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/orlando/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/orlando/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/orlando/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package wordnet to /home/orlando/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Functions

In [1]:
def reader(filepath):
    df = pd.read_csv(filepath, encoding = 'utf8')    
    return df    

def tokenizer(df):
    try:
        tokenized_book = df
        tokenized_book["summary"] = df["summary"].map(lambda x: word_tokenize(x.lower() if isinstance(x, str) else str(x) ))
    except Exception as e:
        print(f"An error occurs when tokenizing: {e}")
    return tokenized_book

def stopwords_remover(df):    
    # with open("../utils/stopwords.txt") as file:
    #     custom_stopwords = file.read().split(",")
    try:   
        stop_words = set(stopwords.words('french'))    
        without_stopwords = df
        without_stopwords["summary"] = df["summary"].map(lambda x: [word for word in x if (word not in stop_words)]) 
    except Exception as e:
        print(f"An error occurs when removing stopwords: {e}")
    return without_stopwords

def punctuation_remover(df):
    try:
        punctuation = string.punctuation + "``" + "''" + "--" + "_" + "(" + ")" + '""' + "|" + "“" + "”" + "’" + "‘" + "___"
        without_punc = df
        without_punc["summary"] = df["summary"].map(lambda x: [word for word in x if word not in punctuation])
    except Exception as e:
        print(f"An error occurs when removing punctuations: {e}")
    return without_punc

def pos_tagger(df):
    try:
        pos_tagged = df
        pos_tagged["summary"] = df["summary"].map(lambda x: [tagged for tagged in pos_tag(x,tagset='universal') if tagged[1] not in ["NUM"] ])
    except Exception as e:
        print(f"An error occurs when tagging: {e}")
    return pos_tagged

def lemmatizer(df):
    try:
        lem = WordNetLemmatizer()
        lemmans = df
        lemmans["summary"] = df["summary"].map(lambda row: [ lem.lemmatize(word[0], pos = get_pos_tag(word[1])) for word in row ])
    except Exception as e:
        print(f"An error occurs when lemmatizing: {e}")
    return lemmans    

def get_pos_tag(pos):    
    match pos:
        case "NOUN":
            result = "n"
        case "VERB":
            result = "v"
        case "ADJ":
            result = "a"
        case "ADV":
            result = "r"
        case _:
            result = "s"
    return result

def search_processing(search, _vector, _lsa):
    try:
        search_df = pd.DataFrame(data=[search], columns=["summary"])
        tokenized_book = tokenizer(search_df)
        without_stopwords = stopwords_remover(tokenized_book)
        without_punctuation = punctuation_remover(without_stopwords)
        tagged_words = pos_tagger(without_punctuation)
        lemmatized_words = lemmatizer(tagged_words)
        lemmatized_words = lemmatized_words.summary.tolist()
        search = [" ".join(str(elm) for elm in row) for row in lemmatized_words]        
        vec = _vector.transform(search)
        # return vec
        return _lsa.transform(vec)

    except Exception as e:
        print(f"An error occurs when processing the search:{e}")
    
def best_recommended_pme(search, _vector, _lsa, topic_df, nb_recommandations):
    try:
        
        # Récupération du vecteur du livre cible
        search_vector = search_processing(search, _vector, _lsa).reshape(1, -1)

        # Calcul des similarités cosinus
        similarities = topic_df.apply(
            lambda x: cosine_similarity(search_vector, x.to_numpy().reshape(1, -1))[0][0], axis=1
        )

        # Conversion en DataFrame
        recommandations = similarities.to_frame(name="Similarity")

        # Tri décroissant des similarités
        recommandations.sort_values(by="Similarity", ascending=False, inplace=True)

        # Sélection des meilleurs résultats
        result = recommandations.head(min(nb_recommandations, len(recommandations)))

        if result.empty:
            print("Aucune recommandation disponible.")
            return []

        print(f"Recommandations générées avec succès pour la recherche '{search}'.\n")
        formatted_result = []
        for recom in result.itertuples(index=True, name=None):
            formatted_result.append({
                "name" : recom[0],
                "score" : recom[1]
            })            
        return formatted_result
    
    except Exception as e:
        print(f"Une erreur est survenue lors de la génération des recommandations : {e}")
        return []


def words_frequency(df):    
    all_words = []    
    df["summary"].map(lambda row: all_words.extend(row))
    df["summary"] = pd.DataFrame({"words": all_words}).value_counts().reset_index()
    return df

def word_cloud(words_df, title):
    words_dict = dict(zip(words_df["words"], words_df["count"]))
    wc = WordCloud(width=800, height=400, background_color="white", max_words=100).generate_from_frequencies(words_dict)    
    plt.figure(figsize=(10,6))
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.title(title)
    plt.savefig(title + ".png")
    plt.show()

def tf_idf_calculator(category, content):    
    try:
        vectorizer = TfidfVectorizer(strip_accents = "ascii", max_df = 0.6)
        m = vectorizer.fit_transform(content).transpose()
                
    except Exception as e:
       print("An error occurs when calculating terms frequency")
    return  (pd.DataFrame( data = m.toarray(), index = vectorizer.vocabulary_, columns = category), vectorizer)

In [None]:
try:
    dataset = reader("../data/processed/processed_data.csv")
except Exception as e:
    print(f"Une erreur est survenue lors de la lecture du dataset:\n{e}")

dataset

# Word Cloud

In [7]:
# books_words_list = []
try:
    tokenized_book = tokenizer(dataset)
    without_stopwords = stopwords_remover(tokenized_book)
    without_punctuation = punctuation_remover(without_stopwords)
    tagged_words = pos_tagger(without_punctuation)
    lemmatized_words = lemmatizer(tagged_words)
    
except Exception as e:
    print(f"Une erreur est survenue lors du traitement du corpus : {e}")


# Topic Modeling

In [8]:
try:
    # Calcul du TF-IDF
    text = [" ".join(str(elm) for elm in desc) for desc in dataset["summary"].tolist()]
    categories = dataset["name"].to_list()
    document_term_matrix, vector = tf_idf_calculator(categories, text)
    print("TF-IDF calculé avec succès")

except Exception as e:
    print(f"Échec du calcul TF-IDF : {e}")

TF-IDF calculé avec succès


In [9]:
document_term_matrix

Unnamed: 0,229 AdsSquare,2BME SERVICE,ADHERE Digital,ADJINANKOU GROUP - Agence de Communication à Cotonou,AFRIKAFUN,AFRIQIYA GROUP,AGENCE LES JUMEAUX,ALL PRO MEDIA,ASCIEL PHARMA SARL BENIN,ATALHOS COMMUNICATION,...,GROUPE EYON,OCTAVIC-BENIN,OLINEBENIN,SOFT TAXIS,SOLA DRIVE COMPANY,SOLUTIONS DRIVE VTC,THIERRY BENIN TAXI,AMBULANCES DES AMAZONES,SERVICE D'AIDE MEDICALE D'URGENCE DU BENIN - SAMU BENIN,POLICE REPUBLICAINE
agences,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
communication,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
communicationadjinankou,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
group,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
agence,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
avises,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
declenchement,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
relative,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
informatise,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# LSA

In [10]:

try:
    # Vérification de la matrice TF-IDF
    if document_term_matrix is None:
        raise ValueError("La matrice TF-IDF est vide ou non définie.")
    if not hasattr(document_term_matrix, "shape"):
        raise TypeError("document_term_matrix n'est pas une matrice valide.")
    if document_term_matrix.shape[0] == 0 or document_term_matrix.shape[1] == 0:
        raise ValueError(f"Matrice TF-IDF vide : {document_term_matrix.shape}")

    # Application du modèle LSA
    nb_cat = len(dataset["category"].value_counts())
    lsa = TruncatedSVD(n_components=nb_cat)
    document_vectors = lsa.fit_transform(document_term_matrix.transpose())

    # Vérification des tailles
    
    # Création du DataFrame
    topic_columns = [f"Topic {i+1}" for i in range(nb_cat)]
    topic_df = pd.DataFrame(
        document_vectors,
        columns=topic_columns,
        index=dataset["name"].to_list()
    )

    print(f"Réduction LSA effectuée avec succès.")
    display(topic_df)

except Exception as e:
    print(f"Impossible d’effectuer la réduction LSA : {e}")

Réduction LSA effectuée avec succès.


Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,Topic 10,...,Topic 478,Topic 479,Topic 480,Topic 481,Topic 482,Topic 483,Topic 484,Topic 485,Topic 486,Topic 487
229 AdsSquare,1.222631e-02,1.475483e-01,7.017221e-01,-3.246876e-02,-6.587832e-03,-5.866607e-03,-2.857484e-02,-7.893311e-03,4.972496e-03,-7.318007e-03,...,-0.002965,0.004119,0.001183,-0.005834,0.000141,-0.002884,0.000046,0.003484,0.000697,0.002258
2BME SERVICE,1.222631e-02,1.475483e-01,7.017221e-01,-3.246876e-02,-6.587832e-03,-5.866607e-03,-2.857484e-02,-7.893311e-03,4.972496e-03,-7.318007e-03,...,-0.002965,0.004119,0.001183,-0.005834,0.000141,-0.002884,0.000046,0.003484,0.000697,0.002258
ADHERE Digital,1.222631e-02,1.475483e-01,7.017221e-01,-3.246876e-02,-6.587832e-03,-5.866607e-03,-2.857484e-02,-7.893311e-03,4.972496e-03,-7.318007e-03,...,-0.002965,0.004119,0.001183,-0.005834,0.000141,-0.002884,0.000046,0.003484,0.000697,0.002258
ADJINANKOU GROUP - Agence de Communication à Cotonou,1.033776e-02,5.989302e-02,1.843701e-01,-1.005176e-02,-2.229495e-03,1.067478e-03,7.159782e-03,-3.273365e-03,-4.252921e-03,1.172433e-02,...,-0.001758,-0.009653,0.053851,-0.011756,-0.030948,-0.064707,0.011489,-0.038974,-0.001331,0.040977
AFRIKAFUN,7.619854e-03,4.308208e-02,1.548716e-01,-8.772798e-03,-1.831525e-03,-6.982913e-05,1.349332e-03,-2.004447e-03,-3.987017e-03,4.400751e-03,...,0.020847,-0.004232,0.026449,0.028202,0.010565,-0.004847,-0.004763,-0.046123,-0.047597,0.012904
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SOLUTIONS DRIVE VTC,7.546941e-03,2.123134e-02,1.797802e-03,-5.833037e-03,4.793770e-03,3.584035e-03,8.592281e-02,-5.567207e-02,-1.846085e-03,1.100887e-03,...,0.047706,0.012995,-0.008155,0.005181,0.018191,0.006243,0.028710,0.007740,-0.056846,0.035424
THIERRY BENIN TAXI,6.869463e-03,4.790092e-02,-4.113309e-03,-1.065498e-02,4.509601e-02,4.870686e-03,1.203341e-01,-8.300469e-02,-3.302572e-02,-8.173868e-03,...,0.021033,0.024859,-0.016912,-0.021193,0.001774,0.001498,-0.003182,0.035933,-0.018191,0.003350
AMBULANCES DES AMAZONES,2.185827e-12,3.677650e-11,9.760651e-12,1.162395e-11,5.514259e-11,-4.928468e-11,-5.187088e-11,-3.729943e-11,6.675717e-11,-1.804086e-11,...,-0.002300,0.001489,-0.000780,-0.000448,0.000918,-0.000955,-0.000654,0.001842,0.000185,-0.000737
SERVICE D'AIDE MEDICALE D'URGENCE DU BENIN - SAMU BENIN,1.247998e-02,6.341939e-02,-6.631161e-03,7.161509e-02,8.337905e-04,-2.465246e-04,1.343480e-02,1.065534e-02,-5.149557e-02,-1.398059e-02,...,-0.013150,0.031879,0.048785,-0.003008,-0.026805,0.011734,0.012768,-0.017756,0.089113,-0.065202


# Models exportation

In [None]:
# # lsa model export
# with open('../pickles/lsa_model.pkl', 'wb') as lsa_model:
#     pickle.dump(lsa, lsa_model)                    


# # vector export
# with open('../pickles/vector.pkl', 'wb') as vector_pkl:    
#     pickle.dump(vector, vector_pkl)                    


# # topic_df export
# with open('../pickles/topic_df.pkl', 'wb') as topic_df_pkl:    
#     pickle.dump(topic_df, topic_df_pkl)  


In [12]:
try:
    # Vérification des objets nécessaires
    if 'lsa' not in locals() or lsa is None:
        raise ValueError("L'objet LSA (TruncatedSVD) n'existe pas ou n'a pas été entraîné.")
    if 'vector' not in locals() or vector is None:
        raise ValueError("Le vecteur TF-IDF (vectorizer) n'est pas défini.")
    if not hasattr(lsa, "components_"):
        raise AttributeError("L'objet LSA ne contient pas d'attribut 'components_'. Avez-vous bien appelé fit_transform() ?")

    # Extraction du vocabulaire
    vocab = vector.get_feature_names_out()
    if len(vocab) == 0:
        raise ValueError("Le vocabulaire TF-IDF est vide. Vérifiez vos données d'entrée.")

    # Affichage des mots les plus représentatifs par thème
    print("=== Mots les plus représentatifs par thème (LSA) ===\n")

    for i, comp in enumerate(lsa.components_):
        vocab_comp = list(zip(vocab, comp))

        if not vocab_comp:
            print(f"Aucun mot trouvé pour le Topic {i+1}.")
            continue

        # Tri des mots les plus importants
        sorted_words = sorted(vocab_comp, key=lambda x: x[1], reverse=True)[:10]

        print(f"Topic {i+1} :")
        print(" ".join([word for word, _ in sorted_words]))
        print()  # saut de ligne

    print("Interprétation des topics terminée avec succès.")

except Exception as e:
    print(f"Une erreur est survenue lors de l'interprétation des topics : {e}")

=== Mots les plus représentatifs par thème (LSA) ===

Topic 1 :
informatique formation securite maintenance materiel ingenierie centre bureautique materiels equipements

Topic 2 :
produits materiels centre equipements medicaux agricoles agences cosmetiques pharmaceutiques location

Topic 3 :
agences immobilieres voyage communication recrutement developpement maritimes immigration douane appui

Topic 4 :
centre medicaux commerciaux sociaux cabinet culturels formation recherche loisirs equipements

Topic 5 :
materiels equipements location reception btp electriques industriels securite voitures salles

Topic 6 :
ecoles secondaires auto primaires technique transport bureau enseignement primaire general

Topic 7 :
transport bureau fournitures routier maritime change mobilier shipping logistiques conseils

Topic 8 :
bureau fournitures change mobilier consultant conseils etudes scolaires vente construction

Topic 9 :
cabinet medicaux equipements avocats dentistes dentaires securite sociaux bu

In [13]:
try:
    # Vérification de la DataFrame
    if 'topic_df' not in locals() or topic_df is None:
        raise ValueError("Le DataFrame 'topic_df' n'est pas défini.")
    if topic_df.empty:
        raise ValueError("Le DataFrame 'topic_df' est vide. Vérifiez vos données.")
    
    # Vérification des types numériques
    if not all(pd.api.types.is_numeric_dtype(topic_df[col]) for col in topic_df.columns):
        raise TypeError("Certaines colonnes de 'topic_df' ne sont pas numériques.")

    # Conversion en valeurs absolues (pour éliminer les valeurs négatives)
    to_positive = topic_df.applymap(lambda num: abs(num) if pd.notnull(num) else 0)

    # Identifier le topic dominant pour chaque livre
    dominant_topics = to_positive.idxmax(axis=1).to_frame().reset_index()

    # Regrouper les livres par topic
    books_by_topic = dominant_topics.groupby(0)["index"].apply(list).to_dict()

    # Vérification du résultat
    if not books_by_topic:
        print("Aucun regroupement n’a été trouvé.")
    else:
        print("=== Regroupement des livres par thème dominant ===\n")
        for key, value in books_by_topic.items():
            print(f"{key}:")
            print(", ".join(value))
            print()

    print("Regroupement terminé avec succès.")

except Exception as e:
    print(f"Une erreur est survenue lors du regroupement par topics : {e}")

  to_positive = topic_df.applymap(lambda num: abs(num) if pd.notnull(num) else 0)


=== Regroupement des livres par thème dominant ===

Topic 1:
CFPJ - MAN, ATARA SERVICE, AIRTECH-BENIN, FRED SERVICES, BUSINESS LAND INTER, ABS TECHNOLOGIE GROUP, PAPETERIE MODERNE CALAVI, AKONTA, TEC SARL, ALLMIGHTY EXPERTISES, ADJINOS CENTER, CIBLE INFORMATIQUE, 3128 TECHNOLOGIES, 3J IT SERVICES, ABP TECHNOLOGIE, BENSOFT SERVICE, CHABO, DIGI SPHERE TECH GROUP, IFE AFRICA, M-TECHNOLOGIES, LA PASSION DE LA TECHNOLOGIE, 3D TELECOM BUSINESS CENTER, A.D. INFORMATIQUE SERVICE ET FILS, ABP SERVICE, ADEWALE CENTER, ADIS COMMUNICATION, AGEMA BENIN, AGRO GROUP DEVELOPPEMENT, ALL IN ONE BENIN, ARTHUR INFORMATIQUE, BEAUNACH BUSINESS CENTER CONSULTING, BELANCE BENIN SARL, BON SAMARITAIN FORMATION, CENTRE DE HAUTE TECHNOLOGIE AWOLOU ET FILS, CENTRE INFORMATIQUE LE LEADER, CENTRE MULTIMEDIA SANDY, CFAO Infrastructure, CHEZ LE MAINTENANCIER, CHRIST ROI MULTIMEDIA DECOR, CTRFII BENIN, DGM INFORMATIQUE, DIAMOND ELECTRONIC, ETS AODS ET FILS, ETS BENIE INFO CENTER, SYSCOM AFRIQUE, ADEBA START-UP, ADOBE B

# Document similarity

In [3]:
# lsa model import
with open('../pickles/lsa_model.pkl', 'rb') as lsa_model:    
    lsa = pickle.load(lsa_model)                    


# vector import
with open('../pickles/vector.pkl', 'rb') as vector_pkl:    
    vector = pickle.load(vector_pkl)

# topic_df import
with open('../pickles/topic_df.pkl', 'rb') as topic_df_pkl:    
    topic_df = pickle.load(topic_df_pkl)

In [4]:
best_recommended_pme("Je veux une agence de communication", vector, lsa, topic_df, 3)



Recommandations générées avec succès pour la recherche 'Je veux une agence de communication'.



[{'name': 'NETCEL BENIN', 'score': 0.775708065471625},
 {'name': 'AGENCE CREATIVE STAMINA', 'score': 0.6117780467011708},
 {'name': '2BME SERVICE', 'score': 0.5985910584113515}]

In [16]:
search_df = pd.DataFrame(data=["Je recherche une agence de communication digitale"], columns=["summary"])
tokenized_book = tokenizer(search_df)
without_stopwords = stopwords_remover(tokenized_book)
without_punctuation = punctuation_remover(without_stopwords)
tagged_words = pos_tagger(without_punctuation)
lemmatized_words = lemmatizer(tagged_words)
vec = vector.transform(lemmatized_words)
vec
lsa.transform(vec)
vec



<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 0 stored elements and shape (1, 9893)>