In [None]:
import pandas as pd
import spacy
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [66]:
# Chargement du dataset
clothes_reviews = pd.read_csv("../data/Womens Clothing E-Commerce Reviews.csv")
print(f"taille du dataframe avant nettoyage : {clothes_reviews.shape}")
clothes_reviews.head()

taille du dataframe avant nettoyage : (23486, 11)


Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [7]:
clothes_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23486 entries, 0 to 23485
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Unnamed: 0               23486 non-null  int64 
 1   Clothing ID              23486 non-null  int64 
 2   Age                      23486 non-null  int64 
 3   Title                    19676 non-null  object
 4   Review Text              22641 non-null  object
 5   Rating                   23486 non-null  int64 
 6   Recommended IND          23486 non-null  int64 
 7   Positive Feedback Count  23486 non-null  int64 
 8   Division Name            23472 non-null  object
 9   Department Name          23472 non-null  object
 10  Class Name               23472 non-null  object
dtypes: int64(6), object(5)
memory usage: 2.0+ MB


In [67]:
# Suppression des cellules de "Review text" vides
clothes_reviews.dropna(subset=["Review Text"], inplace=True)
clothes_reviews.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [68]:
# Remplissage des cellules vides de "Title" pour limiter les erreurs lors du NLP
clothes_reviews["Title"].fillna("", inplace=True)

# Suppression des lignes presentants des valeurs NaN dans les colonnes : Division Name, Department Name, Class Name	
clothes_reviews.dropna(subset=["Division Name", "Department Name", "Class Name"], inplace=True)


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





In [69]:
# Création d'une colonne "Full Review" pour reunir un max d'info
clothes_reviews["Full Review"] = clothes_reviews["Title"] + " " + clothes_reviews["Review Text"]
print(f"taille du dataframe après nettoyage : {clothes_reviews.shape}")
clothes_reviews.head()

taille du dataframe après nettoyage : (22628, 12)


Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name,Full Review
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates,Absolutely wonderful - silky and sexy and com...
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses,Love this dress! it's sooo pretty. i happen...
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses,Some major design flaws I had such high hopes ...
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants,"My favorite buy! I love, love, love this jumps..."
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses,Flattering shirt This shirt is very flattering...


In [72]:
# Creation de la fonction de preprocesing (Tokenization -> Stopwords -> Lemmatization)
nlp = spacy.load("en_core_web_sm")
def preprocess_text(text):
    # Création de l'objet Doc spacy
    doc = nlp(text)
    
    # On garde les mots si : 
    # 1. Ce n'est pas un stop-word (is_stop)
    # 2. Ce n'est pas de la ponctuation (is_punct)
    # 3. On prend la forme lemmatisée (.lemma_) et on met en minuscule (.lower())
    clean_tokens = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct]
    
    # On rejoint les mots pour refaire une phrase propre
    return " ".join(clean_tokens)

In [12]:
# Test 
print("Début du traitement NLP sur un échantillon...")
sample_df = clothes_reviews.head(5).copy()
sample_df['Clean Text'] = sample_df['Full Review'].apply(preprocess_text)

# Comparaison Avant / Après
for index, row in sample_df.iterrows():
    print(f"\nORIGINAL : {row['Full Review']}")
    print(f"NETTOYÉ  : {row['Clean Text']}")

Début du traitement NLP sur un échantillon...

ORIGINAL :  Absolutely wonderful - silky and sexy and comfortable
NETTOYÉ  :   absolutely wonderful silky sexy comfortable

ORIGINAL :  Love this dress!  it's sooo pretty.  i happened to find it in a store, and i'm glad i did bc i never would have ordered it online bc it's petite.  i bought a petite and am 5'8".  i love the length on me- hits just a little below the knee.  would definitely be a true midi on someone who is truly petite.
NETTOYÉ  :   love dress   sooo pretty   happen find store glad bc order online bc petite   buy petite 5'8   love length me- hit little knee   definitely true midi truly petite

ORIGINAL : Some major design flaws I had such high hopes for this dress and really wanted it to work for me. i initially ordered the petite small (my usual size) but i found this to be outrageously small. so small in fact that i could not zip it up! i reordered it in petite medium, which was just ok. overall, the top half was comforta

In [73]:
# Preprocessing sur notre dataset
clothes_reviews['Clean Text'] = clothes_reviews['Full Review'].apply(preprocess_text)
clothes_reviews.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name,Full Review,Clean Text
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates,Absolutely wonderful - silky and sexy and com...,absolutely wonderful silky sexy comfortable
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses,Love this dress! it's sooo pretty. i happen...,love dress sooo pretty happen find store...
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses,Some major design flaws I had such high hopes ...,major design flaw high hope dress want work in...
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants,"My favorite buy! I love, love, love this jumps...",favorite buy love love love jumpsuit fun flirt...
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses,Flattering shirt This shirt is very flattering...,flattering shirt shirt flattering adjustable t...


In [74]:
# Fonction detection de la polarité des sentiments avec textblob
def get_sentiment(text) :
    return TextBlob(text).sentiment.polarity

# Application de la fonction sur notre texte nettoyé "Clean Text"
clothes_reviews["Sentiment Score"] = clothes_reviews["Clean Text"].apply(get_sentiment)
clothes_reviews.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name,Full Review,Clean Text,Sentiment Score
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates,Absolutely wonderful - silky and sexy and com...,absolutely wonderful silky sexy comfortable,0.633333
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses,Love this dress! it's sooo pretty. i happen...,love dress sooo pretty happen find store...,0.31875
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses,Some major design flaws I had such high hopes ...,major design flaw high hope dress want work in...,0.030163
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants,"My favorite buy! I love, love, love this jumps...",favorite buy love love love jumpsuit fun flirt...,0.5
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses,Flattering shirt This shirt is very flattering...,flattering shirt shirt flattering adjustable t...,0.75


In [75]:
# Categorisation des avis
def categorize_sentiment(score):
    if score > 0.1:
        return 'Positif'
    elif score < -0.1:
        return 'Négatif'
    else:
        return 'Neutre'

clothes_reviews["Sentiment Category"] = clothes_reviews["Sentiment Score"].apply(categorize_sentiment)
clothes_reviews.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name,Full Review,Clean Text,Sentiment Score,Sentiment Category
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates,Absolutely wonderful - silky and sexy and com...,absolutely wonderful silky sexy comfortable,0.633333,Positif
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses,Love this dress! it's sooo pretty. i happen...,love dress sooo pretty happen find store...,0.31875,Positif
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses,Some major design flaws I had such high hopes ...,major design flaw high hope dress want work in...,0.030163,Neutre
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants,"My favorite buy! I love, love, love this jumps...",favorite buy love love love jumpsuit fun flirt...,0.5,Positif
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses,Flattering shirt This shirt is very flattering...,flattering shirt shirt flattering adjustable t...,0.75,Positif


In [76]:
# Vectorisation des mots en chiffres
# max_df=0.95 : Ignorer les mots qui apparaissent dans 95% des avis (trop communs, ex: "vêtement")
# min_df=2 : Ignorer les mots qui apparaissent moins de 2 fois (fautes de frappe, bruit)
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')

# Fit du vectorizer sur notre Clean Text
tf = tf_vectorizer.fit_transform(clothes_reviews['Clean Text'])
feature_names = tf_vectorizer.get_feature_names_out()

# Modelisation LDA
lda = LatentDirichletAllocation(n_components=5, random_state=42)

lda.fit(tf)

In [77]:
def display_topics(model, feature_names, no_top_words):
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        topic_dict[topic_idx] = " ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]])
        print(f"Topic {topic_idx}:")
        print(topic_dict[topic_idx])
        print("-" * 30)
    return topic_dict

# Affichage des 10 mots les plus importants pour chacun des 5 sujets
no_top_words = 10
display_topics(lda, feature_names, no_top_words)

Topic 0:
love great wear color look jean perfect soft fit sweater
------------------------------
Topic 1:
dress skirt love beautiful color fit waist size picture look
------------------------------
Topic 2:
look like fit size fabric small nice love short color
------------------------------
Topic 3:
size small fit order run wear large love cute medium
------------------------------
Topic 4:
dress wear love fit fabric look flattering beautiful great perfect
------------------------------


{0: 'love great wear color look jean perfect soft fit sweater',
 1: 'dress skirt love beautiful color fit waist size picture look',
 2: 'look like fit size fabric small nice love short color',
 3: 'size small fit order run wear large love cute medium',
 4: 'dress wear love fit fabric look flattering beautiful great perfect'}

In [79]:
# Transformation des avis en probabilités de sujets
topic_values = lda.transform(tf)

# Indexation de la colonne qui a le score le plus élevé (argmax)
clothes_reviews['Topic ID'] = topic_values.argmax(axis=1)

# Mapping des topics
topic_names = {
    0: "Bottoms & Comfort",
    1: "Cut & Fabric",
    2: "Dresses",
    3: "Tops & Quality",
    4: "Sizing Issues"
}

clothes_reviews['Topic Label'] = clothes_reviews['Topic ID'].map(topic_names)

# Vérification finale
clothes_reviews.head(10)

# On sauvegarde le fichier transformé pour ne pas tout relancer dans l'app
clothes_reviews.to_csv("../data/reviews_with_sentiment_and_topics.csv", index=False)