# **NLP Project 2 : Supervised Learning**
## Léo RINGEISSEN and Santiago MARTIN

# Library Imports

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import glob

# Data Import and Aggregation

In [3]:
# Step 1: Gather all Excel file paths.
# glob returns paths in arbitrary filesystem order, so sort them to make the
# row order of the combined frame (and every later split) reproducible.
file_paths = sorted(glob.glob("Traduction avis clients/*.xlsx"))
if not file_paths:
    # Fail early with a clear message instead of the opaque
    # "No objects to concatenate" error raised by pd.concat on an empty list.
    raise FileNotFoundError(
        'No .xlsx files found in "Traduction avis clients/"; check the working directory.'
    )

# Step 2: Load and combine files
dataframes = []
for file in file_paths:
    df = pd.read_excel(file)
    dataframes.append(df)

# Combine all DataFrames into one
combined_df = pd.concat(dataframes, ignore_index=True)

# Select relevant columns
selected_columns = ["note", "produit", "type", "avis_en"]  # French column names
# .copy() makes the subset an independent frame so later column assignments
# never act on a view of the full concatenated data.
combined_df = combined_df[selected_columns].copy()

# Rename columns to English (same order as selected_columns)
combined_df.columns = ["rating", "product", "type", "review"]

# Manual Translation of `product` Column

In [4]:
# Correct product translations (French category slug -> English label)
product_translations = {
    "animaux": "animals",
    "auto": "car",
    "credit": "credit",
    "garantie-decennale": "ten-year warranty",
    "habitation": "home",
    "moto": "motorcycle",
    "multirisque-professionnelle": "professional multi-risk",
    "prevoyance": "accidental injury",
    "responsabilite-civile-professionnelle": "professional liability",
    "sante": "health",
    "vie": "life"
}

# Apply translations to `product` column.
# Series.map yields NaN for any category missing from the dictionary, and a
# later dropna would then silently discard those reviews. Report such
# categories and keep their original label instead of losing the rows.
translated_products = combined_df["product"].map(product_translations)
untranslated = combined_df.loc[
    translated_products.isna() & combined_df["product"].notna(), "product"
].unique()
if len(untranslated) > 0:
    print("Warning: no translation defined for product categories:", list(untranslated))
combined_df["product"] = translated_products.fillna(combined_df["product"])

# Clean English Reviews

In [5]:
# Remove faulty reviews and clean the text
combined_df["review"] = (
    combined_df["review"]
    .str.replace(r"\s+", " ", regex=True)  # Collapse runs of spaces/newlines into a single space
    .str.strip()  # Remove leading/trailing spaces
)

# Filter out rows with "Loading...".
# regex=False matches the three dots literally; interpreted as a regex, "..."
# matches ANY three characters, so e.g. "Loading my car" would also be removed.
is_loading_placeholder = combined_df["review"].str.contains("Loading...", regex=False, na=False)
# .copy() gives an independent frame so later assignments do not operate on a
# slice of the unfiltered data (avoids pandas' SettingWithCopyWarning).
combined_df = combined_df[~is_loading_placeholder].copy()

# Removing null values (focus on ratings)

In [None]:
# Report missing values per column, drop every row containing any of them,
# then report again to confirm none remain.
print(combined_df.isnull().sum())
# Rebind instead of inplace=True: combined_df may be a filtered slice of an
# earlier frame, and in-place mutation of a slice triggers SettingWithCopyWarning.
combined_df = combined_df.dropna()
print(combined_df.isnull().sum())

# Summary of Cleaned Data

In [None]:
# Report the (rows, columns) dimensions of the cleaned dataset
dataset_shape = combined_df.shape
print("Total rows and columns:", dataset_shape)

In [None]:
# Tabulate how many reviews carry each rating value
rating_counts = combined_df["rating"].value_counts()
print("\nUnique ratings and counts:")
print(rating_counts)

# Draw the same distribution as a bar chart
rating_ax = sns.countplot(data=combined_df, x="rating")
rating_ax.set_title("Distribution of Ratings")
plt.show()

In [None]:
# Tabulate how many reviews belong to each product category
product_counts = combined_df["product"].value_counts()
print("\nUnique product categories and counts:")
print(product_counts)

# Draw the same distribution as a bar chart
product_ax = sns.countplot(data=combined_df, x="product")
product_ax.set_title("Distribution of Products")
plt.show()

In [None]:
# Count reviews per product category within each rating value
products_per_rating = combined_df.groupby("rating")["product"].value_counts()
print("\nCount of products per rating:")
print(products_per_rating)

# One bar per (rating, product) pair, coloured by product category
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=combined_df, x="rating", hue="product", ax=ax)
ax.set_title("Rating distribution per product category")
plt.show()

In [None]:
# Preview the first rows of the cleaned dataset
print("\nSample data:")
sample_rows = combined_df.head()
display(sample_rows)

In [12]:
# Persist the cleaned reviews (without the index column) for later reuse
combined_df.to_csv(path_or_buf="combined_reviews.csv", index=False)

# Preprocessing reviews

In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Download necessary NLTK resources
for nltk_resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

# Build the shared helpers: stemmer, lemmatizer and the English stop-word set
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalise one review string.

    Lowercases the text, deletes every character that is neither a word
    character nor whitespace, drops tokens found in the module-level
    ``stop_words`` set and lemmatizes the rest with the module-level
    ``lemmatizer``. Returns the resulting tokens joined by single spaces.
    """
    # Lowercase first, then strip punctuation
    normalised = re.sub(r'[^\w\s]', '', text.lower())
    # Filter stop words and lemmatize in one pass over the tokens
    lemmas = [
        lemmatizer.lemmatize(token)
        for token in normalised.split()
        if token not in stop_words
    ]
    # Stemming is left disabled (optional, depending on need); stemmer.stem
    # could be applied to each lemma here.
    return ' '.join(lemmas)

# Overwrite the review column with the preprocessed version of each review
combined_df['review'] = combined_df['review'].map(preprocess_text)

In [None]:
# Check a sample of the preprocessed reviews (first rows only)
preprocessed_sample = combined_df['review'].head()
display(preprocessed_sample)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from sentence_transformers import SentenceTransformer
import numpy as np

# All preprocessed reviews as a NumPy array of strings
sample_reviews = combined_df['review'].values  # Replace with actual reviews column

# 1. TF-IDF (vocabulary capped at the 5000 highest-frequency terms)
print("Starting TF-IDF Vectorization...")
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf_vectorizer.fit_transform(sample_reviews)
print("TF-IDF vectorization completed. Shape:", X_tfidf.shape)

# 2. Word2Vec
print("Starting Word2Vec Training...")
# Word2Vec expects each document as a list of tokens; split on whitespace
tokenized_reviews = [str.split(review) for review in sample_reviews]
word2vec_model = Word2Vec(
    sentences=tokenized_reviews,
    vector_size=300,
    window=5,
    min_count=1,
    workers=4,
)

def average_word2vec(tokens, model, vector_size=300):
    """Return the mean embedding of the tokens known to ``model.wv``.

    Tokens absent from ``model.wv`` are ignored. When no token is known
    (including an empty token list) a zero vector of length ``vector_size``
    is returned instead.
    """
    known_tokens = [token for token in tokens if token in model.wv]
    if not known_tokens:
        return np.zeros(vector_size)
    embeddings = [model.wv[token] for token in known_tokens]
    return np.mean(embeddings, axis=0)

# Embed every review as the average of its Word2Vec token vectors
w2v_embeddings = [average_word2vec(tokens, word2vec_model) for tokens in tokenized_reviews]
X_w2v = np.array(w2v_embeddings)
print("Word2Vec vectorization completed. Shape:", X_w2v.shape)

# 3. Sentence-BERT
print("Starting Sentence-BERT Embedding...")
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
X_sbert = sbert_model.encode(sample_reviews, show_progress_bar=True)
print("Sentence-BERT vectorization completed. Shape:", X_sbert.shape)

# Save the vectorized data for later use.
# NOTE(review): the sparse TF-IDF matrix is converted to a dense array here,
# which can need a lot of memory for a large corpus.
vectorized_arrays = {"tfidf": X_tfidf.toarray(), "w2v": X_w2v, "sbert": X_sbert}
np.savez("vectorized_data.npz", **vectorized_arrays)

print("All vectorization methods completed.")

# Exploratory Data Analysis and Word Frequency
In this section, we'll analyze the cleaned text reviews to highlight frequent words and n-grams. This helps us understand the data distribution and potential features for model building.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Generate n-grams
def plot_ngrams(reviews, n=1, top_n=20):
    """Plot a horizontal bar chart of the ``top_n`` most frequent n-grams.

    ``reviews`` is an iterable of text documents and ``n`` is the n-gram size
    (1 = unigrams, 2 = bigrams, ...). English stop words are excluded by the
    vectorizer. The figure is shown immediately; nothing is returned.
    """
    counter = CountVectorizer(stop_words='english', ngram_range=(n, n))
    # Sum each n-gram's column of the document-term matrix to get corpus totals
    totals = counter.fit_transform(reviews).sum(axis=0).A1
    vocabulary = counter.get_feature_names_out()
    # Rank (count, n-gram) pairs in descending order and keep the head
    ranked = sorted(zip(totals, vocabulary), reverse=True)
    top_pairs = ranked[:top_n]
    labels = [term for _, term in top_pairs]
    counts = [count for count, _ in top_pairs]

    plt.barh(labels, counts)
    plt.xlabel("Frequency")
    plt.title(f"Top {n}-grams")
    plt.show()

# Plot frequent words and n-grams of the preprocessed reviews.
# combined_df is used instead of `df`: `df` is only the last raw Excel file
# read by the import loop and was never cleaned, renamed or preprocessed.
plot_ngrams(combined_df['review'], n=1)  # Unigrams
plot_ngrams(combined_df['review'], n=2)  # Bigrams
plot_ngrams(combined_df['review'], n=3)  # Trigrams

# Generate WordCloud from all reviews joined into one string
wordcloud = WordCloud(width=800, height=400, background_color="white").generate(' '.join(combined_df['review']))

# Render the cloud; without this the image is built but never displayed
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title("Word Cloud of Reviews")
plt.show()

# Predicting Ratings Using Classical ML Models
This section focuses on predicting star ratings using text reviews. We'll extract TF-IDF features and train models like Logistic Regression and Random Forest. The performance will be evaluated using metrics such as accuracy, F1-score, and confusion matrix.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns

# Labels: the rating column of the cleaned frame. `df` is only the last raw
# Excel file from the import loop, so combined_df must be used here.
y_ratings = combined_df['rating']

# TF-IDF Feature Extraction
# Fit the vocabulary and IDF weights on the training reviews only, so no
# statistics from the held-out reviews leak into the features. Using the same
# test_size and random_state as the split below selects the same rows.
train_reviews, _ = train_test_split(combined_df['review'], test_size=0.2, random_state=42)
tfidf = TfidfVectorizer(max_features=5000)
tfidf.fit(train_reviews)
X = tfidf.transform(combined_df['review'])

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y_ratings, test_size=0.2, random_state=42)

# Logistic Regression for Ratings
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

# Evaluation
print("Classification Report for Ratings:")
print(classification_report(y_test, y_pred))

# Confusion Matrix
# labels=lr.classes_ pins the row/column order to the tick labels used below,
# even if some class is missing from the test split.
cm = confusion_matrix(y_test, y_pred, labels=lr.classes_)
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=lr.classes_, yticklabels=lr.classes_)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix for Ratings")
plt.show()

# Predicting Product Categories Using Classical ML Models
In this section, we predict the product category associated with each review. We'll use TF-IDF for feature extraction and train models like Logistic Regression and Random Forest. The evaluation focuses on metrics such as accuracy and confusion matrices.

In [None]:
from sklearn.multiclass import OneVsRestClassifier

# Labels for Products, taken from the cleaned frame. `df` is only the last raw
# Excel file from the import loop, so combined_df must be used here.
y_products = combined_df['product']

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y_products, test_size=0.2, random_state=42)

# Logistic Regression for Products, one binary classifier per category.
# LogisticRegression's multi_class="ovr" argument is deprecated in recent
# scikit-learn releases; OneVsRestClassifier is the documented replacement.
lr_product = OneVsRestClassifier(LogisticRegression(max_iter=1000))
lr_product.fit(X_train, y_train)
y_pred_product = lr_product.predict(X_test)

# Evaluation
print("Classification Report for Products:")
print(classification_report(y_test, y_pred_product))

# Confusion Matrix (rows/columns ordered like the classifier's classes_)
cm_product = confusion_matrix(y_test, y_pred_product, labels=lr_product.classes_)
sns.heatmap(cm_product, annot=True, fmt="d", cmap="Blues", xticklabels=lr_product.classes_, yticklabels=lr_product.classes_)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix for Products")
plt.show()

In [None]:
### Step 2: Group Ratings

def group_ratings(row):
    """Map a row's ``rating`` value to a coarse label.

    1 -> 'Bad', 2 or 3 -> 'Neutral', 4 -> 'Good', 5 -> 'Great'.
    Any other value yields None.
    """
    rating = row['rating']
    label_by_stars = ((1, 'Bad'), (2, 'Neutral'), (3, 'Neutral'), (4, 'Good'), (5, 'Great'))
    for stars, label in label_by_stars:
        if rating == stars:
            return label
    return None

# Attach the coarse rating label to every review
combined_df['rating_grouped'] = combined_df.apply(group_ratings, axis=1)

# Visualize grouped ratings, ordered from worst to best
rating_order = ['Bad', 'Neutral', 'Good', 'Great']
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=combined_df, x='rating_grouped', order=rating_order, ax=ax)
ax.set_title("Distribution of Grouped Ratings")
plt.show()

# Predicting Ratings Using Classical ML Models
This section repeats the rating prediction on the grouped rating labels (Bad / Neutral / Good / Great) derived from the star ratings. We'll extract TF-IDF features and train models like Logistic Regression and Random Forest. The performance will be evaluated using metrics such as accuracy, F1-score, and confusion matrix.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns

# Labels: grouped ratings
y_ratings = combined_df['rating_grouped']

# TF-IDF Feature Extraction
# Fit the vocabulary and IDF weights on the training reviews only, so no
# statistics from the held-out reviews leak into the features. Using the same
# test_size and random_state as the split below selects the same rows.
train_reviews, _ = train_test_split(combined_df['review'], test_size=0.2, random_state=42)
tfidf = TfidfVectorizer(max_features=5000)
tfidf.fit(train_reviews)
X = tfidf.transform(combined_df['review'])

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y_ratings, test_size=0.2, random_state=42)

# Logistic Regression for Ratings
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

# Evaluation
print("Classification Report for Ratings:")
print(classification_report(y_test, y_pred))

# Confusion Matrix
# labels=lr.classes_ pins the row/column order to the tick labels used below,
# even if some class is missing from the test split.
cm = confusion_matrix(y_test, y_pred, labels=lr.classes_)
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=lr.classes_, yticklabels=lr.classes_)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix for Ratings")
plt.show()

# Predicting Product Categories Using Classical ML Models
In this section, we predict the product category associated with each review. We'll use TF-IDF for feature extraction and train models like Logistic Regression and Random Forest. The evaluation focuses on metrics such as accuracy and confusion matrices.

In [None]:
from sklearn.multiclass import OneVsRestClassifier

# Labels for Products
y_products = combined_df['product']

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y_products, test_size=0.2, random_state=42)

# Logistic Regression for Products, one binary classifier per category.
# LogisticRegression's multi_class="ovr" argument is deprecated in recent
# scikit-learn releases; OneVsRestClassifier is the documented replacement.
lr_product = OneVsRestClassifier(LogisticRegression(max_iter=1000))
lr_product.fit(X_train, y_train)
y_pred_product = lr_product.predict(X_test)

# Evaluation
print("Classification Report for Products:")
print(classification_report(y_test, y_pred_product))

# Confusion Matrix (rows/columns ordered like the classifier's classes_)
cm_product = confusion_matrix(y_test, y_pred_product, labels=lr_product.classes_)
sns.heatmap(cm_product, annot=True, fmt="d", cmap="Blues", xticklabels=lr_product.classes_, yticklabels=lr_product.classes_)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix for Products")
plt.show()

In [None]:
### Step 3: Upsample Underrepresented Products with Paraphrasing
from transformers import pipeline
import random

def augment_reviews_with_paraphrasing_batched(df, product_column, review_column, target_count, batch_size=10, seed=None):
    """Create paraphrased reviews for product groups smaller than ``target_count``.

    For every value of ``product_column`` with fewer than ``target_count`` rows,
    source rows are sampled with replacement until the deficit is covered, and
    each sampled review is passed through a text2text-generation pipeline with
    a "paraphrase: " prefix.

    Parameters:
        df: DataFrame containing at least ``product_column`` and ``review_column``.
        product_column: name of the column to group by.
        review_column: name of the column holding the review text.
        target_count: minimum number of rows wanted per product group.
        batch_size: number of reviews sent to the pipeline per call.
        seed: optional seed for the row sampling; when None the global
            ``random`` state is used.

    Returns:
        DataFrame with the same columns as ``df`` holding only the generated
        rows. Each generated row copies every column of its source row (so its
        rating labels match the review that was paraphrased) and replaces the
        review text. Empty when no group is below ``target_count``.

    Batches whose paraphrasing raises are reported with ``print`` and skipped,
    so a group can end up below ``target_count``.
    """
    rng = random if seed is None else random.Random(seed)
    # Created lazily: loading the model is pointless when no group has a deficit.
    paraphraser = None

    augmented_rows = []

    for product, group in df.groupby(product_column):
        deficit = max(0, target_count - len(group))
        if deficit == 0:
            continue

        if paraphraser is None:
            # NOTE(review): "t5-small" is a general text-to-text checkpoint;
            # confirm it produces usable paraphrases for the "paraphrase:" prefix.
            paraphraser = pipeline("text2text-generation", model="t5-small", device=-1)

        # Sample whole rows (not just the text) so each paraphrase keeps the
        # labels of the review it was generated from.
        source_rows = rng.choices(group.to_dict("records"), k=deficit)

        # Batch processing
        for start in range(0, len(source_rows), batch_size):
            batch = source_rows[start : start + batch_size]
            try:
                paraphrased = paraphraser(
                    [f"paraphrase: {row[review_column]}" for row in batch],
                    max_length=60,
                    num_return_sequences=1,
                    truncation=True,
                )
                new_rows = []
                for row, result in zip(batch, paraphrased):
                    # Some pipeline versions wrap each input's output in a list
                    if isinstance(result, list):
                        result = result[0]
                    new_rows.append({**row, review_column: result["generated_text"]})
            except Exception as e:
                # Best effort: report the failed batch and move on
                print(f"Error during paraphrasing batch: {e}")
                continue
            augmented_rows.extend(new_rows)

    return pd.DataFrame(augmented_rows, columns=df.columns)

# Target size per product: the median of the per-product review counts
product_counts = combined_df["product"].value_counts()
median_count = product_counts.median()

# Generate paraphrased reviews for every product below that target
augmented_reviews = augment_reviews_with_paraphrasing_batched(
    combined_df,
    product_column="product",
    review_column="review",
    target_count=int(median_count),
)

# Append the generated rows to the original data with a fresh index
augmented_df = pd.concat([combined_df, augmented_reviews], ignore_index=True)

print("Data Shape After Augmentation:", augmented_df.shape)

In [None]:
### Step 4: Verify Distribution After Augmentation
augmented_product_counts = augmented_df["product"].value_counts()
print("Product Distribution After Augmentation:")
print(augmented_product_counts)

# Persist the augmented dataset (without the index column)
augmented_df.to_csv(path_or_buf="augmented_reviews.csv", index=False)

In [38]:
### Step 5: Modeling Preparation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Encode labels for grouped ratings and products as integers
rating_encoder = LabelEncoder()
product_encoder = LabelEncoder()

augmented_df['rating_encoded'] = rating_encoder.fit_transform(augmented_df['rating_grouped'])
augmented_df['product_encoded'] = product_encoder.fit_transform(augmented_df['product'])

# TF-IDF Vectorization
# Fit the vocabulary and IDF weights on the training reviews only, so no
# statistics from the held-out reviews leak into the features. Using the same
# test_size and random_state as the splits below selects the same rows.
train_reviews, _ = train_test_split(augmented_df['review'], test_size=0.2, random_state=42)
vectorizer = TfidfVectorizer(max_features=5000)
vectorizer.fit(train_reviews)
X = vectorizer.transform(augmented_df['review'])

# Split data

# For ratings prediction
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X, augmented_df['rating_encoded'], test_size=0.2, random_state=42)

# For product prediction
X_train_p, X_test_p, y_train_p, y_test_p = train_test_split(X, augmented_df['product_encoded'], test_size=0.2, random_state=42)

In [None]:
### Step 6: Modeling
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# LabelEncoder assigns the codes 0..n_classes-1 in the order of classes_.
# Passing them as `labels` keeps target_names aligned even when a class is
# missing from the test split (classification_report raises otherwise).
rating_labels = list(range(len(rating_encoder.classes_)))
product_labels = list(range(len(product_encoder.classes_)))

# Logistic Regression for Ratings
clf_ratings = LogisticRegression(max_iter=500)
clf_ratings.fit(X_train_r, y_train_r)
ratings_pred = clf_ratings.predict(X_test_r)
print("Classification Report for Ratings:")
print(classification_report(y_test_r, ratings_pred, labels=rating_labels, target_names=rating_encoder.classes_))

# Logistic Regression for Products
clf_products = LogisticRegression(max_iter=500)
clf_products.fit(X_train_p, y_train_p)
products_pred = clf_products.predict(X_test_p)
print("Classification Report for Products:")
print(classification_report(y_test_p, products_pred, labels=product_labels, target_names=product_encoder.classes_))
