In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
%matplotlib inline

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from textblob import TextBlob

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim import corpora, models
import gensim

import string
import re
import spacy

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics

from sklearn.svm import LinearSVC
from sklearn.tree import ExtraTreeClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.decomposition import TruncatedSVD,PCA
from sklearn.preprocessing import normalize,Normalizer
from sklearn.pipeline import make_pipeline

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report

from sklearn.decomposition import LatentDirichletAllocation as LDA

from gensim.models import Word2Vec, KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity

pd.set_option('display.max_columns', None)

In [3]:
# Mounting Colab notebook with Google Drive
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [4]:
# Setting up the Dataset path
dataset_path = '/content/drive/MyDrive/Colab Notebooks/NLP_Model'

In [5]:
data = pd.read_excel(dataset_path+'/Womens Clothing Reviews Data.xlsx')

In [6]:
data.columns = data.columns.str.replace(' ', '_')

In [None]:
data.head(5)

In [None]:
data.info()

In [None]:
data.isna().sum()

In [None]:
data['Category'].value_counts()

In [None]:
data['Subcategory1'].value_counts()

In [None]:
## city wise contribution
plt.figure(figsize=(12,8))
city = (data['Location'].value_counts()/data['Location'].count())*100
city = city.sort_index(ascending=True)
explode = [0.1 if city[i] == max(city) else 0 for i in city.index]
colors = ['#ff9999', '#66b3ff', '#99ff99', '#ffcc99']
plt.pie(city, labels=city.index, colors=colors, autopct='%1.1f%%', startangle=140, explode=explode)
centre_circle = plt.Circle((0, 0), 0.70, fc='white')
fig = plt.gcf()
fig.gca().add_artist(centre_circle)
plt.legend(labels=city.index, loc="upper right")
plt.title("City Wise Contribution")
plt.axis('equal')
plt.show()

In [None]:
## Category wise contribution
explode = [0.1 if cat[i] == max(cat) else 0 for i in cat.index]
plt.figure(figsize=(12,8))
cat = (data['Category'].value_counts()/data['Category'].count())*100
cat = cat.sort_index(ascending=True)
colors = ['#ff9999', '#66b3ff', '#99ff99']
plt.pie(cat, labels=cat.index, colors=colors, autopct='%1.1f%%', startangle=140, explode=explode)
centre_circle = plt.Circle((0, 0), 0.70, fc='white')
fig = plt.gcf()
fig.gca().add_artist(centre_circle)
plt.legend(labels=cat.index, loc="upper right")
plt.title("Category Wise Contribution")
plt.axis('equal')
plt.show()

In [None]:
## Sub_Category wise contribution
plt.figure(figsize=(12,8))
sub_cat = (data['Subcategory1'].value_counts()/data['Subcategory1'].count())*100
sub_cat = sub_cat.sort_index(ascending=True)
explode = [0.1 if sub_cat[i] == max(sub_cat) else 0 for i in sub_cat.index]
colors = ['#ff9999', '#66b3ff', '#99ff99', '#ffcc99', '#c2c2f0', '#ffb3e6', '#ff9999']
plt.pie(sub_cat, labels=sub_cat.index, colors=colors, autopct='%1.1f%%', startangle=140, explode=explode)
centre_circle = plt.Circle((0, 0), 0.70, fc='white')
fig = plt.gcf()
fig.gca().add_artist(centre_circle)
plt.legend(labels=sub_cat.index, loc="upper right")
plt.title("Sub_category Wise Contribution")
plt.axis('equal')
plt.show()

In [None]:
## Rating wise contribution
plt.figure(figsize=(12,8))
info = (data['Rating'].value_counts()/data['Rating'].count())*100
info = info.sort_index(ascending=True)
explode = [0.1 if info[i] == max(info) else 0 for i in info.index]
colors = ['#ff9999', '#66b3ff', '#99ff99', '#ffcc99', '#c2c2f0', '#ffb3e6']
plt.pie(info, labels=info.index, colors=colors, autopct='%1.1f%%', startangle=140, explode=explode)
centre_circle = plt.Circle((0, 0), 0.70, fc='white')
fig = plt.gcf()
fig.gca().add_artist(centre_circle)
plt.legend(labels=info.index, loc="upper right")
plt.title("Rating Wise Contribution")
plt.axis('equal')
plt.show()

In [7]:
## Dropping columns which are not needed
cols = ['Product_ID','SubCategory2','Review_Title']
data.drop(columns=cols, inplace=True)

In [8]:
# creating a column by the name Age_group & dividing the customers accordingly
bins = [0, 40, 60, 99]
labels = ['Youth', 'Adult', 'Senior']
data['Age_group'] = pd.cut(data.Customer_Age, bins, labels = labels,include_lowest = True)

## Dropping Age column as we have Age_group now
data.drop(columns='Customer_Age',inplace=True)

In [None]:
## Age wise contribution
plt.figure(figsize=(12,8))
Age = (data['Age_group'].value_counts()/data['Age_group'].count())*100
Age = Age.sort_index(ascending=True)
explode = [0.1 if Age[i] == max(Age) else 0 for i in Age.index]
colors = ['#ff9999', '#66b3ff', '#99ff99']
plt.pie(Age, labels=Age.index, colors=colors, autopct='%1.1f%%', startangle=140, explode=explode)
centre_circle = plt.Circle((0, 0), 0.70, fc='white')
fig = plt.gcf()
fig.gca().add_artist(centre_circle)
plt.legend(labels=Age.index, loc="upper right")
plt.title("Age Wise Contribution")
plt.axis('equal')
plt.show()

In [9]:
def replace_null(df):
    # Get a list of columns with the object data type
    object_columns = df.select_dtypes(include='object').columns.tolist()

    # Iterate through each object column and replace null values with mode
    for col in object_columns:
        mode_value = df[col].mode().iloc[0]
        df[col].fillna(mode_value, inplace=True)

    return df

In [None]:
replace_null(data)

In [None]:
data.info()

In [None]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')


nlp = spacy.load('en_core_web_sm')

def text_preprocessing(text):
    # Remove leading and trailing whitespaces
    text = text.strip()

    # Convert to lowercase
    text = text.lower()

    # Remove digits and special characters using regular expression
    text = re.sub(r"[-()\"#/@;:{}`+=~|._!?,'0-9]", "", text)

    # Tokenize the text using NLTK
    tokens = nltk.word_tokenize(text)

    # Remove stop words using NLTK
    stop = set(nltk.corpus.stopwords.words('english'))
    stop1 = set(list(stop)+['always', 'go', 'got', 'could', 'also', 'get', 'us', 'even', 'i', 'm', 'would', 'do', 'go'])
    tokens = [token for token in tokens if token not in stop1]

    # Correct spelling errors using TextBlob
    #corrected_tokens = [str(TextBlob(token).correct()) for token in tokens]

    # Lemmatize using spaCy
    lemmatized_tokens = [token.lemma_ for token in nlp(" ".join(tokens))]

    # Remove duplicate words
    lemmatized_tokens = list(dict.fromkeys(lemmatized_tokens))

    # Join the tokens back into a cleaned sentence
    cleaned_text = " ".join(lemmatized_tokens)

    return cleaned_text

In [11]:
data['Review_Text'] = data['Review_Text'].apply(lambda x: text_preprocessing(x))

In [12]:
data['sentiment_score'] =  data.Review_Text.apply(lambda x: TextBlob(x).sentiment.polarity)

In [13]:
data['sentiment'] = np.where(data.sentiment_score>0.2,'Positive',np.where(data.sentiment_score<-0.05, 'Negative', 'Neutral'))

In [None]:
pd.crosstab(data.Rating, data.sentiment)

In [15]:
Positive_review = data.Review_Text[data.Rating>=4]
Positive_review = Positive_review.apply(lambda x: text_preprocessing(x))

In [16]:
Negative_review = data.Review_Text[data.Rating<4]
Negative_review = Negative_review.apply(lambda x: text_preprocessing(x))

In [17]:
stop = set(nltk.corpus.stopwords.words('english'))
stop1 = set(list(stop)+['always', 'go', 'got', 'could', 'also', 'get', 'us', 'even', 'i', 'm', 'would', 'do', 'go'])

In [None]:

colormap = 'viridis'
wordcloud =  WordCloud(
                    width=500, height=300,stopwords=stop1,
                    random_state = 123,max_words=200,colormap=colormap,
                    background_color = 'white', max_font_size = 75
                   ).generate(' '.join(Positive_review.astype(str)))

%matplotlib inline
fig = plt.figure(figsize=(200,50))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud of Positive Reviews', fontsize=100, pad=20)
plt.show()

In [None]:
colormap = 'viridis'
wordcloud =  WordCloud(
                    width=500, height=300,stopwords=stop1,
                    random_state = 123,max_words=200,colormap=colormap,
                    background_color = 'white', max_font_size = 75
                   ).generate(' '.join(Negative_review.astype(str)))

%matplotlib inline
fig = plt.figure(figsize=(200,50))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud of Negative reviews', fontsize=100, pad=20)
plt.show()

In [None]:
import plotly.graph_objects as go

tab = pd.crosstab(index=data['Category'], columns=data['sentiment'])

# Plotting the grouped bar chart using Plotly
fig = go.Figure()

# Adding trace for each sentiment
sentiments = tab.columns
for sentiment in sentiments:
    fig.add_trace(go.Bar(
        x=tab.index,
        y=tab[sentiment],
        name=sentiment
    ))

# Customizing the layout
fig.update_layout(
    title='Sentiment Counts by Category',
    xaxis_title='Category',
    yaxis_title='Count',
    barmode='group',  # Grouped bar chart
    bargap=0.2,       # Gap between bars in the same location coordinate
    bargroupgap=0.1,  # Gap between bars in different location coordinates
)

# Display the plot
fig.show()

In [None]:
tab_1 = pd.crosstab(index=data['Subcategory1'], columns=data['sentiment'])

# Plotting the grouped bar chart using Plotly
fig = go.Figure()

# Adding trace for each sentiment
sentiments = tab_1.columns
for sentiment in sentiments:
    fig.add_trace(go.Bar(
        x=tab_1.index,
        y=tab_1[sentiment],
        name=sentiment
    ))

# Customizing the layout
fig.update_layout(
    title='Sentiment Counts by Subcategory',
    xaxis_title='Subcategory',
    yaxis_title='Count',
    barmode='group',  # Grouped bar chart
    bargap=0.2,       # Gap between bars in the same location coordinate
    bargroupgap=0.1,  # Gap between bars in different location coordinates
)

# Display the plot
fig.show()

In [None]:
tab_2 = pd.crosstab(index=data['Location'], columns=data['sentiment'])

# Plotting the grouped bar chart using Plotly
fig = go.Figure()

# Adding trace for each sentiment
sentiments = tab_2.columns
for sentiment in sentiments:
    fig.add_trace(go.Bar(
        x=tab_2.index,
        y=tab_2[sentiment],
        name=sentiment
    ))

# Customizing the layout
fig.update_layout(
    title='Sentiment Counts by Loaction',
    xaxis_title='Location',
    yaxis_title='Count',
    barmode='group',
    bargap=0.2,
    bargroupgap=0.1,
  )

# Display the plot
fig.show()

In [None]:
tab_3 = pd.crosstab(index=data['Channel'], columns=data['sentiment'])

# Plotting the grouped bar chart using Plotly
fig = go.Figure()

# Adding trace for each sentiment
sentiments = tab_3.columns
for sentiment in sentiments:
    fig.add_trace(go.Bar(
        x=tab_3.index,
        y=tab_3[sentiment],
        name=sentiment
    ))

# Customizing the layout
fig.update_layout(
    title='Sentiment Counts by Channel',
    xaxis_title='Channel',
    yaxis_title='Count',
    barmode='group',
    bargap=0.2,
    bargroupgap=0.1,
)

# Display the plot
fig.show()

In [None]:
tab_4 = pd.crosstab(index=data['Age_group'], columns=data['sentiment'])

# Plotting the grouped bar chart using Plotly
fig = go.Figure()

# Adding trace for each sentiment
sentiments = tab_4.columns
for sentiment in sentiments:
    fig.add_trace(go.Bar(
        x=tab_4.index,
        y=tab_4[sentiment],
        name=sentiment
    ))

# Customizing the layout
fig.update_layout(
    title='Sentiment Counts by Age_group',
    xaxis_title='Age_group',
    yaxis_title='Count',
    barmode='group',
    bargap=0.2,
    bargroupgap=0.1,
)

# Display the plot
fig.show()

In [19]:
vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}',
                             ngram_range=(1, 1 ),
                             min_df=1,
                             encoding='latin-1',
                             max_features=1000)
Positive_review_count = vect.fit_transform(Positive_review)
DTM_postive = pd.DataFrame(Positive_review_count.toarray(), columns = vect.get_feature_names_out())

In [None]:
import plotly.graph_objects as go
word_freq = DTM_postive.apply(sum).nlargest(30).sort_values(ascending=False)
fig = go.Figure()

fig.add_trace(go.Bar(
    x=word_freq.index,
    y=word_freq.values,
    marker_color='green',  # Set the color of the bars
))

# Customize the layout
fig.update_layout(
    title="Top 30 Words by Frequency in positive Review",
    xaxis_title="Words",
    yaxis_title="Frequency",
    xaxis_tickangle=-45,  # Rotate x-axis labels for better visibility
)

# Show the plot
fig.show()

In [None]:
vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}',
                             ngram_range=(1, 1 ),
                             min_df=1,
                             encoding='latin-1' ,
                             max_features=1000)
Negative_review_count = vect.fit_transform(Negative_review)
DTM_Negative = pd.DataFrame(Negative_review_count.toarray(), columns = vect.get_feature_names_out())

In [None]:
word_freq = DTM_Negative.apply(sum).nlargest(30).sort_values(ascending=False)
fig = go.Figure()

fig.add_trace(go.Bar(
    x=word_freq.index,
    y=word_freq.values,
    marker_color='Red',  # Set the color of the bars
))

# Customize the layout
fig.update_layout(
    title="Top 30 Words by Frequency in Negative Review",
    xaxis_title="Words",
    yaxis_title="Frequency",
    xaxis_tickangle=-45,  # Rotate x-axis labels for better visibility
)

# Show the plot
fig.show()

## Binary Classification Model

In [33]:
## Getting x & y variables
x = data['Review_Text']
y = data['Recommend_Flag']

In [21]:
## Splitting the data intp train & test
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.3, random_state=123)

In [22]:
train_x = train_x.apply(lambda x: text_preprocessing(x))
test_x = test_x.apply(lambda x: text_preprocessing(x))

In [23]:
TFIDF = TfidfVectorizer(analyzer='word',
                             token_pattern=r'\w{1,}',
                             ngram_range=(1, 1 ),
                             min_df=5,
                             encoding='latin-1' ,
                             lowercase = True,
                             max_features=1000)
train_x_TFIDF = TFIDF.fit_transform(train_x)
test_x_TFIDF = TFIDF.transform(test_x)

In [24]:
train_x_DTM = pd.DataFrame(train_x_TFIDF.toarray(), columns=TFIDF.get_feature_names_out())
test_x_DTM = pd.DataFrame(test_x_TFIDF.toarray(), columns=TFIDF.get_feature_names_out())

In [25]:

def train_and_predict_models(train_x, train_y, test_x, test_y):

    # Create the model list
    model_list = [
        ('Random Forest', RandomForestClassifier()),
        ('XG Boost', XGBClassifier()),
        ('Extra Trees', ExtraTreeClassifier()),
        ('Linear SVC', LinearSVC()),
        ('Naive Bayes', MultinomialNB()),
        ('KNN', KNeighborsClassifier())
    ]

    results = {}

    for model_name, model in model_list:
        print(f"Training {model_name}...")
        model.fit(train_x, train_y)

        # Make predictions on training and testing data
        train_pred = model.predict(train_x)
        test_pred = model.predict(test_x)

        # Calculate accuracy for training and testing data
        train_accuracy = accuracy_score(train_y, train_pred)
        test_accuracy = accuracy_score(test_y, test_pred)

        results[model_name] = {
            'train_accuracy': train_accuracy,
            'test_accuracy': test_accuracy
        }

    return results

In [26]:
results = train_and_predict_models(train_x_DTM, train_y, test_x_DTM, test_y)

# Print the results
for model_name, scores in results.items():
    print(f"{model_name} - Train Accuracy: {scores['train_accuracy']:.4f}, Test Accuracy: {scores['test_accuracy']:.4f}")

Training Random Forest...
Training XG Boost...
Training Extra Trees...
Training Linear SVC...
Training Naive Bayes...
Training KNN...
Random Forest - Train Accuracy: 0.9968, Test Accuracy: 0.8622
XG Boost - Train Accuracy: 0.9466, Test Accuracy: 0.8822
Extra Trees - Train Accuracy: 0.9968, Test Accuracy: 0.7988
Linear SVC - Train Accuracy: 0.9065, Test Accuracy: 0.8869
Naive Bayes - Train Accuracy: 0.8572, Test Accuracy: 0.8568
KNN - Train Accuracy: 0.8930, Test Accuracy: 0.8576


### Selecting LinearSVC as the best model as it has almost similar accuracy for train & test dataset

In [None]:
## Performing Grid Search cv for Linear SVC

## Initializing Linear SVC model
LSVC = LinearSVC()

# Define a parameter grid for GridSearchCV
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],  # Regularization parameter C
}

# Initialize GridSearchCV
grid_search = GridSearchCV(LSVC, param_grid, cv=5)

# Training the model using GridSearchCV
grid_search.fit(train_x_DTM, train_y)

# Get the best parameters and the best estimator
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

# Predicting for Train & Test dataset using the best estimator
train_pred_LSVC = best_estimator.predict(train_x_DTM)
test_pred_LSVC = best_estimator.predict(test_x_DTM)

# Printing classification report for Train & Test
print("Classification Report for Train Dataset:")
print(classification_report(train_y, train_pred_LSVC))

print("Classification Report for Test Dataset:")
print(classification_report(test_y, test_pred_LSVC))

# Compute accuracy scores for Train & Test
train_accuracy = accuracy_score(train_y, train_pred_LSVC)
test_accuracy = accuracy_score(test_y, test_pred_LSVC)

print(f"Accuracy Score for Train Dataset: {train_accuracy:.4f}")
print(f"Accuracy Score for Test Dataset: {test_accuracy:.4f}")


### Multinomial Classification Model

In [45]:
## Getting x & y variables
X = data['Review_Text']
Y = data['Rating']

In [46]:
## Splitting the data intp train & test
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size=0.3, random_state=123)

In [47]:
train_X = train_X.apply(lambda x: text_preprocessing(x))
test_X = test_X.apply(lambda x: text_preprocessing(x))

In [85]:
TFIDF = TfidfVectorizer(analyzer='word',
                             token_pattern=r'\w{1,}',
                             ngram_range=(1, 1 ),
                             min_df=5,
                             max_df=0.9,
                             encoding='latin-1' ,
                             lowercase = True,
                             max_features=1200)
train_X_TFIDF = TFIDF.fit_transform(train_X)
test_X_TFIDF = TFIDF.transform(test_X)

In [86]:
train_X_DTM = pd.DataFrame(train_X_TFIDF.toarray(), columns=TFIDF.get_feature_names_out())
test_X_DTM = pd.DataFrame(test_X_TFIDF.toarray(), columns=TFIDF.get_feature_names_out())

In [87]:
def Models(train_x, train_y, test_x, test_y):

    # Create the model list
    model_list = [
        ('Random Forest', RandomForestClassifier()),
        ('Extra Trees', ExtraTreeClassifier()),
        ('Linear SVC', LinearSVC()),
        ('Naive Bayes', MultinomialNB()),
        ('KNN', KNeighborsClassifier())
    ]

    results = {}

    for model_name, model in model_list:
        print(f"Training {model_name}...")
        model.fit(train_x, train_y)

        # Make predictions on training and testing data
        train_pred = model.predict(train_x)
        test_pred = model.predict(test_x)

        # Calculate accuracy for training and testing data
        train_accuracy = accuracy_score(train_y, train_pred)
        test_accuracy = accuracy_score(test_y, test_pred)

        results[model_name] = {
            'train_accuracy': train_accuracy,
            'test_accuracy': test_accuracy
        }

    return results

In [88]:
results = Models(train_X_DTM, train_Y, test_X_DTM, test_Y)

# Print the results
for model_name, scores in results.items():
    print(f"{model_name} - Train Accuracy: {scores['train_accuracy']:.4f}, Test Accuracy: {scores['test_accuracy']:.4f}")

Training Random Forest...
Training Extra Trees...
Training Linear SVC...
Training Naive Bayes...
Training KNN...
Random Forest - Train Accuracy: 0.9894, Test Accuracy: 0.6037
Extra Trees - Train Accuracy: 0.9894, Test Accuracy: 0.4903
Linear SVC - Train Accuracy: 0.7089, Test Accuracy: 0.6338
Naive Bayes - Train Accuracy: 0.6156, Test Accuracy: 0.6043
KNN - Train Accuracy: 0.7007, Test Accuracy: 0.5646


### Topic Mining

In [11]:
topic_data = data['Review_Text']

In [20]:
#nltk.download('stopwords')
#nltk.download('wordnet')
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')

# Function for data preprocessing for topic mining
def TP_preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"[-()\"#/@;:{}`+=~|._!?,'0-9]", "", text)  # Remove punctuation
    tokens = word_tokenize(text)  # Tokenization
    stop_words = set(stopwords.words('english'))
    stop1 = set(list(stop_words)+['always', 'go', 'got', 'could', 'also', 'get', 'us', 'even', 'i', 'm', 'would', 'do', 'go','im','ive'])
    tokens = [token for token in tokens if token not in stop1]  # Remove stop words
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]  # Lemmatization
    return tokens

In [21]:
# Preprocess the data
processed_data = [TP_preprocess_text(text) for text in topic_data]

In [22]:
# Create a dictionary and a document-term matrix
dictionary = corpora.Dictionary(processed_data)
corpus = [dictionary.doc2bow(text) for text in processed_data]

In [32]:
# Creating the object for LDA model using gensim library
lda_model = gensim.models.ldamodel.LdaModel

In [33]:
# Running and Trainign LDA model on the document term matrix.
for topics in range(5,20):
    lda = lda_model(corpus, num_topics=topics, id2word = dictionary)
    print("Perplexity: ", topics, lda.log_perplexity(corpus))

Perplexity:  5 -7.084190683059975
Perplexity:  6 -7.130388920072971
Perplexity:  7 -7.164175977617078
Perplexity:  8 -7.230743094984315
Perplexity:  9 -7.295040575051355
Perplexity:  10 -7.382048052027343
Perplexity:  11 -7.468464003112919
Perplexity:  12 -7.5571810470579734
Perplexity:  13 -7.622245875741787
Perplexity:  14 -7.659920018232805
Perplexity:  15 -7.713655188327546
Perplexity:  16 -7.774748257197995
Perplexity:  17 -7.826069483846015
Perplexity:  18 -7.876462755266721
Perplexity:  19 -7.92670244588422


In [34]:
# LDA Topic Modeling
num_topics = 15  # Specify the number of topics you want to discover
Lda_model = gensim.models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10)

In [35]:
# Print the topics and their top words
for topic_id in range(num_topics):
    print(f"Topic {topic_id + 1}:")
    topic_words = Lda_model.show_topic(topic_id, topn=15)
    print(", ".join([word for word, prob in topic_words]))
    print()

Topic 1:
dress, wear, love, perfect, flattering, beautiful, comfortable, great, fit, slip, summer, well, wearing, work, cant

Topic 2:
xd, wore, first, time, received, day, bought, yellow, one, last, week, review, year, ton, already

Topic 3:
jean, love, great, fit, pant, pair, look, legging, wear, perfect, color, skinny, black, comfortable, bought

Topic 4:
size, store, small, fit, sale, tried, x, ordered, one, retailer, saw, online, wear, medium, usually

Topic 5:
wear, comfortable, great, little, super, work, cute, casual, pant, soft, size, fit, enough, like, love

Topic 6:
&, thread, coral, *, hanging, pear, beach, mother, realized, pink, funny, working, hate, w, impressed

Topic 7:
size, fit, top, small, like, look, ordered, large, dress, fabric, really, back, run, way, didnt

Topic 8:
sleeve, blouse, recommend, highly, real, ruffle, sweatshirt, life, dot, sweet, bell, long, detail, flower, panel

Topic 9:
like, look, fabric, color, picture, much, person, really, photo, model, bac

In [None]:
# Get topic distribution for each document
doc_topics = [Lda_model.get_document_topics(doc) for doc in corpus]
for doc_id, topics in enumerate(doc_topics):
    print(f"Document {doc_id + 1}:")
    for topic_id, prob in topics:
        print(f"Topic {topic_id + 1}: Probability={prob:.4f}")
    print()

In [37]:
# Predefined themes based on top words of each topic
themes = [
    "Dress and Summer Fashion",
    "Recent Purchases and Reviews",
    "Jeans and Pants",
    "Shopping Experiences",
    "Casual and Comfortable Wear",
    "Unique and Quirky Items",
    "Sizing and Fit",
    "Blouses and Tops",
    "Online Shopping Experience",
    "Shirts and Tops",
    "Length and Fit of Skirts",
    "Clothing Care and Washing",
    "Sweaters and Coats",
    "Quality and Jackets",
    "Compliments and Positive Feedback",
]

# Function to get the dominant theme and its probability for each document
def get_dominant_theme(lda_model, doc_term_matrix, themes):
    dominant_themes = []
    for doc in doc_term_matrix:
        topic_probs = lda_model.get_document_topics(doc, minimum_probability=0.0)
        dominant_topic = max(topic_probs, key=lambda x: x[1])[0]
        dominant_themes.append(themes[dominant_topic])
    return dominant_themes

# Get the dominant theme for each document
dominant_themes = get_dominant_theme(Lda_model, corpus, themes)

# Create a DataFrame with the original text reviews and their corresponding dominant themes
df = pd.DataFrame({'Text Review': topic_data, 'Dominant Theme': dominant_themes})

# Print the DataFrame to see the results
df


Unnamed: 0,Text Review,Dominant Theme
0,Absolutely wonderful - silky and sexy and comf...,Dress and Summer Fashion
1,Love this dress! it's sooo pretty. i happene...,Shopping Experiences
2,I had such high hopes for this dress and reall...,Sizing and Fit
3,"I love, love, love this jumpsuit. it's fun, fl...",Quality and Jackets
4,This shirt is very flattering to all due to th...,Shirts and Tops
...,...,...
23481,I was very happy to snag this dress at such a ...,Dress and Summer Fashion
23482,"It reminds me of maternity clothes. soft, stre...",Sizing and Fit
23483,"This fit well, but the top was very see throug...",Shopping Experiences
23484,I bought this dress for a wedding i have this ...,Sizing and Fit
