In [52]:

# @@@@@@@@@@@@@@@@@@@ DATA LOADING @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@

import pandas as pd

# Load data using raw string literal
df = pd.read_excel(r'C:\Users\Sham Sunder Chawla\Desktop\Big Data\15. Capstone Case Study - NLP- Woman Clothing E-Commerce Platform (2)\Womens Clothing Reviews Data.xlsx')



# @@@@@@@@@@@@@@@@@@@ DATA CLEANING @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@


# DATA CLEANING-Step 1: 

# Check for missing values (NaNs) in each column

missing_values_per_column = df.isnull().sum()

# Check for empty strings in each column
empty_strings_per_column = (df == '').sum()

# Combine the counts of missing values and empty strings per column
total_missing_or_empty_per_column = missing_values_per_column + empty_strings_per_column

print("Total Rows with Missing Values or Empty Strings in Each Column:")
print(total_missing_or_empty_per_column)





# Drop rows with missing values or empty strings in specific columns

columns_to_check = ['Category', 'Subcategory1', 'SubCategory2']
df = df.dropna(subset=columns_to_check, how='any')

# Reset index if you want consecutive integer indices after dropping rows

df.reset_index(drop=True, inplace=True)



# DATA CLEANING-Step 2: 


# Iterate over rows and replace empty/missing values in ColumnA with values from ColumnB

for index, row in df.iterrows():
    if pd.isna(row['Review Text']) or row['Review Text'] == '':
        df.at[index, 'Review Text'] = row['Review Title']




# Display the first few rows of the DataFrame
print(df.head())

# Check the shape of the DataFrame
print("Shape of DataFrame:", df.shape)


# @@@@@@@@@@@@@@@ text preprocessing @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@

# Step 1 : LOWERCASING


# Lowercase the 'Text' column
df['Review Text_lowercase'] = df['Review Text'].str.lower()


# Drop the column Review text (e.g., 'ColumnC')

df.drop(columns=['Review Text'], inplace=True)


#2 Step 2 : Tokenize 


import nltk
nltk.download('punkt')
  

from nltk.tokenize import word_tokenize



# Tokenize the 'Review Text_lowercase' column
df['Review Text_Tokens'] = df['Review Text_lowercase'].apply(word_tokenize)





#Step 3 : Remove Punctuations

import pandas as pd
import string



# Function to remove punctuation from a list of tokens

def remove_punctuation(tokens):
    return [token.translate(str.maketrans('', '', string.punctuation)) for token in tokens]

# Remove punctuation from each list of tokens in the 'Review Text_Tokens' column

df['Review Text_Tokens'] = df['Review Text_Tokens'].apply(remove_punctuation)

# Step 4 : Remove Stopwords

nltk.download('stopwords')
from nltk.corpus import stopwords


# Function to remove stop words from a list of tokens

    def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    return [token for token in tokens if token not in stop_words]

# Remove stop words from each list of tokens in the 'Review Text_Tokens' column

df['Review Text_Tokens'] = df['Review Text_Tokens'].apply(remove_stopwords)



# Step 5 : Stemming/Lemmatization

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
nltk.download('wordnet')


# Function to perform lemmatization on a list of tokens

def lemmatize_tokens(tokens):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]

# Lemmatize tokens in the 'Review Text_Tokens' column

df['Review Text_Tokens'] = df['Review Text_Tokens'].apply(lemmatize_tokens)


# Step 6 : Handling Contractions and Abbreviations

!pip install contractions

import pandas as pd
import contractions



# Function to expand contractions in tokenized text

def expand_contractions(tokens):
    expanded_tokens = []
    for token in tokens:
        expanded_token = contractions.fix(token)
        expanded_tokens.extend(expanded_token.split())  # Split if contraction expands to multiple words
    return expanded_tokens

# Apply contraction expansion to the 'Review Text_Tokens' column

df['Review Text_Tokens'] = df['Review Text_Tokens'].apply(expand_contractions)



# Question a) Performing exploratory analysis on the data to understand the patterns


# Flatten the list of tokenized words (create one list from multiple lists)

all_tokens = [token for sublist in df['Review Text_Tokens'] for token in sublist]

from collections import Counter

# Count word frequencies
word_freq = Counter(all_tokens)

import matplotlib.pyplot as plt

# Visualize the top N most common words
N = 10
top_words = word_freq.most_common(N)
labels, values = zip(*top_words)

plt.bar(labels, values)
plt.xticks(rotation=45)
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.title('Top {} Most Common Words'.format(N))
plt.show()


# Question B: Perform text mining tasks to understand what most frequent words are using for positive sentiment and negative sentiment. Create word clouds for the positive & negative reviews separately.

import nltk

from nltk.corpus import sentiwordnet as swn

from nltk.corpus import wordnet as wn


""" Convert between the Penn Treebank tags used by NLTK’s pos_tag() to simple WordNet tags """

def penn_to_wn(tag):
    if tag.startswith('N'):
        return wn.NOUN
    if tag.startswith('V'):
        return wn.VERB
    if tag.startswith('J'):
        return wn.ADJ
    if tag.startswith('R'):
        return wn.ADV
    return None

""" Returns the net sentiment score for a word based on its part of speech tag """

def get_sentiment(word, tag):
    wn_tag = penn_to_wn(tag)
    if wn_tag is None:
        return 0

    lemma = wn.morphy(word, wn_tag)
    if not lemma:
        return 0

    synsets = wn.synsets(word, wn_tag)
    if not synsets:
        return 0

    # Sum up all the sentiment scores of all synsets to get an average
    
    sentiment = [swn.senti_synset(synset.name()) for synset in synsets]
    positive = sum([s.pos_score() for s in sentiment]) / len(sentiment)
    negative = sum([s.neg_score() for s in sentiment]) / len(sentiment)
    return positive - negative

def sentiment_score(tokens):
    """ Compute the sentiment score for pre-tokenized text """
    pos_tags = nltk.pos_tag(tokens)

    scores = [get_sentiment(word, tag) for word, tag in pos_tags]
    scores = [score for score in scores if score != 0]  # Remove zero scores
    if not scores:
        return 0
    return sum(scores)

# Applying it to the DataFrame
df['overall_sentiment'] = df['Review Text_Tokens'].apply(sentiment_score)




# create wordcloud

from wordcloud import WordCloud
import matplotlib.pyplot as plt


# Function to generate word clouds
def generate_wordcloud(text, title):
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(' '.join(text))
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(title)
    plt.axis('off')

# Gather tokens by sentiment orientation
positive_tokens = []
negative_tokens = []

for tokens in df['Review Text_Tokens']:
    pos_tags = nltk.pos_tag(tokens)
    for word, tag in pos_tags:
        orientation = get_sentiment_orientation(word, tag)
        if orientation == 'positive':
            positive_tokens.append(word)
        elif orientation == 'negative':
            negative_tokens.append(word)

# Generate word clouds for positive and negative tokens
generate_wordcloud(positive_tokens, 'Positive Sentiment Words')
generate_wordcloud(negative_tokens, 'Negative Sentiment Words')

plt.show()


# Question C : Understand sentiment among the customers on the different categories, sub categories,products by location and age group

# Aggregating sentiment scores by various dimensions

aggregated_scores = df.groupby(['Category', 'Subcategory1', 'SubCategory2', 'Location', 'Channel']).agg({
    'overall_sentiment': ['mean', 'count', 'std']  # You can adjust the aggregations as needed
}).reset_index()
aggregated_scores.columns = [' '.join(col).strip() for col in aggregated_scores.columns.values]


# Question D : Perform predictive analytics to understand the drivers of customers who are recommending the products


# drivers of recomendation=1

import pandas as pd

# Assuming df is your existing DataFrame

# Filter the DataFrame to include only rows where Recommended_Flag is 1
recommended_df = df[df['Recommend Flag'] == 1]



!pip install pandas nltk afinn


import nltk
nltk.download('sentiwordnet')
nltk.download('wordnet')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, sentiwordnet as swn
from afinn import Afinn


nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')


import string
from nltk.corpus import wordnet

afinn = Afinn()

def find_positive_afinn(tokens):
    positive_words = [word for word in tokens if afinn.score(word) > 0]
    return positive_words

recommended_df['positive_afinn'] = recommended_df['Review Text_Tokens'].apply(find_positive_afinn)


recommended_df['positive_combined'] = recommended_df.apply(lambda row: list(set(row['positive_afinn'] + row['positive_sentiwordnet'])), axis=1)

# Question 5:Create topics and understand themes behind the topics by performing topic mining.


import gensim
from gensim import corpora
from gensim.models import CoherenceModel
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
import pandas as pd

# Install pyLDAvis if not already installed

import sys
get_ipython().system('{sys.executable} -m pip install pyLDAvis gensim pandas')


# Ensure that the column 'Review Text_Tokens' contains lists of tokens

reviews_tokenized = df['Review Text_Tokens'].tolist()

# Create a dictionary representation of the documents.

dictionary = corpora.Dictionary(reviews_tokenized)

# Filter out extremes to limit the number of features

dictionary.filter_extremes(no_below=10, no_above=0.5)

# Create the Bag of Words corpus

corpus = [dictionary.doc2bow(review) for review in reviews_tokenized]

# Set parameters for the LDA model
num_topics = 5
lda_model = gensim.models.LdaModel(
    corpus,
    num_topics=num_topics,
    id2word=dictionary,
    passes=15,
    random_state=42
)

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=reviews_tokenized, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print(f'Coherence Score: {coherence_lda}')

# Print the topics
for idx, topic in lda_model.print_topics(-1):
    print(f'Topic: {idx} \nWords: {topic}\n')

# Prepare the visualization
vis = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.save_html(vis, 'lda_visualization.html')

# To view the visualization
pyLDAvis.display(vis)






#classification model

get_ipython().system('pip install pandas-profiling')

from pandas_profiling import ProfileReport


# Generate the EDA report
profile = ProfileReport(df, title='Pandas Profiling Report', explorative=True)

# Save the report to a file
profile.to_file("eda_report.html")



# drop irrelevant variables

columns_to_average=["Product ID","Review Title"]
df=df.drop(columns_to_average,axis=1)


# In[110]:


import pandas as pd
from sklearn.preprocessing import OneHotEncoder
# List of columns to be one-hot encoded
columns_to_encode = ['Category','Subcategory1','SubCategory2','Location','Channel']

# Perform one-hot encoding
one_hot_encoder = OneHotEncoder(sparse=False, drop=None)
encoded_features = one_hot_encoder.fit_transform(df[columns_to_encode])

# Create a DataFrame with the encoded features
encoded_df = pd.DataFrame(encoded_features, columns=one_hot_encoder.get_feature_names_out(columns_to_encode))

# Drop the original categorical columns and concatenate the encoded columns
df = df.drop(columns=columns_to_encode).reset_index(drop=True)
df = pd.concat([df, encoded_df], axis=1)


# In[112]:


# drop irrelevant variables

columns_to_average=['Review Text_lowercase','Review Text_Tokens']
df=df.drop(columns_to_average,axis=1)


# In[121]:


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Define input features (X) and target variable (y)
X = df.drop(columns=['Recommend Flag'])
y = df['Recommend Flag']


# Split your data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



# In[122]:


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score



# Step 3: Create and train the logistic regression model
logistic_regression_model = LogisticRegression()
logistic_regression_model.fit(X_train, y_train)

# Step 4: Evaluate the model
y_pred = logistic_regression_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)









In [1]:
import pandas as pd

# Load data using raw string literal
df = pd.read_excel(r'C:\Users\Sham Sunder Chawla\Desktop\Big Data\15. Capstone Case Study - NLP- Woman Clothing E-Commerce Platform (2)\Womens Clothing Reviews Data.xlsx')

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23486 entries, 0 to 23485
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Product ID      23486 non-null  int64 
 1   Category        23472 non-null  object
 2   Subcategory1    23472 non-null  object
 3   SubCategory2    23472 non-null  object
 4   Location        23486 non-null  object
 5   Channel         23486 non-null  object
 6   Customer Age    23486 non-null  int64 
 7   Review Title    19676 non-null  object
 8   Review Text     22641 non-null  object
 9   Rating          23486 non-null  int64 
 10  Recommend Flag  23486 non-null  int64 
dtypes: int64(4), object(7)
memory usage: 2.0+ MB


In [4]:
df["Review Title"]=df["Review Title"].fillna("Unknown")

SyntaxError: invalid syntax (2723900923.py, line 1)