In [None]:
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import wordcloud
from nltk import word_tokenize, FreqDist
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from PIL import Image
from textblob import TextBlob


In [None]:
# Load the raw Flipkart review data and preview the first rows.
# NOTE(review): the path has no ".csv" extension — confirm the file name.
RAW_DATA_PATH = "flipkart_product"
df = pd.read_csv(RAW_DATA_PATH)
df.head()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# (row count, column count) of the loaded frame.
df.shape

In [None]:
#remove punctuations
import string
def rmv_punc(text):
    """Return ``text`` with every ``string.punctuation`` character removed.

    Args:
        text: The cell value to clean. Normally a string; missing values
            (None / NaN) and other non-string scalars are tolerated so the
            function can be applied to a column that contains gaps.

    Returns:
        The cleaned string. Missing values yield "" and other non-string
        values are converted with ``str()`` before cleaning.
    """
    if not isinstance(text, str):
        # Without this guard a NaN cell raises AttributeError on .translate.
        if pd.isna(text):
            return ""
        text = str(text)
    return text.translate(str.maketrans('', '', string.punctuation))

In [None]:
# Strip punctuation from the summary text and from the price strings.
for column in ('Summary', 'Price'):
    df[column] = df[column].apply(rmv_punc)

In [None]:
# Preview the frame after punctuation has been stripped from Summary and Price.
df.head()

In [None]:
# Replace every character other than letters, digits, '(', '/' and ')' with a space, then collapse whitespace
import re
def rmv_char(text):
    """Replace disallowed characters with spaces and collapse whitespace.

    Every character other than ASCII letters, digits, ``(``, ``/`` and ``)``
    becomes a space; each run of whitespace is then squeezed to one space.
    Leading and trailing spaces are not stripped.

    Args:
        text: The cell value to clean. Normally a string; missing values
            (None / NaN) and other non-string scalars are tolerated so the
            function can be applied to a column that contains gaps.

    Returns:
        The cleaned string. Missing values yield "" and other non-string
        values are converted with ``str()`` before cleaning.
    """
    if not isinstance(text, str):
        # Without this guard a NaN cell makes re.sub raise TypeError.
        if pd.isna(text):
            return ""
        text = str(text)
    text = re.sub(r'[^a-zA-Z0-9(/)]', ' ', text)
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    return re.sub(r'\s+', ' ', text)

In [None]:
# Clean and lowercase the text columns; ProductName is cleaned but keeps its casing.
for column in ('Summary', 'Review', 'Price'):
    df[column] = df[column].apply(lambda value: rmv_char(value).lower())
df['ProductName'] = df['ProductName'].apply(rmv_char)


In [None]:
# Preview the frame after the character clean-up and lowercasing.
df.head()

In [None]:
# Persist the cleaned frame to disk.
CLEAN_DATA_PATH = "clean_flipkart_Products_Data.csv"
df.to_csv(CLEAN_DATA_PATH)

Sentiment analysis (VADER)

In [None]:
# Score every summary with NLTK's VADER analyzer.
# SentimentIntensityAnalyzer comes from nltk.sentiment.vader and needs the
# "vader_lexicon" NLTK resource (nltk.download("vader_lexicon")) to be present.
sentiments = SentimentIntensityAnalyzer()

# Call polarity_scores() once per row and reuse the result for all four columns.
vader_scores = [sentiments.polarity_scores(text) for text in df["Summary"]]
df["Positive"] = [row["pos"] for row in vader_scores]
df["Negative"] = [row["neg"] for row in vader_scores]
df["Neutral"] = [row["neu"] for row in vader_scores]
df["Compound"] = [row["compound"] for row in vader_scores]

# Label each row from its compound score:
# >= 0.05 is Positive, <= -0.05 is Negative, anything in between is Neutral.
score = df["Compound"].values
sentiment = [
    'Positive' if value >= 0.05 else 'Negative' if value <= -0.05 else 'Neutral'
    for value in score
]
df["Sentiment"] = sentiment

In [None]:
# Drop the four intermediate VADER score columns, keeping only the Sentiment label.
# Selecting by name rather than by position (5-8) keeps this correct even if the
# CSV carries an extra or missing column.
df.drop(columns=["Positive", "Negative", "Neutral", "Compound"], inplace=True)
df.head()

In [None]:
# Save the frame with its VADER sentiment label.
# The frame is named `df` in this notebook; `data` was never defined.
df.to_csv("Sentiment_product.csv")

Data Preparation

In [None]:
# Column dtypes and non-null counts before the preparation steps below.
df.info()

In [None]:
def extract_parentheses(text):
    """Return the text between the last '(' and the last ')' in ``text``.

    Returns "" when either parenthesis is absent. If the last ')' comes before
    the last '(', the slice is empty and "" is returned as well.
    """
    open_idx, close_idx = text.rfind('('), text.rfind(')')
    if -1 in (open_idx, close_idx):
        return ""
    return text[open_idx + 1:close_idx]

# Derive ProductFeature from the last parenthesised part of each product name.
df['ProductFeature'] = df['ProductName'].map(extract_parentheses)
df.head()

In [None]:
#removing special characters
def extract_price(p):
    """Return all digit runs found in ``p`` concatenated into one string.

    Returns "" when ``p`` contains no digits.
    """
    digit_runs = re.findall(r'\d+', p)
    return ''.join(digit_runs)

# Make sure prices are strings, then keep only their digits in a new column.
price_text = df['Price'].astype(str)
df['Price'] = price_text
df['ExtractedPrice'] = price_text.apply(extract_price)
df.head()

In [None]:
# The raw Price column is no longer needed once ExtractedPrice exists.
df.drop(columns=['Price'], inplace=True)
df.head()

In [None]:
# Inspect the distinct values present in the Rate column.
df['Rate'].unique()

In [None]:
# Rating labels, compared as strings against the Rate column.
num = [str(rating) for rating in (5, 3, 1, 4, 2)]

# Show the rows whose Rate is not one of those labels.
df.loc[~df['Rate'].isin(num)]

In [None]:
# Drop the rows whose Rate is not one of the labels in `num`, then re-check
# which values remain.
invalid_rate_index = df.index[~df['Rate'].isin(num)]
df.drop(invalid_rate_index, inplace=True)
df['Rate'].unique()

Sentiment analysis (TextBlob)

In [None]:
# Cast Summary to str so every value handed to TextBlob below is a string.
df['Summary'] = df['Summary'].astype(str)

In [None]:
# Run TextBlob on every summary and keep both sentiment measures.
blob_sentiments = [TextBlob(review).sentiment for review in df['Summary']]
subjectivity_list = [result.subjectivity for result in blob_sentiments]
polarity_list = [result.polarity for result in blob_sentiments]

# Attach the scores to the frame as new columns.
df['Subjectivity'] = subjectivity_list
df['Polarity'] = polarity_list
df.head()

In [None]:
#If polarity is greater than 0, the text is positive; it is negative if the value is less than 0; it is neutral when the value is equal to 0.
#Subjectivity has a range from 0.0 to 1.0. A higher value means the text is more subjective.

In [None]:
def sentiment_analysis(polarity):
    """Map a polarity score to a sentiment label.

    Returns 'Positive' for values above 0, 'Negative' for values below 0 and
    'Neutral' otherwise (this includes exactly 0).
    """
    if polarity > 0:
        return 'Positive'
    if polarity < 0:
        return 'Negative'
    return 'Neutral'
# Replace the Sentiment column with the label derived from TextBlob polarity.
df['Sentiment'] = df['Polarity'].map(sentiment_analysis)
df.head()

visualization

In [None]:
# Scatter of polarity against subjectivity, with a line at each mean.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(df['Polarity'], df['Subjectivity'], color='#C5D4EB')
ax.axhline(y=df['Subjectivity'].mean(), color='#6C85BD')
ax.axvline(x=df['Polarity'].mean(), color='#5B61A1')
ax.set_xlabel("Polarity")
ax.set_xlim((-1, 1))
ax.set_ylabel("Subjectivity")
ax.set_ylim((0, 1))
plt.show()

In [None]:
# Number of reviews per sentiment label.
sns.countplot(data=df, x="Sentiment", color='#C5D4EB')

Word Cloud

In [None]:
# English stop words plus extra words to leave out of the word cloud.
ignored_words = set(stopwords.words("english"))
ignored_words.update((
    "bad",
    "good",
    "flipkart",
    "also",
    "like",
    "item",
    "one",
    "much",
    "even",
    "product",
))

In [None]:
# Collect the alphabetic, non-ignored tokens of every negative summary and join
# them into one string `s` for the word cloud.
tokens = []
for r in df.loc[df["Sentiment"] == "Negative", "Summary"]:
    r = r.lower()
    # Remove hyperlinks. "https?" covers both http and https; the earlier
    # "http?" made the "p" optional and never matched https links.
    r = re.sub(r"https?://\S+", " ", r)
    # Remove single letters that stand alone between whitespace.
    r = re.sub(r"\s+[a-zA-Z]\s+", " ", r)
    tokens.extend(
        token
        for token in word_tokenize(r)
        if token.isalpha() and token not in ignored_words
    )
s = " ".join(tokens)

In [None]:
# Word cloud of the negative-review tokens collected in `s`.
# Open the image in a context manager so the file handle is closed, and resize
# it to the 600x600 canvas used by the cloud.
with Image.open("/kaggle/input/dislike/dislike.png") as mask_image:
    mask = np.array(mask_image.resize((600, 600)))

wc = wordcloud.WordCloud(background_color='white',
                         width=600,
                         max_words=1000,
                         height=600,
                         stopwords=ignored_words)
wc.generate(s)

# NOTE(review): the image is only used as a colour source here; pass mask=mask
# to WordCloud if the cloud is also meant to take the image's shape.
image_colors = wordcloud.ImageColorGenerator(mask)
plt.imshow(wc.recolor(color_func=image_colors), interpolation="bilinear")
plt.axis("off")
plt.show()

The customers mainly complain about the quality, value for money, size, and time.