 ##### Sentiment analysis is the interpretation and classification of text-based data. The point of this analysis is to categorize each data point into a class that represents its quality (positive, negative, etc.). Sentiment analysis focuses on the polarity, emotions, and intentions of authors. Classic sentiment analysis consists of the following steps: preprocessing, training, feature extraction, and classification.

### Importing data from sqlite database

In [None]:
import pandas as pd

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import sqlite3

In [None]:
con= sqlite3.connect(r"C:\Users\dines\Downloads\database.sqlite")

In [None]:
df= pd.read_sql_query('SELECT* FROM Reviews',con)

In [None]:
df.shape

In [None]:
df.columns

In [None]:
df.head()

In [None]:
df['HelpfulnessNumerator']>df['HelpfulnessDenominator'] ##invalid row

In [None]:
df[df['HelpfulnessNumerator']>df['HelpfulnessDenominator']]

In [None]:
df_valid=df[df['HelpfulnessNumerator']<=df['HelpfulnessDenominator']]

In [None]:
df_valid.shape

In [None]:
df_valid.duplicated(['UserId','ProfileName','Time','Text'])

### Performing Sentiment Analysis on Data

In [None]:
df_valid[df_valid.duplicated(['UserId','ProfileName','Time','Text'])]

In [None]:
data=df_valid.drop_duplicates(subset=['UserId','ProfileName','Time','Text'])

In [None]:
data.shape

In [None]:
data.dtypes

In [None]:
data['Time']=pd.to_datetime(data['Time'],unit='s')

In [None]:
import warnings
from warnings import filterwarnings
filterwarnings('ignore')

 ### What is sentiment analysis?
    Sentiment analysis is the computational task of automatically determining what feelings a writer is expressing in text
    Some examples of applications for sentiment analysis include:

    1. Analyzing the social media discussion around a certain topic
    2. Evaluating survey responses
    3. Determining whether product reviews are positive or negative

    Sentiment analysis is not perfect. It also cannot tell you why a writer is feeling a certain way. However, it can be useful to quickly summarize some qualities of text, especially if you have so much text that a human reader cannot analyze it. For this project, the goal is to classify food reviews based on the customers' text.

In [None]:
!pip install TextBlob


In [None]:
from textblob import TextBlob


In [None]:
text=df['Summary'][0]

In [None]:
text

In [None]:
TextBlob(text).sentiment.polarity

In [None]:
# Score every review summary in `df` with TextBlob's sentiment polarity.
polarity=[]

for summary in df['Summary']:
    try:
        polarity.append(TextBlob(summary).sentiment.polarity)
    except TypeError:
        # TextBlob rejects non-string input (e.g. a missing summary); score
        # those rows as neutral. Catching only TypeError keeps Ctrl-C and
        # genuine failures visible instead of silently turning them into 0.
        polarity.append(0)

In [None]:
len(polarity)

In [None]:
# NOTE(review): this rebinds `data` to a copy of the raw `df`, so the
# invalid-helpfulness filter and duplicate removal applied to the earlier
# `data` are not carried into the analysis below. `polarity` was computed
# over `df`, so the lengths match here — confirm this is intended.
data= df.copy()

In [None]:
data['polarity']=polarity

In [None]:
data.head()

### Perform EDA for positive sentences

In [None]:
data_positive=data[data['polarity']>0]

In [None]:
data_positive.shape

In [None]:
!pip install wordcloud

In [None]:
from wordcloud import WordCloud,STOPWORDS

In [None]:
stopwords=set(STOPWORDS)

In [None]:
# Join with a space so the last word of one summary does not fuse with the
# first word of the next and create bogus tokens in the word cloud.
total_text=' '.join(data_positive['Summary'])

In [None]:
total_text

In [None]:
import re

In [None]:
total_text=re.sub('[^a-zA-Z]',' ',total_text)

In [None]:
total_text=re.sub(' +',' ',total_text)

In [None]:
total_text[0:10000]

In [None]:
# Word cloud of the cleaned positive-summary text, with WordCloud's stopwords removed.
wordcloud= WordCloud(width=1000,height=500,stopwords=stopwords).generate(total_text)
plt.figure(figsize=(15,5))
plt.imshow(wordcloud)
# Hide the pixel-coordinate axes, matching the other word-cloud cells.
plt.axis('off')



### Perform EDA for negative sentences

In [None]:
data_negative=data[data['polarity']<0]

In [None]:
data_negative.shape

In [None]:
# Join with a space so the last word of one summary does not fuse with the
# first word of the next and create bogus tokens in the word cloud.
total_text2=' '.join(data_negative['Summary'])

In [None]:
total_text2=re.sub('[^a-zA-Z]',' ',total_text2)

In [None]:
total_text2=re.sub(' +',' ',total_text2)

In [None]:
wordcloud2= WordCloud(width=1000,height=500,stopwords=stopwords).generate(total_text2)
plt.figure(figsize=(15,5))
plt.imshow(wordcloud2)
plt.axis('off')


## Analysing which customers Amazon should recommend more products to

#### Amazon can recommend more products only to those customers who are likely to buy more, i.e. those with a better conversion rate, so let's prepare the data for this problem statement.



In [None]:
df['UserId'].nunique()

In [None]:
df.head()

In [None]:
raw=df.groupby('UserId').agg({'Summary':'count','Text':'count','Score':'mean','ProductId':'count'}).sort_values(by='Text',ascending=False)

In [None]:
raw

In [None]:
raw.columns=['no_of_summary','num_text','avg_score','no_of_prod_purchased']
raw

In [None]:
user_10=raw.index[0:10]

In [None]:
num_10=raw['no_of_prod_purchased'][0:10]

In [None]:
plt.bar(user_10,num_10,label='most recommended user')
plt.xlabel('UserId')
plt.ylabel('no_of_prod_purchased')
plt.xticks(rotation='vertical')

#### These are the top 10 users, so we can recommend more products to these user IDs, as there is a high probability that these people are going to buy more.




### Which Product has a good number of reviews

In [None]:
data['ProductId'].nunique()

In [None]:
# Number of reviews per product. The column name is pinned to 'ProductId'
# because the default name produced by value_counts() differs between pandas
# versions, and the filtering cells below index the frame by that name.
prod_count=data['ProductId'].value_counts().to_frame(name='ProductId')

In [None]:
prod_count

In [None]:
prod_count[prod_count['ProductId']>500]

In [None]:
freq_prod_ids=prod_count[prod_count['ProductId']>500].index

In [None]:
data['ProductId'].isin(freq_prod_ids)

In [None]:
freq_prod_df=data[data['ProductId'].isin(freq_prod_ids)]

In [None]:
freq_prod_df

In [None]:
freq_prod_df.columns

In [None]:
sns.countplot(y='ProductId', data=freq_prod_df, hue='Score')

### Is there any difference between the behaviour of frequent and infrequent users?

In [None]:
x=data['UserId'].value_counts()

In [None]:
x

In [None]:
# Look up each row's review count through `x` (UserId -> number of reviews) in
# one vectorised pass; users with more than 50 reviews are labelled 'Frequent'.
# A UserId missing from `x` maps to NaN and falls into 'Not Frequent'.
data['viewer_type']=np.where(data['UserId'].map(x)>50,'Frequent','Not Frequent')

In [None]:
data.head(5)

In [None]:
not_freq_viewer=data[data['viewer_type']=='Not Frequent']
freq_viewer=data[data['viewer_type']=='Frequent']

In [None]:
freq_viewer['Score'].value_counts()/len(freq_viewer)*100

In [None]:
not_freq_viewer['Score'].value_counts()/len(not_freq_viewer)*100

In [None]:
freq_viewer['Score'].value_counts().plot(kind='bar')

In [None]:
not_freq_viewer['Score'].value_counts().plot(kind='bar')

### The distribution of ratings among frequent reviewers is similar to that of all reviews. 
### However, we can see that frequent reviewers give fewer 5-star reviews and fewer 1-star reviews.
### Frequent users appear to be more discerning in the sense that they give fewer extreme reviews than infrequent reviewers.

## Are frequent users more verbose?

In [None]:
data['Text'][0]

In [None]:
type(data['Text'][0])

In [None]:
type(data['Text'][0].split(' '))

In [None]:
len(data['Text'][0].split(' '))

In [None]:
def calculate_len(text):
    """Return the number of whitespace-separated words in `text`.

    Splitting on any run of whitespace means repeated spaces, tabs and
    newlines do not inflate the count, and an empty string counts as 0 words.
    """
    return len(text.split())

In [None]:
data['Text_lenght']=data['Text'].apply(calculate_len)

In [None]:
not_freq_data=data[data['viewer_type']=='Not Frequent']
freq_data=data[data['viewer_type']=='Frequent']

In [None]:
not_freq_data

In [None]:
fig=plt.figure()
ax1=fig.add_subplot(121)
ax1.boxplot(freq_data['Text_lenght'])
ax1.set_xlabel('frequency of frequent reviewers')

ax2=fig.add_subplot(122)
ax2.boxplot(not_freq_data['Text_lenght'])
ax2.set_xlabel('frequency of not frequent reviewers')


#### The distributions of word counts for frequent and infrequent reviewers show that 
#### infrequent reviewers have a large number of reviews with a low word count.
#### On the other hand, the largest concentration of word counts is higher for frequent reviewers than for infrequent reviewers. 


### Analyse the length of comments: do customers give lengthy comments or short ones?


In [None]:
# Work on the first 2000 reviews. Take an explicit copy because columns of
# `final` are assigned to below; mutating a bare slice of `df` is a
# chained-assignment hazard.
final=df[0:2000].copy()

In [None]:
final.head()

In [None]:
final.isnull().sum()

In [None]:
final.duplicated().sum()

In [None]:
len(final['Text'][0].split(' '))

In [None]:
def calc_length(text):
    """Return the number of whitespace-separated words in `text`.

    Splitting on any run of whitespace means repeated spaces, tabs and
    newlines do not inflate the count, and an empty string counts as 0 words.
    """
    return len(text.split())

In [None]:
final['Text_length']=final['Text'].apply(calc_length)

In [None]:
import plotly.express as px

In [None]:
px.box(final,y='Text_length')

#### Conclusion-->>
    It seems that almost 50 percent of users limit their feedback to about 50 words, whereas only a few users give lengthy feedback.

### Analysing score

In [None]:
# Pass the scores as `x=` explicitly: recent seaborn releases treat the first
# positional argument as `data`, not as the variable to count.
sns.countplot(x=final['Score'])

### Analysing behavior of customers

### Text preprocessing

In [None]:
final['Text'][0]

In [None]:
final['Text']=final['Text'].str.lower()

In [None]:
data=final['Text'][164]

In [None]:
# Strip a hand-picked set of punctuation characters from one sample review.
punctuation = '''@!#$%^*()<:;{}?/[]'''
data = final['Text'][164]
# Keep every character that is not in the punctuation set.
no_punc = ''.join(ch for ch in data if ch not in punctuation)
no_punc
    

In [None]:
import string
punctuations=string.punctuation

# Translation table that deletes every character in `punctuations`; built once
# so each call is a single pass over the review instead of repeated string
# concatenation.
_PUNC_TABLE = str.maketrans('', '', punctuations)

def remove_punc(review):
    """Return `review` with every `string.punctuation` character removed.

    All other characters, including whitespace, are kept unchanged.
    """
    return review.translate(_PUNC_TABLE)

In [None]:
final['Text']=final['Text'].apply(remove_punc)

In [None]:
final.head()


In [None]:
import nltk
from nltk.corpus import stopwords

In [None]:
data=final['Text'][164]

In [None]:
data

In [None]:
# Drop English stopwords from the sample review. The stopword set is built
# once rather than once per word, and the variable names avoid shadowing the
# `re` module and the built-in `str`.
english_stopwords = set(stopwords.words('english'))
kept_words = [word for word in data.split(' ') if word not in english_stopwords]

# Leading space, then each kept word followed by a single space.
cleaned_sample = ' ' + ''.join(word + ' ' for word in kept_words)

cleaned_sample

In [None]:
def remove_stopword(review):
    """Return `review` with English NLTK stopwords removed.

    The text is split on single spaces and the surviving words are re-joined
    with single spaces.
    """
    # Build the stopword set once per call, not once per word: each
    # `stopwords.words('english')` call re-creates the whole list.
    english_stopwords = set(stopwords.words('english'))
    return ' '.join(word for word in review.split(' ') if word not in english_stopwords)

In [None]:
final['Text']=final['Text'].apply(remove_stopword)

In [None]:
final['Text'][45]

In [None]:
final['Text'].str.contains('http').sum()

In [None]:
pd.set_option('display.max_row',2000)
final['Text'].str.contains('http')


In [None]:
review=final['Text'][21]

In [None]:
review

In [None]:
import re

In [None]:
# Remove the literal token 'href' and any run of non-space characters starting
# with 'http'. `\S*` keeps the match inside one token, so a following space
# and word are never swallowed.
url_pattern=re.compile(r'href|http\S*')
url_pattern.sub(r'',review)

In [None]:
def remove_url(review):
    """Return `review` with 'href' and every 'http...' token removed.

    A URL token is 'http' followed by any run of non-space characters, so the
    match never extends across a space into the following word.
    """
    url_pattern=re.compile(r'href|http\S*')
    return url_pattern.sub(r'',review)

In [None]:
final['Text']=final['Text'].apply(remove_url)

In [None]:
final['Text'][34]

In [None]:
final['Text'][34].replace(' br ','')

In [None]:
# Remove leftover ' br ' tokens (remnants of HTML line breaks) from every review.
# str.replace returns a new string, so the result must be assigned back to the
# column. A single space is substituted so the neighbouring words stay separate.
final['Text']=final['Text'].str.replace(' br ',' ',regex=False)
    

In [None]:
final.head()

In [None]:
comment_words=' '.join(final['Text'])

In [None]:
stopwords=set(STOPWORDS)

In [None]:
wordcloud3= WordCloud(width=1000,height=500,stopwords=stopwords).generate(comment_words)
plt.figure(figsize=(15,5))
plt.imshow(wordcloud3)
plt.axis('off')
