# Sentiment Analysis on the reviews against Jio network

In this project, we analyse reviews of the Jio network operator and classify each one by satisfaction level: "Satisfied", "Unsatisfied", or "Neutral".

### Link for the Project

https://drive.google.com/drive/folders/1WJvYR2JOuJ4FD4KAc4e3DJjPYZUanOV5?usp=sharing

## Importing all required modules

In [1]:
# pip install seaborn
# pip install numpy
# pip install pandas
# pip install mlxtend
# pip install scipy
# pip install geopy
# pip install basemap
# (os is part of the Python standard library; no install needed)
# pip install category_encoders

In [None]:
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
from scipy import stats
from mlxtend.preprocessing import minmax_scaling
from scipy.stats import norm
import statistics

In [None]:
df = pd.read_csv("Assgn1-Data-111903129_111903130_111903131_old.csv")

## Scraped Data Attributes

In [None]:
df

In [None]:
print(df.dtypes)

## Null Values

In [None]:
df.isnull().sum()

# Data Cleaning

## Instance of Geopy

In [None]:
from geopy.geocoders import Nominatim
# Smoke test of the geocoder: look up one known city and print its coordinates.
loc = Nominatim(user_agent="GetLoc")
getLoc = loc.geocode("Mumbai,India")
# geocode() returns None when the service finds no match, so guard before
# reading attributes instead of failing with an AttributeError.
if getLoc is None:
    print("No geocoding result for 'Mumbai,India'")
else:
    print(getLoc.latitude,getLoc.longitude)

# Updated Dataset

In [None]:
df = pd.read_csv("Assign1-Data-111903129_111903130_111903131.csv")

In [None]:
df

In [None]:
print(df.dtypes)

In [None]:
# Per-column count of missing values (same summary as for the first dataset)
# rather than the full boolean frame, which is too large to read.
df.isnull().sum()

# Heatmap of the Correlation

In [None]:
# Correlate numeric/boolean columns only: DataFrame.corr() raises on pandas >= 2.0
# when the frame contains non-numeric columns (older versions dropped them silently).
sns.heatmap(df.select_dtypes(include=["number", "bool"]).corr())

In [None]:
# Annotated version of the correlation heatmap. Restricted to numeric/boolean
# columns because DataFrame.corr() raises on non-numeric columns on pandas >= 2.0.
dataplot = sns.heatmap(df.select_dtypes(include=["number", "bool"]).corr() , cmap="YlGnBu", annot=True)

In [None]:
# sns.pairplot(df,hue='Year')

## Geo-spatial Mapping of the reviewer's location

In [None]:
import os

from mpl_toolkits.basemap import Basemap
from matplotlib import cm
import numpy as np
from numpy import array

In [None]:
# Scatter the reviewers' coordinates on a Lambert-conformal map bounded by
# longitude 67..99 and latitude 5..37.
plt.subplots(figsize=(20, 15))
# Named `india_map` instead of `map` so the builtin map() is not shadowed for
# the rest of the notebook.
india_map = Basemap(width=1200000,height=900000,projection='lcc',resolution='l',
                    llcrnrlon=67,llcrnrlat=5,urcrnrlon=99,urcrnrlat=37,lat_0=28,lon_0=77)

india_map.drawmapboundary()
india_map.drawcountries()
india_map.drawcoastlines()

lg=array(df['Longitude'])
lt=array(df['Latitude'])

# Calling the Basemap instance projects lon/lat degrees to map x/y coordinates.
x, y = india_map(lg, lt)
# `cmap` removed: without a `c=` data array matplotlib ignores it and warns.
plt.scatter(x, y, marker="o", alpha=0.7)
plt.title('JIO NETWORK REVIEWS SCATTERPLOT',fontsize=20)

# Outliers Removed Data Set

In [None]:
mydata = pd.read_csv("Outlier_removed_year.csv")

In [None]:
# Same map as for the raw data, now drawn from the outlier-removed frame
# (`mydata`), bounded by longitude 67..99 and latitude 5..37.
plt.subplots(figsize=(20, 15))
# Named `india_map` instead of `map` so the builtin map() is not shadowed for
# the rest of the notebook.
india_map = Basemap(width=1200000,height=900000,projection='lcc',resolution='l',
                    llcrnrlon=67,llcrnrlat=5,urcrnrlon=99,urcrnrlat=37,lat_0=28,lon_0=77)

india_map.drawmapboundary()
india_map.drawcountries()
india_map.drawcoastlines()

lg=array(mydata['Longitude'])
lt=array(mydata['Latitude'])

# Calling the Basemap instance projects lon/lat degrees to map x/y coordinates.
x, y = india_map(lg, lt)
# `cmap` removed: without a `c=` data array matplotlib ignores it and warns.
plt.scatter(x, y, marker="o", alpha=0.7)
plt.title('JIO NETWORK REVIEWS SCATTERPLOT',fontsize=20)

In [None]:
import category_encoders as ce
# `data` is the State column of the outlier-removed frame, as a Series.
data=mydata["State"]
# One-hot encoder configured for a column called 'State'; it is fitted on `data`
# in the next cell.
# NOTE(review): `data` is a Series, not a DataFrame — confirm the encoder
# resolves cols='State' against it as intended.
encoder=ce.OneHotEncoder(cols='State',handle_unknown='return_nan',return_df=True,use_cat_names=True)

#Original Data
data.head()

In [None]:
data_encoded = encoder.fit_transform(data)
data_encoded

In [None]:
# Bar chart of how many reviews come from each state.
plt.rcParams["figure.figsize"] = [15, 7]
plt.rcParams["figure.autolayout"] = True
fig, ax = plt.subplots()
# Count straight from the `data` Series. The previous version wrapped it in a
# new frame named `df`, which silently replaced the dataset loaded earlier.
state_counts = data.value_counts()
state_counts.plot(ax=ax, kind='bar', xlabel='States', ylabel='Frequency')
plt.show()

# Reviews from different Parts

In [None]:
# Data to plot
# Hard-coded review totals per region, in the order the slices are drawn.
region_counts = {
    "Central": 3965,
    "East": 1890,
    "West": 1791,
    "South": 1606,
    "North": 2607,
}
labels = tuple(region_counts.keys())
sizes = list(region_counts.values())
colors = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue','red']
# Pull only the first slice (Central) away from the centre.
explode = (0.2, 0, 0, 0, 0)

plt.pie(
    sizes,
    explode=explode,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    shadow=True,
)

# Equal aspect ratio so the pie is drawn as a circle.
plt.axis('equal')
plt.show()

# PreProcessing on Comments

In [None]:
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression


In [None]:
mydata = pd.read_csv('Outlier_removed_year.csv')

In [None]:
import re
def clean(text):
    """Return `text` lower-cased with every run of non-letter characters
    (digits, punctuation, whitespace, ...) collapsed into a single space."""
    letters_only = re.sub('[^A-Za-z]+', ' ', text)
    return letters_only.lower()

# Cleaning the text in the review column
# NOTE(review): clean() passes each value to re.sub, which raises TypeError on
# non-string input — confirm 'Comments' has no missing values.
X = mydata['Comments'].apply(clean)
X.head()

In [None]:
import nltk
#nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk import pos_tag
#nltk.download('stopwords')
from nltk.corpus import stopwords
#nltk.download('wordnet')
from nltk.corpus import wordnet
#nltk.download('averaged_perceptron_tagger')

In [None]:
# Maps the first letter of a tag returned by pos_tag to the matching WordNet
# part-of-speech constant; any other first letter maps to None.
pos_dict = {'J':wordnet.ADJ, 'V':wordnet.VERB, 'N':wordnet.NOUN, 'R':wordnet.ADV}
def token_stop_pos(text):
    """Tokenize `text`, drop English stop words and tag the remaining words.

    Returns a list of ``(word, wordnet_pos)`` tuples, where ``wordnet_pos`` is
    looked up in ``pos_dict`` and is None when the tag has no WordNet
    equivalent there.
    """
    # Build the stop-word set once per call; it used to be rebuilt for every token.
    stop_words = set(stopwords.words('english'))
    return [
        (word, pos_dict.get(tag[0]))
        for word, tag in pos_tag(word_tokenize(text))
        if word.lower() not in stop_words
    ]

from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
def lemmatize(pos_data):
    """Join the lemmas of ``(word, pos)`` pairs into one string.

    Words with a falsy ``pos`` are kept unchanged; the others go through the
    WordNet lemmatizer. The result starts with one space and every lemma is
    preceded by one more space.
    """
    pieces = [" "]
    for token, wn_pos in pos_data:
        if wn_pos:
            token = wordnet_lemmatizer.lemmatize(token, pos=wn_pos)
        pieces.append(" " + token)
    return "".join(pieces)

In [None]:
# X = X.apply(token_stop_pos)
# X.head()

# Demo on the first review only (the full-column run above is commented out).
# NOTE(review): X is rebound from itself, so re-running this cell feeds the
# already-tokenized row back into token_stop_pos; re-run the cleaning cell first.
X = X[0:1].apply(token_stop_pos)
X

In [None]:
# X = X.apply(lemmatize)
# X.head()
# Prints the lemmatized text for the row(s) currently in X without rebinding X.
print(X.apply(lemmatize))

## Saving the Lemmatized data to reduce Time

In [None]:
#mydata.to_csv('finalize.csv')

In [None]:
# Load the previously saved lemmatized comments from a shared Google Drive
# download link (needs network access). Later cells read its "Lemma" column.
Lemmatize = pd.read_csv('https://drive.google.com/uc?export=download&id=1LLBtx9seVzpYIPH39-uJNKfiP_RLIRGJ')
Lemmatize.head()

# Hashing (Word Level) the Lemmatized Comments

In [None]:
import pandas as pd 

In [None]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

In [None]:
X = Lemmatize["Lemma"]

In [None]:
# Work on the first 100 lemmatized comments only.
x = list(X)[:100]

In [None]:
def shingle(text: str):
    """Return the set of distinct whitespace-separated words in `text`."""
    return set(text.split())


# Vocabulary: the union of the word sets of all sampled comments.
# NOTE(review): `b` is rebound to an int by later LSH cells (`b = 20`,
# `for b in ...`), while create_hash_func/create_hash read this global through
# len(b); re-running those helpers after that point fails.
b = set()
for row in x:
    a = shingle(row)
    b = b.union(a)
    
# Maximum number of rows the matrix-printing helpers below will show.
limit= 10
b 

In [None]:
# One-hot encode every comment against the vocabulary `b` (one column per word,
# in the set's iteration order).
hot_list = []

for row in x:
    # Test membership against the comment's words. `term in row` on the raw
    # string was a substring test, so e.g. "in" matched inside "using".
    row_words = set(row.split())
    h1 = [1 if term in row_words else 0 for term in b]
    hot_list.append(h1)
signature = []

# The full matrix is len(x) x len(b); show only the first `limit` rows.
print(hot_list[:limit])

In [None]:
from random import shuffle
def create_hash_func(size: int):
    """Return a random permutation of the integers 1..size as a list.

    Uses the `size` argument; the previous version ignored it and read the
    length of the global vocabulary `b` instead.
    """
    hash_ex = list(range(1, size + 1))
    shuffle(hash_ex)
    return hash_ex

def build_minhash_func(vocab_size: int, nbits: int):
    """Return `nbits` independent hash vectors, each produced by
    create_hash_func(vocab_size)."""
    return [create_hash_func(vocab_size) for _ in range(nbits)]


def create_hash(vector: list, hash_funcs=None):
    """Compute the MinHash signature of a one-hot `vector`.

    For each hash vector (a permutation of 1..len(vector)) the signature value
    is the smallest permutation value found at a position where `vector` is 1.
    Hash vectors for which `vector` has no 1 at all contribute nothing, so an
    all-zero vector yields an empty signature.

    Parameters:
        vector: one-hot list over the vocabulary.
        hash_funcs: hash vectors to use; defaults to the global `minhash_func`.

    Returns:
        list of ints, at most one per hash vector.
    """
    if hash_funcs is None:
        hash_funcs = minhash_func
    signature = []
    for func in hash_funcs:
        # One linear pass replaces the former func.index(i) scan per candidate
        # value, and no longer depends on the global vocabulary `b`.
        hit_values = [value for value, bit in zip(func, vector) if bit == 1]
        if hit_values:
            signature.append(min(hit_values))
    return signature

# we create 20 minhash vectors
# NOTE(review): the shuffle behind these vectors is unseeded, so the signatures
# (and everything derived from them) change from run to run.
minhash_func = build_minhash_func(len(b), 20)
print(len(minhash_func))

In [None]:
# One MinHash signature per one-hot row, in the same order as hot_list.
signature_matrix = [create_hash(one_hot) for one_hot in hot_list]

print('--------------------------------------------------------------------------------')
def matrix_print(m):
    """Print at most the first `limit` rows of `m`, each prefixed by three tabs."""
    for row_idx in range(min(len(m), limit)):
        print("\t\t\t", m[row_idx], sep="")

print("Signature Matrix :")
matrix_print(signature_matrix)

In [None]:
def jaccard(a: set, b: set):
    """Return the Jaccard similarity |a ∩ b| / |a ∪ b| of two sets.

    Two empty sets are treated as identical (similarity 1.0) instead of
    raising ZeroDivisionError.
    """
    union = a.union(b)
    if not union:
        return 1.0
    return len(a.intersection(b)) / len(union)



print("Jaccard Similarity : \n")
# Compare the first comment's signature with each of the first `limit`
# signatures. Only those rows are printed, so only those are computed
# (the unused `a = x[0]` assignment is gone).
for i in range(min(limit, len(hot_list))):
    a = jaccard(set(signature_matrix[0]),set(signature_matrix[i]))
    print(f"\t\tJaccard similarity of comment1 with comment {(i+1)} :", round(a,2))

In [None]:
def cosine(a,b):
    """Return the cosine similarity of the word-count vectors of two strings.

    Both strings are split on whitespace. The previous ``split(" ")`` produced
    empty-string tokens for leading or repeated spaces, which every pair then
    appeared to share. Returns 0.0 when either string contains no words.
    """
    # token -> [occurrences in a, occurrences in b]
    counts = {}
    for token in a.split():
        counts.setdefault(token, [0, 0])[0] += 1
    for token in b.split():
        counts.setdefault(token, [0, 0])[1] += 1

    # Explicit accumulation: the builtin sum() is rebound to a number by later
    # cells of this notebook, so it is not relied on here.
    dot = 0
    norm_a_sq = 0
    norm_b_sq = 0
    for count_a, count_b in counts.values():
        dot += count_a * count_b
        norm_a_sq += count_a * count_a
        norm_b_sq += count_b * count_b

    if norm_a_sq == 0 or norm_b_sq == 0:
        return 0.0
    return dot / (norm_a_sq ** (1/2) * norm_b_sq ** (1/2))
print("Cosine Similarity ")
# Only the first `limit` comparisons are printed, so only those are computed.
for i in range(min(limit, len(hot_list))):
    a = cosine(x[0],x[i])
    print(f"\t\tcosine similarity of comment1 with comment {(i+1)} :", round(a,2))


In [None]:
# Number of bands each signature is split into.
band = 20
# NOTE(review): `row` is never read. Signatures are built from 20 hash vectors,
# so 20 bands leave 1 value per band, not 5 — confirm the intended banding.
row = 5

# Per-comment list of band sub-vectors, filled by the loop below.
lsh_matrix =[]

def split_vector(signature, b):
    """Split `signature` into `b` consecutive chunks of equal length.

    Asserts that the length of `signature` is divisible by `b`.
    """
    assert len(signature) % b == 0
    r = len(signature) // b
    return [signature[start : start + r] for start in range(0, len(signature), r)]

# Band every signature, one entry per row of hot_list.
for row_idx, _ in enumerate(hot_list):
    lsh_matrix.append(split_vector(signature_matrix[row_idx], band))
print("LSH Matrix :")
def th_matrix_print(m):
    """Print at most the first `limit` entries of `m`, each prefixed by three tabs."""
    for row_idx in range(min(len(m), limit)):
        print("\t\t\t", m[row_idx], sep="")

In [None]:
th_matrix_print(lsh_matrix)

In [None]:
from itertools import combinations
import numpy as np
class LSH:
    """Banded locality-sensitive-hashing index over MinHash signatures.

    Each added signature is cut into `b` bands and every band is stored in that
    band's bucket dictionary under its comma-joined string form. Signatures
    that share a bucket in any band become candidate pairs.
    """

    def __init__(self, b):
        """Create an empty index with `b` bands."""
        self.b = b
        # Per-instance state. These used to be class attributes, so every LSH
        # instance appended to (and read from) one shared bucket list.
        self.buckets = [{} for _ in range(b)]
        self.counter = 0

    def make_subvecs(self, signature):
        """Return `signature` as a 2-D numpy array with one row per band.

        Asserts that the signature length is divisible by the number of bands.
        """
        l = len(signature)
        assert l % self.b == 0
        r = int(l / self.b)
        # break signature into subvectors
        subvecs = []
        for i in range(0, l, r):
            subvecs.append(signature[i:i+r])
        return np.stack(subvecs)

    def add_hash(self, signature):
        """Index `signature` under the next sequential id (0, 1, 2, ...)."""
        subvecs = self.make_subvecs(signature).astype(str)
        for i, subvec in enumerate(subvecs):
            # Bucket key: the band's values joined as text, e.g. "3,17".
            subvec = ','.join(subvec)
            self.buckets[i].setdefault(subvec, []).append(self.counter)
        self.counter += 1

    def check_candidates(self):
        """Return the set of id pairs that share a bucket in at least one band."""
        candidates = []
        for bucket_band in self.buckets:
            for hits in bucket_band.values():
                if len(hits) > 1:
                    candidates.extend(combinations(hits, 2))
        return set(candidates)

In [None]:
# `band`, split_vector and th_matrix_print are already defined by the earlier
# LSH cell; redefining them here only shadowed identical definitions.
# Rebuild the banded signatures and show the first rows.
lsh_matrix = [split_vector(signature_row, band) for signature_row in signature_matrix]
print("LSH Matrix :")
th_matrix_print(lsh_matrix)

In [None]:
# Use the existing `band` setting (20) rather than rebinding `b`, which holds
# the vocabulary set that the MinHash helpers read.
lsh = LSH(band)

# Index every signature; ids follow the order of signature_matrix.
for signature in signature_matrix:
    lsh.add_hash(signature)
print("Buckets:")
candidate_pairs = lsh.check_candidates()
th_matrix_print(lsh.buckets)

In [None]:
# Build one record per comment pair (i <= j) with both similarity measures and
# whether LSH flagged the pair as a candidate.
# Records are collected in a list and turned into a frame once:
# DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.
pair_records = []
for i in range(len(hot_list)):
    for j in range(i,len(hot_list)):
        candidate = 1 if (i,j) in candidate_pairs else 0
        pair_records.append({
            'x': i,
            'y': j,
            'jaccard': jaccard(set(signature_matrix[i]), set(signature_matrix[j])),
            'cosine': cosine(x[i],x[j]),
            'candidate': candidate
        })

pairs = pd.DataFrame(pair_records, columns=['x', 'y', 'jaccard', 'cosine', 'candidate'])

In [None]:
# Min-max scale the cosine scores into a new 'cosine_norm' column.
cosine_scores = pairs['cosine']
cos_min = cosine_scores.min()
cos_max = cosine_scores.max()
pairs['cosine_norm'] = (cosine_scores - cos_min) / (cos_max - cos_min)

import matplotlib.pyplot as plt
import seaborn as sns

# Candidate flag (0/1) against raw cosine similarity for every pair.
sns.scatterplot(data=pairs, x='cosine', y='candidate', alpha=0.5)

In [None]:
def probability(s, r, b):
    """Return 1 - (1 - s**r)**b.

    s: similarity
    r: rows (per band)
    b: number of bands
    """
    band_mismatch = 1 - s**r
    return 1 - band_mismatch**b

def normalize(x, x_min, x_max):
    """Min-max scale `x` relative to the range [x_min, x_max]."""
    span = x_max - x_min
    return (x - x_min) / span

In [None]:
# Candidate probability P as a function of similarity s for several band
# counts, with rows per band taken as int(100 / bands).
# Frames are collected and concatenated once (DataFrame.append was removed in
# pandas 2.0), and the loop variable no longer rebinds the global `b`.
band_frames = []
for n_bands in [50, 25,20, 15,10,5]:
    rows_per_band = int(100/n_bands)
    s_scores = np.arange(0.01, 1, 0.01)
    band_frames.append(pd.DataFrame({
        'P': [probability(s, rows_per_band, n_bands) for s in s_scores],
        's': s_scores,
        'b': [str(n_bands)]*len(s_scores)
    }))
probs = pd.concat(band_frames, ignore_index=True)

sns.lineplot(data=probs, x='s', y='P', hue='b')

In [None]:
# Print the value (1/b)**(1/r) for each band count b and its rows-per-band r.
for b in (50, 25, 20, 15, 10, 5):
    r = 100 // b
    threshold = (1/b)**(1/r)
    print("Threshold for band ",b,"and row ",r,"is given as", threshold)

In [None]:
# Theoretical candidate-probability curve for b bands of r rows, overlaid with
# the observed (cosine, candidate) pairs.
# NOTE(review): b=20, r=5 implies 100-value signatures, while the signatures
# above are built from 20 hash vectors — confirm which setting is intended.
b = 20
r = 5
s_scores = np.arange(0.01, 1, 0.01)
P_scores = [probability(s, r, b) for s in s_scores]

graph = sns.lineplot(x=s_scores, y=P_scores)
graph = sns.scatterplot(data=pairs, x='cosine', y='candidate', alpha=0.1, color='k')
# (1/b)**(1/r) is a similarity value, so mark it on the similarity (x) axis;
# it was previously drawn as a horizontal line on the probability axis.
graph.axvline((1/b)**(1/r), color='red')

# Vader Model

In [None]:
#!pip install vaderSentiment
Lemmatize.head()

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

# function to calculate vader sentiment  
def vadersentimentanalysis(review):
    """Return the 'compound' value of the analyzer's polarity scores for `review`."""
    return analyzer.polarity_scores(review)['compound']


def vader_analysis(compound):
    """Map a compound score to a class: 1 if >= 0.5, -1 if <= -0.5, else 0."""
    if compound >= 0.5:
        return 1
    return -1 if compound <= -0.5 else 0
    

# Score every lemmatized comment, then bucket the compound score into 1 / 0 / -1.
# NOTE(review): the scores come from `Lemmatize` but are stored on `mydata`;
# pandas aligns them by index, so both frames must share row order — confirm.
mydata['Vader Sentiment'] = Lemmatize['Lemma'].apply(vadersentimentanalysis)
mydata['VaderAnalysis'] = mydata['Vader Sentiment'].apply(vader_analysis)
mydata.head()

In [None]:
tb_counts = mydata["VaderAnalysis"].value_counts()
tb_counts

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Class code -> slice label, in the order the slices (and `explode`) are drawn.
sentiment_labels = {1: "Satisfied", 0: "Neutral", -1: "Unsatisfied"}
# value_counts() orders classes by frequency, so the counts are re-ordered to
# match the labels explicitly; hard-coded labels could end up on the wrong slice.
tb_counts = (
    mydata["VaderAnalysis"]
    .value_counts()
    .reindex(list(sentiment_labels), fill_value=0)
)
plt.figure(figsize=(10, 7))
plt.pie(
    tb_counts.values,
    labels=[sentiment_labels[code] for code in tb_counts.index],
    explode=(0.15, 0, 0.25),
    autopct='%1.1f%%',
    shadow=False,
)
plt.show()

# Logistic Regression

In [None]:
# Turn star ratings into class labels: below 3 -> -1, exactly 3 -> 0, otherwise 1.
y = mydata["stars"]
column = list(y)
Y = [-1 if stars < 3 else (0 if stars == 3 else 1) for stars in column]
    


In [None]:
y = Y
X = Lemmatize["Lemma"]

In [None]:
# Unigram bag-of-words vectorizer; max_df / min_df bound the document frequency
# of the terms that are kept.
vect = CountVectorizer(ngram_range = (1,1), max_df = .95, min_df = 10)
# Hold out 10% of the comments for testing; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=1, test_size= 0.1)

In [None]:
# Learn the vocabulary on the training comments only, then encode both splits.
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

In [None]:
# Fit the classifier on the training matrix and predict the held-out comments.
LR = LogisticRegression().fit(X_train_dtm, y_train)
y_pred = LR.predict(X_test_dtm)

## Performance Measure

### Accuracy Score

In [None]:
print('Accuracy Score: ',metrics.accuracy_score(y_test,y_pred)*100,'%',sep='')

### Confusion Matrix

In [None]:
print('Confusion Matrix: ',metrics.confusion_matrix(y_test,y_pred), sep = '\n')

### Classification Report

In [None]:
# The report holds precision, recall, F1 and support per class, so it is
# labelled as the classification report rather than "Recall Score".
print("Classification Report:", metrics.classification_report(y_test, y_pred), sep="\n")

In [None]:
def predict(text):
    """Classify one raw review and return a human-readable verdict.

    The text goes through the same steps as the training data (clean,
    token_stop_pos, lemmatize), is vectorized with `vect` and scored by `LR`.
    Returns None if the predicted label is not 1, 0 or -1.
    """
    cleaned = pd.DataFrame([clean(text)])[0]
    lemmas = cleaned.apply(token_stop_pos).apply(lemmatize)
    pred = LR.predict(vect.transform(lemmas))
    verdicts = {
        1: "Customer is Satisfied",
        0: "Customer Comment is Neutral",
        -1: "Customer is Unsatisfied",
    }
    return verdicts.get(pred[0])

In [None]:
predict("Jio is fairly good" )

In [None]:
predict("Jio network speed is mediocre" )

In [None]:
predict("Jio is worst" )

# Year Wise Analysis

In [None]:
year_summary = pd.read_csv('Outlier_removed.csv', usecols=['Year','stars'])

In [None]:
dataYear = year_summary['Year']
dataStar = year_summary['stars']

In [None]:
# year -> [count with stars > 3, count with stars < 3, count of everything else]
yearly = {}
yearLabels = [2016, 2017, 2018, 2019, 2020, 2021, 2022]
for i in range(len(year_summary)):
    tally = yearly.setdefault(dataYear[i], [0,0,0])
    stars = dataStar[i]
    if stars > 3:
        tally[0] += 1
    elif stars < 3:
        tally[1] += 1
    else:
        tally[2] += 1

In [None]:
yearly

In [None]:
# Convert each year's three counts into fractions of that year's total.
# The accumulator is named `total`: the old name `sum` replaced the builtin
# sum() for every later cell.
for year in yearly:
    tallies = yearly[year]
    total = 0
    for tally in tallies:
        total += tally
    # Update the stored list in place so `yearly` keeps the same list objects.
    tallies[:] = [tally / total for tally in tallies]
yearly

In [None]:
# Slice labels in the order the per-year lists are stored:
# index 0 = stars > 3, index 1 = stars < 3, index 2 = the rest.
labels = ['Satisfied', 'Unsatisfied', 'Neutral']
def showPie(i):
    """Print a heading and draw the pie chart of the three shares stored in
    `yearly` for year `i`. Raises KeyError if `i` is not a key of `yearly`."""
    print("Statistics Observed in the Year ", i)
    plt.pie(yearly[i], labels = labels, autopct='%1.2f%%' ,normalize = True)

In [None]:
showPie(2016)

In [None]:
showPie(2017)

In [None]:
showPie(2018)

In [None]:
showPie(2019)

In [None]:
showPie(2020)

In [None]:
showPie(2021)

In [None]:
showPie(2022)

# Show Variation in the Sentiment throughout the Years

In [None]:
def showperYear(j, years=None, shares=None):
    """Return the j-th sentiment share for each year, in `years` order.

    The previous version walked `yearly` in insertion order (the order years
    first appear in the data) although callers plot the result against
    `yearLabels`; values could land on the wrong year, or the lengths could
    differ when a year was missing.

    Parameters:
        j: index into each per-year list (0, 1 or 2).
        years: years to report; defaults to the global `yearLabels`.
        shares: mapping year -> list of shares; defaults to the global `yearly`.

    Returns:
        list with one value per year; NaN for years absent from `shares`.
    """
    if years is None:
        years = yearLabels
    if shares is None:
        shares = yearly
    return [shares[year][j] if year in shares else float("nan") for year in years]


def sentimentPerYear(j):
    """Plot the yearly values returned by showperYear(j) against `yearLabels`."""
    yearly_values = showperYear(j)
    plt.plot(yearLabels, yearly_values)
    plt.xlabel("Variation Over the years")
    plt.ylabel("Sentiment towards the network")
    plt.show()
    
def showAllSentimentPerYear():
    """Plot all three yearly series on one chart with a legend."""
    series_names = ('Satisfied', 'Unsatisfied', 'Neutral')
    for series_idx, series_name in enumerate(series_names):
        plt.plot(yearLabels, showperYear(series_idx), label=series_name)
    plt.xlabel("Variation Over the years")
    plt.ylabel("Sentiment towards the network")
    plt.legend()
    plt.show()

In [None]:
showAllSentimentPerYear()

# State Wise Analysis


In [None]:
# NOTE(review): this loads "Outlier_removed.csv" into `myddata`, while `mydata`
# (which carries the VaderAnalysis column) was read from
# 'Outlier_removed_year.csv'. The state analysis below pairs rows of the two
# frames by position — confirm both files have the same rows in the same order.
myddata = pd.read_csv("Outlier_removed.csv")
# mydata.dropna(subset=["State"], inplace = True)

In [None]:
mydata['State']

In [None]:
myddata

In [None]:
import category_encoders as ce
import pandas as pd


# State of each review (from `myddata`) and its Vader class (from `mydata`).
data2=myddata["State"]
data1 = mydata["VaderAnalysis"]
# The OneHotEncoder that used to be created first was overwritten by this
# assignment before any use, so only the hashing encoder is kept.
encoder=ce.HashingEncoder(cols='State',n_components=10)
data2.head()

In [None]:
data1

In [None]:
# state -> [count of class 1, count of class -1, count of everything else]
# The duplicated first-seen / already-seen branches are merged with setdefault,
# and the unused `temp` list and manual row counter are gone.
State_dictionary={}
for i, row in enumerate(data2):
    tally = State_dictionary.setdefault(row, [0,0,0])
    # data1 is read by label i, which assumes the default 0..n-1 index.
    if data1[i] == 1:
        tally[0] += 1
    elif data1[i] == -1 :
        tally[1] += 1
    else :
        tally[2] += 1



In [None]:
State_dictionary

In [None]:
# Convert each state's three counts into percentages of that state's total.
# The accumulator is named `total` so the builtin sum() is not shadowed; the
# builtin is deliberately not called here because an earlier cell may already
# have rebound `sum` to a number.
for key in State_dictionary:
    tallies = State_dictionary[key]
    total = 0
    for tally in tallies:
        total += int(tally)
    State_dictionary[key] = [float(tally/total)*100 for tally in tallies]

In [None]:
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Stacked horizontal bars: one bar per state, split into the three percentages.
plt.rcParams["figure.figsize"] = [15, 15]
plt.rcParams["figure.autolayout"] = True
plt.xlim(0, 120)
# Loop names are `state` / `shares` so the notebook-level `x` (comment list)
# and `y` (labels) are not overwritten.
for state, shares in State_dictionary.items():
    plt.barh(state, shares[0], color='c' , label='Satisfied')
    plt.barh(state, shares[1], left=shares[0], color='r',label='Unsatisfied')
    plt.barh(state, shares[2], left=shares[0]+shares[1], color='g',label='Neutral')
plt.title("State vs Probability of Satisfaction")
plt.xlabel("Probability")
# Axis label typo fixed ("Sates" -> "States").
plt.ylabel("States")

plt.show()