# Sentimental Analysis on company reviews

In [1]:
#importing the dependencies
import numpy as np
import pandas as pd

In [2]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
df=pd.read_csv("company_reviews.csv",encoding="ISO-8859-1")
df

Unnamed: 0,name,rating,reviews_description
0,Sitel,,"Sitel Groupâs 75,000 people across the globe..."
1,Meadowbrook Rehabilitation,3.7,You'll work with the most experienced and loya...
2,Intermountain,4.0,Why Intermountain?\n\nWe Bring Hope\n\nWith ou...
3,Smith & Nephew,,It's more than business at Smith+Nephew - it's...
4,Reverse Mortgage Funding,4.1,Reverse Mortgage Funding LLC is committed to e...
...,...,...,...
17045,Billion Automotive,3.2,"Billion Automotive, a family owned business si..."
17046,ScienceLogic Inc,3.0,ScienceLogic is a leader in IT Operations Mana...
17047,Northland PACE,2.5,Northland PACE Senior Care Services has at its...
17048,Lloyd Staffing,3.9,"At LLoyd, we are not just agents of talent, bu..."


In [4]:
df.isnull().sum()

name                    338
rating                 1434
reviews_description       1
dtype: int64

In [8]:
df.dropna(inplace=True)

In [9]:
df.isnull().sum()

name                   0
rating                 0
reviews_description    0
dtype: int64

In [10]:
#convert to positive and negative sentiments
from textblob import TextBlob

def classify_sentiment(text):
    blob=TextBlob(text)
    sentiment_score=blob.sentiment.polarity
    if sentiment_score>0:
        return 'Positive'
    elif sentiment_score<0:
        return 'Negative'
    else:
        return 'Neutral'
    
df['Sentiment_category']=df['reviews_description'].apply(classify_sentiment)
df

Unnamed: 0,name,rating,reviews_description,Sentiment_category
1,Meadowbrook Rehabilitation,3.7,You'll work with the most experienced and loya...,Positive
2,Intermountain,4.0,Why Intermountain?\n\nWe Bring Hope\n\nWith ou...,Positive
4,Reverse Mortgage Funding,4.1,Reverse Mortgage Funding LLC is committed to e...,Neutral
5,VeriFone,3.4,Verifone is an American multinational corporat...,Negative
6,VBM Auto Group,3.5,This company is family owned and operated with...,Positive
...,...,...,...,...
17045,Billion Automotive,3.2,"Billion Automotive, a family owned business si...",Positive
17046,ScienceLogic Inc,3.0,ScienceLogic is a leader in IT Operations Mana...,Positive
17047,Northland PACE,2.5,Northland PACE Senior Care Services has at its...,Positive
17048,Lloyd Staffing,3.9,"At LLoyd, we are not just agents of talent, bu...",Positive


In [11]:
df

Unnamed: 0,name,rating,reviews_description,Sentiment_category
1,Meadowbrook Rehabilitation,3.7,You'll work with the most experienced and loya...,Positive
2,Intermountain,4.0,Why Intermountain?\n\nWe Bring Hope\n\nWith ou...,Positive
4,Reverse Mortgage Funding,4.1,Reverse Mortgage Funding LLC is committed to e...,Neutral
5,VeriFone,3.4,Verifone is an American multinational corporat...,Negative
6,VBM Auto Group,3.5,This company is family owned and operated with...,Positive
...,...,...,...,...
17045,Billion Automotive,3.2,"Billion Automotive, a family owned business si...",Positive
17046,ScienceLogic Inc,3.0,ScienceLogic is a leader in IT Operations Mana...,Positive
17047,Northland PACE,2.5,Northland PACE Senior Care Services has at its...,Positive
17048,Lloyd Staffing,3.9,"At LLoyd, we are not just agents of talent, bu...",Positive


In [13]:
import re        #regular expression
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer    #to convert textual data to numerical data
from sklearn.model_selection import train_test_split     #to split the data into training and testing datasets
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [14]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Surbhi
[nltk_data]     Singh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
#printing the stopwords (don't add any influential meaning to the data) in english
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

# Data processing

In [16]:
df.shape

(15615, 4)

In [17]:
df.head()

Unnamed: 0,name,rating,reviews_description,Sentiment_category
1,Meadowbrook Rehabilitation,3.7,You'll work with the most experienced and loya...,Positive
2,Intermountain,4.0,Why Intermountain?\n\nWe Bring Hope\n\nWith ou...,Positive
4,Reverse Mortgage Funding,4.1,Reverse Mortgage Funding LLC is committed to e...,Neutral
5,VeriFone,3.4,Verifone is an American multinational corporat...,Negative
6,VBM Auto Group,3.5,This company is family owned and operated with...,Positive


# Stemming

In [22]:
#Stemming->used to reduce the word to its root word
port_stem=PorterStemmer()
def stemming(content):
    stemmed_content=re.sub('[^a-zA-Z]','',content)                #remove everything that is not alphabet
    stemmed_content=stemmed_content.lower()
    stemmed_content=stemmed_content.split()
    stemmed_content=[port_stem.stem(word) for word in stemmed_content if word not in stopwords.words('english')]
    stemmed_content=' '.join(stemmed_content)
    return stemmed_content

In [23]:
df['stemmed_content'] = df['reviews_description'].apply(stemming)

In [24]:
df

Unnamed: 0,name,rating,reviews_description,Sentiment_category,stemmed_content
1,Meadowbrook Rehabilitation,3.7,You'll work with the most experienced and loya...,Positive,youllworkwiththemostexperiencedandloyalhealthc...
2,Intermountain,4.0,Why Intermountain?\n\nWe Bring Hope\n\nWith ou...,Positive,whyintermountainwebringhopewithourholisticinte...
4,Reverse Mortgage Funding,4.1,Reverse Mortgage Funding LLC is committed to e...,Neutral,reversemortgagefundingllciscommittedtoenhancin...
5,VeriFone,3.4,Verifone is an American multinational corporat...,Negative,verifoneisanamericanmultinationalcorporationhe...
6,VBM Auto Group,3.5,This company is family owned and operated with...,Positive,thiscompanyisfamilyownedandoperatedwithdealers...
...,...,...,...,...,...
17045,Billion Automotive,3.2,"Billion Automotive, a family owned business si...",Positive,billionautomotiveafamilyownedbusinesssincehasb...
17046,ScienceLogic Inc,3.0,ScienceLogic is a leader in IT Operations Mana...,Positive,sciencelogicisaleaderinitoperationsmanagementp...
17047,Northland PACE,2.5,Northland PACE Senior Care Services has at its...,Positive,northlandpaceseniorcareserviceshasatitscorethe...
17048,Lloyd Staffing,3.9,"At LLoyd, we are not just agents of talent, bu...",Positive,atlloydwearenotjustagentsoftalentbutagentsofch...


In [25]:
df.head()

Unnamed: 0,name,rating,reviews_description,Sentiment_category,stemmed_content
1,Meadowbrook Rehabilitation,3.7,You'll work with the most experienced and loya...,Positive,youllworkwiththemostexperiencedandloyalhealthc...
2,Intermountain,4.0,Why Intermountain?\n\nWe Bring Hope\n\nWith ou...,Positive,whyintermountainwebringhopewithourholisticinte...
4,Reverse Mortgage Funding,4.1,Reverse Mortgage Funding LLC is committed to e...,Neutral,reversemortgagefundingllciscommittedtoenhancin...
5,VeriFone,3.4,Verifone is an American multinational corporat...,Negative,verifoneisanamericanmultinationalcorporationhe...
6,VBM Auto Group,3.5,This company is family owned and operated with...,Positive,thiscompanyisfamilyownedandoperatedwithdealers...


In [27]:
#seperating the data and label
X=df['stemmed_content'].values
Y=df['Sentiment_category'].values

In [28]:
print(X)

['youllworkwiththemostexperiencedandloyalhealthcareprofessionalsatmeadowbrookmanorsincemanyofourteammembershavebeenwithusformorethanyearsyoucancountonhighlydedicatedteam'
 'whyintermountainwebringhopewithourholisticintegratedservicesandrelationshipbasedapproachwehelpfamiliescreateandsustainnurturinghealthyenvironmentswherechildrencanthriveandgrowintermountainisanonprofitagencythathasbeenimpactingthelivesofchildrenandfamiliesforoveryearsourresidentialcampusatsouthlamborninhelenaincludesourintegratedprivateschoolaswellasourstabilizationcenterprovidencehomeatherapeuticgrouphomeintheflatheadvalleyprovidesanothercarecentercommunityclinicsinhelenaandkalispellincludesomeorallofthefollowingoutpatienttherapyoccupationaltherapycasemanagementpsychiatricmedicationmanagementpsychologicaltestingandevaluationschoolbasedservicesandfamilybasedservicesincludingadoptionfostercareandinhometherapeuticcarejointheteamseasonedexpertssupporteageryounginnovatorsherewhereintegrityandhardworkareexpectedofeveryone

In [29]:
print(Y)

['Positive' 'Positive' 'Neutral' ... 'Positive' 'Positive' 'Positive']


In [30]:
#splitting the data to training and test data
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,stratify=Y,random_state=2)

In [31]:
print(X.shape,X_train.shape,X_test.shape)

(15615,) (12492,) (3123,)


In [32]:
#converting textual data to numerical data                             #frequency of the word checked
vectorizer=TfidfVectorizer()

X_train=vectorizer.fit_transform(X_train)                  #fitting->understanding what kind of data, transforming->converting data to numerical values
X_test=vectorizer.transform(X_test)

In [33]:
print(X_train)

  (0, 7896)	1.0
  (1, 2181)	1.0
  (2, 10075)	1.0
  (3, 7481)	1.0
  (4, 5963)	1.0
  (5, 9225)	1.0
  (6, 11803)	1.0
  (7, 481)	1.0
  (8, 9592)	1.0
  (9, 9612)	1.0
  (10, 12222)	1.0
  (11, 10532)	1.0
  (12, 11497)	1.0
  (13, 6356)	1.0
  (14, 8677)	1.0
  (15, 9530)	1.0
  (16, 9065)	1.0
  (17, 3145)	1.0
  (18, 7747)	1.0
  (19, 1359)	1.0
  (20, 4590)	1.0
  (21, 7314)	1.0
  (22, 9443)	1.0
  (23, 6969)	1.0
  (24, 3999)	1.0
  :	:
  (12467, 11663)	1.0
  (12468, 978)	1.0
  (12469, 11624)	1.0
  (12470, 64)	1.0
  (12471, 9239)	1.0
  (12472, 10912)	1.0
  (12473, 938)	1.0
  (12474, 9566)	1.0
  (12475, 4196)	1.0
  (12476, 8894)	1.0
  (12477, 9890)	1.0
  (12478, 7826)	1.0
  (12479, 2870)	1.0
  (12480, 9078)	1.0
  (12481, 10729)	1.0
  (12482, 7755)	1.0
  (12483, 8374)	1.0
  (12484, 6748)	1.0
  (12485, 1563)	1.0
  (12486, 5664)	1.0
  (12487, 1097)	1.0
  (12488, 1911)	1.0
  (12489, 12362)	1.0
  (12490, 7054)	1.0
  (12491, 9615)	1.0


In [34]:
print(X_test)

  (541, 266)	1.0
  (602, 7610)	1.0
  (900, 7135)	1.0
  (927, 5024)	1.0
  (972, 2199)	1.0
  (1218, 5534)	1.0
  (1257, 6121)	1.0
  (1353, 9855)	1.0
  (1794, 11753)	1.0
  (1878, 7094)	1.0
  (1961, 5122)	1.0
  (1989, 12045)	1.0
  (2021, 2424)	1.0
  (2242, 129)	1.0
  (2246, 11773)	1.0
  (2332, 2559)	1.0


In [35]:
#Training the logistic regression model
model=LogisticRegression(max_iter=1000)     #max_iter=maximum number of times the model has to go through the data

In [37]:
model.fit(X_train,Y_train)

# Model Evaluation

In [38]:
#accuracy score on the data
X_train_prediction=model.predict(X_train)
training_data_accuracy=accuracy_score(Y_train,X_train_prediction)      #compare the predicted data with the true values

In [39]:
print("Accuracy score on the training data : ",training_data_accuracy)

Accuracy score on the training data :  0.8649535702849824


In [40]:
X_test_prediction=model.predict(X_test)
test_data_accuracy=accuracy_score(Y_test,X_test_prediction)

In [41]:
print("Accuracy score on the test data : ",test_data_accuracy)

Accuracy score on the test data :  0.8648735190521935


In [42]:
#using Multinomial NB classifier

from sklearn.naive_bayes import MultinomialNB

In [43]:
#training the model
mnb=MultinomialNB()
#fitting the svm for bag of words
mnb_bow=mnb.fit(X_train,Y_train)
print(mnb_bow)
#fitting the svm for tfidf features
mnb_tfidf=mnb.fit(X_train,Y_train)
print(mnb_tfidf)

MultinomialNB()
MultinomialNB()


In [44]:
#PREDICTING THE MODEL
#Predicting the model for bag of words
mnb_bow_predict=mnb.predict(X_test)
print(mnb_bow_predict)
#Predicting the model for tfidf features
mnb_tfidf_predict=mnb.predict(X_test)
print(mnb_tfidf_predict)

['Positive' 'Positive' 'Positive' ... 'Positive' 'Positive' 'Positive']
['Positive' 'Positive' 'Positive' ... 'Positive' 'Positive' 'Positive']


In [45]:
#ACCURACY USING MULTONOMIAL NAIVE BAYES
#Accuracy score for bag of words
mnb_bow_score=accuracy_score(Y_test,mnb_bow_predict)
print("mnb_bow_score :",mnb_bow_score)
#Accuracy score for tfidf features
mnb_tfidf_score=accuracy_score(Y_test,mnb_tfidf_predict)
print("mnb_tfidf_score :",mnb_tfidf_score)

mnb_bow_score : 0.8648735190521935
mnb_tfidf_score : 0.8648735190521935
