In [1]:
import pandas as pd
import numpy as np

In [2]:
df=pd.read_csv('reviews_mixed.csv')
df.head()

Unnamed: 0,Text,Sentiment
0,"The rooms are extremely small, practically onl...",negative
1,Room safe did not work.,negative
2,Mattress very comfortable.,positive
3,"Very uncomfortable, thin mattress, with plasti...",negative
4,No bathroom in room,


In [3]:
df.dropna(inplace=True)

In [4]:
df.replace(to_replace ="negative", 
                 value ="0",inplace=True)
df.replace(to_replace ="positive", 
                 value ="1",inplace=True) 

In [5]:
df

Unnamed: 0,Text,Sentiment
0,"The rooms are extremely small, practically onl...",0
1,Room safe did not work.,0
2,Mattress very comfortable.,1
3,"Very uncomfortable, thin mattress, with plasti...",0
5,The bed was soooo comfy.,1
...,...,...
289,No room service.,1
297,ALTHOUGHT i WOULD LIKE TO MAKE A SPECIAL MENTI...,positivepositive
303,While the streets are rather busy during the d...,1
304,Very close to Times Square,1


In [6]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(df['Text'],df['Sentiment'],random_state=0)

In [7]:
print(X_train[0])
print(X_train.shape)

The rooms are extremely small, practically only a bed.
(185,)


In [8]:
from sklearn.feature_extraction.text import CountVectorizer
vect=CountVectorizer().fit(X_train)  
#Count Vectorizer does not look after the structure of th words or the sequence in which they occur

In [9]:
vect.get_feature_names()[::30]        #every 30 words from the corpus

['12am',
 'amazing',
 'bathroom',
 'cables',
 'comforts',
 'desk',
 'engineer',
 'fine',
 'greatest',
 'hvac',
 'lengths',
 'mattress',
 'nyc',
 'place',
 'reception',
 'selection',
 'smell',
 'streets',
 'through',
 'two',
 'warmer',
 'you']

In [10]:
#Count Vectorizer builds a vocbulary of all the words in the corpus 
#A matrix is formed which consists of the number of times the words appear in a particular document
len(vect.get_feature_names())            #total number of tokens

634

In [11]:
X_train_vect=vect.transform(X_train)
X_train_vect                  
#each document is the row and the columns are all the tokens in the corpus
#the matrix is formed with the columns as 0 and 1 indicating the presence or absence of the word in the row or doc
#hence most of the matrix is 0 and it is sparse

<185x634 sparse matrix of type '<class 'numpy.int64'>'
	with 1635 stored elements in Compressed Sparse Row format>

In [12]:
from sklearn.linear_model import LogisticRegression
model=LogisticRegression()
model.fit(X_train_vect,y_train)

#import logging as log
#from sklearn.metrics import accuracy_score
#y_pred = log.predict(x_test)
#score =accuracy_score(y_test,y_pred)
#The accuracy is around 85%

LogisticRegression()

In [13]:
feature_names=np.array(vect.get_feature_names())
sorted_coef_index=model.coef_[0].argsort()
print('smallest\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('largest\n{}\n'.format(feature_names[sorted_coef_index[:-11:-1]]))

#coeff says that if one variable goes up, the other one goes lower
#hence small variables give +ve words which give +ve reviews or 0 as the review
#same for large coeff the words are that which give the reviews as 1

smallest
['comfortable' 'location' 'subway' 'service' 'spacious' 'nice' 'is'
 'comfy' 'close' 'facilities']

largest
['small' 'in' 'not' 'noisy' 'door' 'uncomfortable' 'there' 'cold' 'could'
 'no']



In [16]:
print(model.predict(vect.transform(['big insects','spacious rooms'])))
print(model.predict(vect.transform(['rat in the room','absymal bed'])))
print(model.predict(vect.transform(['doors to outside of hotel did not lock anymore','About 20 minutes walk to Times Square'])))


['0' '1']
['0' '0']
['0' '1']


In [17]:
print(model.predict(vect.transform(['smaller beds','clean washrooms'])))

['0' '0']
