In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("data.csv")

In [3]:
df

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5
...,...,...
20486,"best kept secret 3rd time staying charm, not 5...",5
20487,great location price view hotel great quick pl...,4
20488,"ok just looks nice modern outside, desk staff ...",2
20489,hotel theft ruined vacation hotel opened sept ...,1


In [4]:
df["Rating"].value_counts()

5    9054
4    6039
3    2184
2    1793
1    1421
Name: Rating, dtype: int64

In [5]:
df_neg = df.loc[df["Rating"] < 3]
df_neg = df_neg.reset_index(drop = True)

In [6]:
df_five = df.loc[df["Rating"] == 5]
df_five = df_five.reset_index(drop = True)

In [7]:
print(len(df_neg))
print(len(df_five))

3214
9054


In [8]:
df_pos = df_five.loc[:len(df_neg)]


In [9]:
print(len(df_pos))

3215


In [10]:
df_all = pd.concat([df_neg,df_pos], axis = 0)
df_all = df_all.reset_index(drop = True)

In [11]:
df_all.head()
df_all.tail()

Unnamed: 0,Review,Rating
6424,perfect hotel hotel does not really need glowi...,5
6425,perfect hotel small hotel comfortable perfect ...,5
6426,ordinary location extraordinary hotel know lov...,5
6427,"classy indulgence awesome experience, staff n'...",5
6428,first-rate experience stay library hotel wife ...,5


In [12]:
print(len(df_all))

6429


## Creating a Sentiments for 5* and 1*,2*

In [13]:
df_all["Sentiment"] = np.where(df_all["Rating"] == 5, "Positive" , "Negative")

In [14]:
df_all.head()


Unnamed: 0,Review,Rating,Sentiment
0,ok nothing special charge diamond member hilto...,2,Negative
1,"poor value stayed monaco seattle july, nice ho...",2,Negative
2,horrible customer service hotel stay february ...,1,Negative
3,disappointed say anticipating stay hotel monac...,2,Negative
4,great location need internally upgrade advanta...,2,Negative


## Randomize cells

In [15]:
df_all = df_all.sample(frac = 1)
df_all = df_all.reset_index(drop = True)

In [16]:
df_all.head(10)

Unnamed: 0,Review,Rating,Sentiment
0,"incredible, husband 20 year old daughter 20 ye...",5,Positive
1,"not expected, excited come puerto rico stay wo...",2,Negative
2,vastly overrated seen better days stayed krasn...,2,Negative
3,perfect gentle start trip needed stay boston n...,5,Positive
4,disapointed room hotel la alamania located 5 8...,2,Negative
5,outstanding location just got week florence fi...,5,Positive
6,"wonderful resort, husband celebrated birthday ...",5,Positive
7,disappointing good location not expensive hote...,2,Negative
8,truely wonderful place husband university inn ...,5,Positive
9,excellent boston digs absolutely recommend hot...,5,Positive


## Splitting into training and test examples
### x column je sloupec Review a y column je Sentiment (ostatni sloupce uz nepotrebujeme)

In [17]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(df_all.Review, df_all.Sentiment)

In [18]:
from sklearn.feature_extraction.text import CountVectorizer
v = CountVectorizer()
x_train_vec = v.fit_transform(x_train)
x_test_vec = v.transform(x_test)

## Usage of classification model

In [19]:
from sklearn import svm
clf_svm = svm.SVC(kernel = "linear")
clf_svm.fit(x_train_vec, y_train)

In [20]:
cpf_svm = svm.SVC(kernel = "poly")
cpf_svm.fit(x_train_vec, y_train)

In [21]:
crf_svm = svm.SVC(kernel = "rbf")
crf_svm.fit(x_train_vec, y_train)

## Accuracy test

In [27]:
clf_svm.score(x_test_vec, y_test)

0.9465174129353234

In [25]:
cpf_svm.score(x_test_vec, y_test)

0.7027363184079602

In [26]:
crf_svm.score(x_test_vec, y_test)

0.9347014925373134

In [28]:
from sklearn.metrics import f1_score
f1_score(y_test, clf_svm.predict(x_test_vec), average = None)

array([0.94444444, 0.94844125])

In [27]:
from sklearn.metrics import f1_score
f1_score(y_test, cpf_svm.predict(x_test_vec), average = None)

array([0.59831933, 0.76406713])

In [28]:
from sklearn.metrics import f1_score
f1_score(y_test, crf_svm.predict(x_test_vec), average = None)

array([0.93546404, 0.9339207 ])

In [29]:
rev = ["Absolutely loved this place"]
rev_vec = v.transform(rev)
clf_svm.predict(rev_vec)

array(['Positive'], dtype=object)

In [30]:
rev = ["Absolutely loved this place"]
rev_vec = v.transform(rev)
cpf_svm.predict(rev_vec)

array(['Positive'], dtype=object)

In [31]:
rev = ["Absolutely loved this place"]
rev_vec = v.transform(rev)
crf_svm.predict(rev_vec)

array(['Positive'], dtype=object)

In [32]:
rev = ["Absolutely hated this place"]
rev_vec = v.transform(rev)
clf_svm.predict(rev_vec)

array(['Negative'], dtype=object)

In [33]:
rev = ["Absolutely hated this place"]
rev_vec = v.transform(rev)
cpf_svm.predict(rev_vec)

array(['Positive'], dtype=object)

In [34]:
rev = ["Absolutely hated this place"]
rev_vec = v.transform(rev)
crf_svm.predict(rev_vec)

array(['Negative'], dtype=object)