## <center> Import Libs

In [1]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download("vader_lexicon")
import pandas as pd

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/andrejbaranov/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


## <center> Import Data

In [3]:
# Load the raw reviews from the project-relative CSV into a DataFrame.
data = pd.read_csv('data/reviews.csv')
# Preview the first rows to sanity-check the load.
data.head()

Unnamed: 0,Review
0,nice hotel expensive parking got good deal sta...
1,ok nothing special charge diamond member hilto...
2,nice rooms not 4* experience hotel monaco seat...
3,"unique, great stay, wonderful time hotel monac..."
4,"great stay great stay, went seahawk game aweso..."


## <center> Data Info

In [4]:
# Column names, non-null counts, dtypes and memory footprint of the raw frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20491 entries, 0 to 20490
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  20491 non-null  object
dtypes: object(1)
memory usage: 160.2+ KB


In [5]:
# Summary statistics; for the text column this reports count / unique / top / freq.
data.describe()

Unnamed: 0,Review
count,20491
unique,20491
top,nice hotel expensive parking got good deal sta...
freq,1


## <center> Null Data / Duplicates

In [6]:
# Number of missing values per column.
data.isnull().sum()

Review    0
dtype: int64

In [8]:
# Count rows that repeat an earlier row in full.
print(f"Duplicated data: {data.duplicated().sum()}")

Duplicated data: 0


## <center> Model build & predict

In [9]:
sentiments = SentimentIntensityAnalyzer()

# Score every review exactly once. Each polarity_scores() call already returns
# all four values ("pos", "neg", "neu", "compound"), so calling it separately
# per column would quadruple the run time for identical results.
review_scores = [sentiments.polarity_scores(review) for review in data["Review"]]

# Fan the per-review score dicts out into one column per VADER component.
data["Positive"] = [scores["pos"] for scores in review_scores]
data["Negative"] = [scores["neg"] for scores in review_scores]
data["Neutral"] = [scores["neu"] for scores in review_scores]
data['Compound'] = [scores["compound"] for scores in review_scores]

data.head()

Unnamed: 0,Review,Positive,Negative,Neutral,Compound
0,nice hotel expensive parking got good deal sta...,0.285,0.072,0.643,0.9747
1,ok nothing special charge diamond member hilto...,0.189,0.11,0.701,0.9787
2,nice rooms not 4* experience hotel monaco seat...,0.219,0.081,0.7,0.9889
3,"unique, great stay, wonderful time hotel monac...",0.385,0.06,0.555,0.9912
4,"great stay great stay, went seahawk game aweso...",0.221,0.135,0.643,0.9797


## <center> Creating a categorical feature to consolidate the output

In [10]:
score = data["Compound"].values

# Bucket each compound score into a label:
#   >= 0.05  -> 'Positive'
#   <= -0.05 -> 'Negative'
#   anything in between -> 'Neutral'
sentiment = [
    'Positive' if compound >= 0.05
    else ('Negative' if compound <= -0.05 else 'Neutral')
    for compound in score
]

data["Sentiment"] = sentiment

data.head()

Unnamed: 0,Review,Positive,Negative,Neutral,Compound,Sentiment
0,nice hotel expensive parking got good deal sta...,0.285,0.072,0.643,0.9747,Positive
1,ok nothing special charge diamond member hilto...,0.189,0.11,0.701,0.9787,Positive
2,nice rooms not 4* experience hotel monaco seat...,0.219,0.081,0.7,0.9889,Positive
3,"unique, great stay, wonderful time hotel monac...",0.385,0.06,0.555,0.9912,Positive
4,"great stay great stay, went seahawk game aweso...",0.221,0.135,0.643,0.9797,Positive


## <center> Info on the resulting data

In [11]:
# Re-inspect the frame now that the score and label columns have been added.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20491 entries, 0 to 20490
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Review     20491 non-null  object 
 1   Positive   20491 non-null  float64
 2   Negative   20491 non-null  float64
 3   Neutral    20491 non-null  float64
 4   Compound   20491 non-null  float64
 5   Sentiment  20491 non-null  object 
dtypes: float64(4), object(2)
memory usage: 960.6+ KB


In [12]:
# Number of reviews per derived sentiment label.
data['Sentiment'].value_counts()

Positive    18831
Negative     1569
Neutral        91
Name: Sentiment, dtype: int64

## <center> Save data

In [13]:
# Persist the scored reviews; index=False keeps the row index out of the file.
data.to_csv('data/sentiments.csv', index = False)