# 4.5 Practical Task

In [None]:
import pandas as pd
import numpy as np
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import transformers
from transformers import pipeline
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from itertools import chain
from nltk import NaiveBayesClassifier

### Load data

In [None]:
# Read the sample of book reviews into a DataFrame and preview the first rows.
data = pd.read_csv(filepath_or_buffer="book_reviews_sample.csv")
data.head(n=5)

In [None]:
# Print a summary of the DataFrame (columns, dtypes, non-null counts).
data.info()

In [None]:
# Show the raw text of the review stored under index label 0.
data.loc[0, 'reviewText']

### Clean data

In [None]:
# lowercase: write a lowercased copy of the raw review text into a new
# 'reviewText_clean' column; the original 'reviewText' column is left untouched
data['reviewText_clean'] = data['reviewText'].str.lower()

In [None]:
# remove punctuation: delete every character that is neither a word character
# (letters, digits, underscore) nor whitespace.
# The vectorised .str accessor replaces the row-wise DataFrame.apply and lets
# missing reviews (NaN) pass through instead of raising TypeError in re.sub.
data['reviewText_clean'] = data['reviewText_clean'].str.replace(r"([^\w\s])", "", regex=True)

In [None]:
# Preview the first rows of the DataFrame after the cleaning steps.
data.head()

### Rule-based sentiment with VADER

In [None]:
# Create the VADER analyzer instance used for scoring below.
vader_sentiment = SentimentIntensityAnalyzer()

In [None]:
def _vader_compound(review):
    """Return the 'compound' entry of VADER's polarity scores for one review."""
    scores = vader_sentiment.polarity_scores(review)
    return scores['compound']


# Score every cleaned review and keep the compound value in its own column.
# NOTE(review): VADER's documentation describes capitalization and punctuation
# as intensity cues; confirm that scoring the lowercased, punctuation-free
# text (rather than the raw 'reviewText') is intended.
data['vader_sentiment_score'] = data['reviewText_clean'].apply(_vader_compound)

In [None]:
# create labels: bucket the compound score into three classes
#   [-1, -0.1]  -> negative
#   (-0.1, 0.1] -> neutral
#   (0.1, 1]    -> positive
bins = [-1, -0.1, 0.1, 1]
names = ['negative', 'neutral', 'positive']

# include_lowest=True closes the first interval on the left; without it a
# score of exactly -1.0 falls outside every bin and is labelled NaN.
data['vader_sentiment_label'] = pd.cut(
    data['vader_sentiment_score'],
    bins,
    labels=names,
    include_lowest=True,
)

In [None]:
# Bar chart of the number of reviews per VADER sentiment label.
vader_label_counts = data['vader_sentiment_label'].value_counts()
vader_label_counts.plot(kind='bar')

### Pre-trained Transformer Model

In [None]:
# Build the sentiment-analysis pipeline with an explicitly named checkpoint.
# Omitting `model` makes transformers fall back to its built-in default (and
# warn about it); naming the checkpoint keeps results reproducible if that
# default changes between library versions.
# NOTE(review): this is the checkpoint transformers uses by default for the
# "sentiment-analysis" task — confirm against the installed version.
transformer_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

In [None]:
# Classify every cleaned review with the pre-trained transformer pipeline.
#
# For a single input string the pipeline returns a one-element list of
# {'label': ..., 'score': ...} dicts, so element 0 is taken and its label is
# stored as a plain string. Storing the whole list (e.g. ['POSITIVE']) would
# put unhashable list objects in the column, which value_counts() cannot
# reliably count.
#
# truncation=True clips reviews that exceed the model's maximum sequence
# length instead of letting the forward pass fail on long texts.
transformer_labels = [
    transformer_pipeline(review, truncation=True)[0]['label']
    for review in data['reviewText_clean'].values
]

data['transformer_sentiment_label'] = transformer_labels

In [None]:
# Bar chart of the number of reviews per transformer sentiment label.
transformer_label_counts = data['transformer_sentiment_label'].value_counts()
transformer_label_counts.plot(kind='bar')