# Starter Notebook
Rename this notebook after the task it performs (e.g. `sentiment_analysis`).

In [1]:
# Required libraries in Colab (uncomment the line below to install them)
# %pip install --quiet transformers sentencepiece

In [2]:
import numpy as np
import pandas as pd
import re
from tqdm import tqdm, tqdm_pandas

import matplotlib.pyplot as plt
pd.set_option('display.max_colwidth', None)
import plotly.express as px
from wordcloud import WordCloud

from nltk.corpus import PlaintextCorpusReader, stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer


from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer



## 1. Load data

### 1.1 Import data

In [3]:
# Read the preprocessed reviews into a DataFrame. Judging by the Review_ID /
# Sentence_ID columns, each row appears to hold one sentence of a review.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source.
df = pd.read_pickle('../data/processed_reviews.pkl')

# Preview the first five rows as a sanity check on the loaded columns.
df.head(n=5)

Unnamed: 0,Review_ID,Sentence_ID,Review_Year,Review_Month,Branch,Rating,Reviewer_Location,Review_Title,Review_Text
0,1,1,2023,9,Disneyland_Tokyo,5,"Johor Bahru, Malaysia",Worth every penny and every minute,I visited Disney Land Tokyo with my family on a weekend night in December 2022.
1,1,2,2023,9,Disneyland_Tokyo,5,"Johor Bahru, Malaysia",Worth every penny and every minute,We bought the evening entry that allowed us to enter the park after 3 p.m. at a discounted rate.
2,1,3,2023,9,Disneyland_Tokyo,5,"Johor Bahru, Malaysia",Worth every penny and every minute,"We thought it was a great deal because we could still enjoy most of the attractions, parades, and shows without spending too much time or money."
3,1,4,2023,9,Disneyland_Tokyo,5,"Johor Bahru, Malaysia",Worth every penny and every minute,We arrived at the park around 4 p.m. and headed straight to Tokyo Disneyland.
4,1,5,2023,9,Disneyland_Tokyo,5,"Johor Bahru, Malaysia",Worth every penny and every minute,We were amazed by the beautiful decorations and the festive atmosphere.


### 1.2 Vectorize
This section converts the review text into TF-IDF vectors, applying the text preprocessing steps (stemming and stop-word removal) defined in `text_preprocessing.py`.

In [4]:
from text_preprocessing import text_preprocessing

column_names = ["Review_Text"]  # each column will take around 30s to process

# For storing the TF-IDF vectors
tfidf_vectors_dict = {}

# For storing the features (words)
feature_names_dict = {}

# Build the stemmer and the stop-word list ONCE. Previously both were created
# inside the lambdas, i.e. once per document (and twice per column), which
# re-read the NLTK stop-word corpus for every row.
# NOTE(review): sharing one list assumes text_preprocessing does not mutate
# its stopword_list argument - TODO confirm in text_preprocessing.py.
english_stemmer = SnowballStemmer(language='english')
english_stopwords = stopwords.words('english')


def preprocess_text(text, _stemmer=english_stemmer, _stopwords=english_stopwords):
    """Run the project's text preprocessing on a single document.

    Parameters
    ----------
    text : the raw document (one cell of the column being processed).
    _stemmer, _stopwords : bound at definition time so that re-assigning the
        notebook-level names later cannot change an already-fitted vectorizer.

    Returns
    -------
    Whatever `text_preprocessing` returns for the document; in this notebook
    that is the list of stemmed tokens shown in the `<column>_BOW` columns.
    """
    return text_preprocessing(text,
                              stemmer = _stemmer,
                              # lemmatizer = WordNetLemmatizer(),
                              stopword_list = _stopwords)


for column_name in column_names:

    # Vectorize: a callable analyzer makes TfidfVectorizer delegate all
    # tokenisation/normalisation to preprocess_text.
    vectorizer = TfidfVectorizer(analyzer = preprocess_text,
                                max_df = 0.5,     # cannot appear in more than half of the docs
                                min_df = 2        # must appear in at least 2 docs
                                )

    X = vectorizer.fit_transform(df[column_name])
    tfidf_vectors_dict[f'{column_name}_tfidf'] = X

    feature_names = vectorizer.get_feature_names_out()
    feature_names_dict[f'{column_name}_features'] = feature_names

    print("n_samples: %d, n_features: %d" % X.shape)
    print(f"features: {feature_names}")


    # Bag of Words: keep the preprocessed tokens of every document next to the
    # raw text, using the same preprocessing as the TF-IDF step above.
    df[f"{column_name}_BOW"] = df[column_name].apply(preprocess_text)



df.head(5)

n_samples: 88918, n_features: 6343
features: ['aa' 'abandon' 'abat' ... 'zone' 'zoom' 'zootopia']


Unnamed: 0,Review_ID,Sentence_ID,Review_Year,Review_Month,Branch,Rating,Reviewer_Location,Review_Title,Review_Text,Review_Text_BOW
0,1,1,2023,9,Disneyland_Tokyo,5,"Johor Bahru, Malaysia",Worth every penny and every minute,I visited Disney Land Tokyo with my family on a weekend night in December 2022.,"[visit, disney, land, tokyo, famili, weekend, night, decemb]"
1,1,2,2023,9,Disneyland_Tokyo,5,"Johor Bahru, Malaysia",Worth every penny and every minute,We bought the evening entry that allowed us to enter the park after 3 p.m. at a discounted rate.,"[bought, even, entri, allow, us, enter, park, three, discount]"
2,1,3,2023,9,Disneyland_Tokyo,5,"Johor Bahru, Malaysia",Worth every penny and every minute,"We thought it was a great deal because we could still enjoy most of the attractions, parades, and shows without spending too much time or money.","[thought, great, deal, could, still, enjoy, show, without, spend, much, time]"
3,1,4,2023,9,Disneyland_Tokyo,5,"Johor Bahru, Malaysia",Worth every penny and every minute,We arrived at the park around 4 p.m. and headed straight to Tokyo Disneyland.,"[arriv, park, around, four, head, straight, tokyo]"
4,1,5,2023,9,Disneyland_Tokyo,5,"Johor Bahru, Malaysia",Worth every penny and every minute,We were amazed by the beautiful decorations and the festive atmosphere.,"[amaz, beauti, decor, festiv]"


In [6]:
# Fetch the TF-IDF matrix that the vectorize step stored for the Review_Text
# column (rows = documents, columns = vocabulary features).
review_text_tfidf_key = 'Review_Text_tfidf'
tfidf = tfidf_vectors_dict[review_text_tfidf_key]

# Bare expression: let the notebook render the sparse-matrix summary.
tfidf

<88918x6343 sparse matrix of type '<class 'numpy.float64'>'
	with 599376 stored elements in Compressed Sparse Row format>