# Sentiment analysis

##  Set up

In [2]:
#%pip install numpy pandas nltk sklearn

In [1]:
import numpy as np 
import pandas as pd
import nltk
from nltk.corpus import stopwords
import re

## Load dataset

In [2]:
df = pd.read_csv(r'../resources/IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


16123
12853

## Tiền xử lý dữ liệu

### Làm sạch dữ liệu 
- Loại bỏ các thẻ HTML
- Loại bỏ khoảng trắng thừa và dấu câu 
- Chuyển đổi chữ hoa thành chữ thường 

#### Loại bỏ thẻ HTML

In [3]:
df['review'] = df['review'].str.replace("<br />", "")
df['review'][2]

'I thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air conditioned theater and watching a light-hearted comedy. The plot is simplistic, but the dialogue is witty and the characters are likable (even the well bread suspected serial killer). While some may be disappointed when they realize this is not Match Point 2: Risk Addiction, I thought it was proof that Woody Allen is still fully in control of the style many of us have grown to love.This was the most I\'d laughed at one of Woody\'s comedies in years (dare I say a decade?). While I\'ve never been impressed with Scarlet Johanson, in this she managed to tone down her "sexy" image and jumped right into a average, but spirited young woman.This may not be the crown jewel of his career, but it was wittier than "Devil Wears Prada" and more interesting than "Superman" a great comedy to go see with friends.'

#### Loại bỏ khoảng trắng thừa và dấu câu

In [4]:
# Hàm để chuyển các ký tự đặc biệt thành khoảng trắng và loại bỏ khoảng trắng thừa
def remove_punctuation(text):
    # Chuyển các ký tự đặc biệt thành khoảng trắng
    text = re.sub("n\'t", " not", text)
    text = re.sub("\'s", " is", text)
    text = re.sub("\'re", " are", text)
    text = re.sub("\'m", " am", text)
    text = re.sub(r'[^\w\s]', ' ', text)
    # Loại bỏ các khoảng trắng thừa
    text = re.sub(r'\s+', ' ', text).strip()
    return text


In [5]:
df['review'] = df['review'].apply(remove_punctuation)

#### Loại bỏ stop words

In [6]:
# Tải stop words từ nltk
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PAVT\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
stop_words.remove('no')
stop_words.remove('not')

In [8]:
def remove_stop_words(text):
    words = text.split()
    filter_words = [word for word in words if word not in stop_words]
    return ' '.join(filter_words)

In [9]:
df['review'] = df['review'].apply(remove_stop_words)

#### Chuyển chữ hoa thành chữ thường 

In [10]:
df['review'] = df['review'].str.lower()


## Chia dữ liệu

In [11]:
# Chia dữ liệu thành tập huấn luyện và tập kiểm tra
from sklearn.model_selection import train_test_split
df_removed = df['review'].apply(remove_stop_words)
X_train, X_test, y_train, y_test = train_test_split(df_removed, df['sentiment'], test_size=0.3, random_state=42)

## Word embeddings

In [12]:
from gensim.models import Word2Vec

In [13]:
sentences = []
for sentence in X_train:
    sentences.append(sentence.split())

In [14]:
model = Word2Vec(sentences, vector_size=200, window=5, sg=1, min_count=5, workers = 80)

#### sentence embeddings

In [15]:
def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2)

In [16]:
def sum_weights(vectors, w):
    return np.matmul(w.T, vectors)

In [17]:
def softmax(arr):
    sum_exp = np.sum(np.exp(arr))
    weights = np.zeros(len(arr))
    for i in range(len(arr)):
        weights[i] = np.exp(arr[i])/sum_exp
    return weights

Các tiêu chí đánh giá phim: 
- Kịch bản: 
    + coherent/ incoherent
    + unpredictable/ predictable
- Ý nghĩa phim: meaningful/ meaningless
- Hiệu ứng: impressive / unimpressive
- Cảnh quay: heartfelt / insincere

In [18]:
# list_positive_words = ['heartfelt', 'impress', 'excel', 'meaningful', 'cinematograph']
# list_negative_words = ['dumbsh', 'incoherent', 'dull', 'meaningless', 'ruin']
# list_positive_words = ['heartfelt', 'impress', 'excel', 'meaningful', 'cinematograph']
# list_negative_words = ['dumbsh', 'incoher', 'dull', 'meaningless', 'ruin']

list_positive_words = ['heartfelt', 'gripping', 'impressive', 'meaningful', 'coherent']
list_negative_words = ['insincere', 'predictable', 'soporific', 'illogical', 'uninteresting']

# incoherent == uninteresting
# meaningless == illogical
# unimpressive == soporific
# unpredictable == gripping


In [19]:
for word in list_positive_words:
    print(model.wv.most_similar(word))

print()
for word in list_negative_words:
    print(model.wv.most_similar(word))

[('heartwarming', 0.7796937227249146), ('jerker', 0.7677770853042603), ('bittersweet', 0.7644371390342712), ('poignantly', 0.7643495202064514), ('wistful', 0.7588487267494202), ('poignant', 0.7587066292762756), ('heartache', 0.7574900984764099), ('tugs', 0.7542299628257751), ('joyful', 0.7516697645187378), ('uplifting', 0.7513870000839233)]
[('taut', 0.7647790908813477), ('engrossing', 0.741847038269043), ('riveting', 0.7185558676719666), ('thrilling', 0.707617461681366), ('twisty', 0.7049236297607422), ('enthralling', 0.6958408355712891), ('exhilarating', 0.6920657753944397), ('harrowing', 0.6894348859786987), ('arresting', 0.6847682595252991), ('suspenseful', 0.677588701248169)]
[('spectacular', 0.6560789346694946), ('astonishing', 0.6420767903327942), ('impressively', 0.6292502880096436), ('astounding', 0.6255952715873718), ('awsome', 0.624431848526001), ('exceptional', 0.6191527247428894), ('breathtaking', 0.6179165244102478), ('unimpressive', 0.6158871054649353), ('amazing', 0.611

- extraordinary/vivid >< dumbsh

In [20]:
def normalize(v):
    return v/np.linalg.norm(v)

In [21]:
list_positive = np.array([ normalize(model.wv[word]) for word in list_positive_words ])
list_negative = np.array([ normalize(model.wv[word]) for word in list_negative_words ])

In [22]:
for i in range(len(list_positive)):
    print( np.dot(list_positive[i], list_negative[i]))

0.6251288
0.41655207
0.54504883
0.46773383
0.5267204


In [23]:
positive_vector = normalize( sum_weights(list_positive, np.array([1,2,1,2,1]) ) )
negative_vector = normalize( sum_weights(list_negative, np.array([1,2,1,2,1]) ) )

In [24]:
def sentence_to_vector(sentence, model):
    word_vectors = np.array([model.wv[word] for word in sentence if word in model.wv]) # size T*vector_size
    pos_weights = []
    neg_weights = []
    res = np.zeros(model.vector_size)
    
    if len(word_vectors) == 0:
        return res
    pos_weights = np.matmul(word_vectors, positive_vector.T)
    neg_weights = np.matmul(word_vectors, negative_vector.T)
    pos_res = sum_weights(word_vectors, softmax(pos_weights) )
    neg_res = sum_weights(word_vectors, softmax(neg_weights) )
    return np.concatenate([pos_res, neg_res])

#### Tạo tập train và test cho model

In [25]:
X_w2v_train = [ sentence_to_vector(sent.split(), model)  for sent in X_train]
X_w2v_test =  [ sentence_to_vector(sent.split(), model)  for sent in X_test]

## Mô hình
chỉ chạy để kiểm thử model Word2Vec

### Hàm khảo sát

In [26]:
from sklearn.metrics import accuracy_score, classification_report
def train_and_valid(model, X_train, y_train, X_test, y_test):
    model.fit(X_train,y_train)
    y_pred_train = model.predict(X_train) 
    y_pred = model.predict(X_test)
    # Đánh giá mô hình
    accuracy_train = accuracy_score(y_train, y_pred_train)
    accuracy_test = accuracy_score(y_test, y_pred)
    print(f"training accuracy: {accuracy_train} \nvalidation accuracy: {accuracy_test}")
    

### Decision tree

In [27]:
from sklearn.tree import DecisionTreeClassifier
dt_clf = DecisionTreeClassifier(random_state=42, max_depth=7)
train_and_valid(dt_clf, X_w2v_train,y_train, X_w2v_test, y_test)

training accuracy: 0.8305142857142858 
validation accuracy: 0.7903333333333333


### Logistic regression


In [None]:
{'C': 3.2420540279940164, 'max_iter': 2000, 'penalty': None, 'random_state': 42, 'solver': 'lbfgs', 'tol': 0.001040752833059717}


In [33]:
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(random_state=42, max_iter=4000, penalty= None, tol = 0.001040752833059717 )
train_and_valid(log_reg, X_w2v_train, y_train, X_w2v_test, y_test)

training accuracy: 0.8994857142857143 
validation accuracy: 0.8911333333333333


### Bảng tổng kết logistic regression với model Word2Vec
Dataset: 
- Loại bỏ các thẻ HTML
- Loại bỏ khoảng trắng thừa và dấu câu 
- Chuyển đổi chữ hoa thành chữ thường 
- Loại bỏ stopwords 

| Tham số Word2Vec | Tham số logistic regression | training accuracy | testing accuracy | 
|----------------|---------------|---------|---------|
|vector_size = 100, window = 5, min_count = 5 | max_iter = 600 | 0.878 | 0.876 |
|vector_size = 200, window = 5, min_count = 5 | max_iter = 4000 | 0.884 | 0.882 |
|vector_size = 200, sg = 1, window = 5, min_count = 5 | max_iter = 3000 | 0.897 | 0.8899 |
|vector_size = 200, sg = 1, window = 5, min_count = 5 | max_iter = 2000, C = 1.2 | 0.8974 | 0.89 |
|vector_size = 300, sg = 1, window = 5, min_count = 5 | max_iter = 4000, C = 0.9 | 0.898 | 0.8934 |
|vector_size = 1000, sg = 1, window = 5, min_count = 5 | max_iter = 4000, C = 0.9 | 0.9095 | 0.8947 |

In [29]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from scipy.stats import uniform

param_dist = {
    'C': uniform(0.5, 20),
    'tol': uniform(1e-4, 1e-2),
    'solver': ['liblinear', 'saga', 'sag', 'newton-cg', 'newton-cholesky', 'lbfgs'],
    'max_iter': [500, 1000, 2000, 4000],
    'penalty': ['l1', 'l2', 'elasticnet', None],
    'random_state' : [None, 42]
}

random_search = RandomizedSearchCV(LogisticRegression(), param_distributions=param_dist, n_iter=100, cv=5)
random_search.fit(X_w2v_train, y_train)
print(random_search.best_params_)




KeyboardInterrupt: 

In [None]:
{'C': 3.2420540279940164, 'max_iter': 2000, 'penalty': None, 'random_state': 42, 'solver': 'lbfgs', 'tol': 0.001040752833059717}

### Random forest

### MLP

### Ensemble model

[0.5,1.5,0.75,0.75,1.5] --> 0.855 \
weights --> 0.8466

## Tổng kết

| Mô hình Word Embeddings| Dữ liệu |Mô hình ML| Tham số | training accuracy | testing accuracy | Đánh giá | 
|---------|------|----------|------|--------|----|---|
|Word2Vec|Không loại stop words |Decision tree | default | 0.979 | 0.7249 | Overfitting |
|Word2Vec|Không loại stop words |Logistic Regression | max_iter = 400 | 0.852 | 0.850 | -- |
|Word2Vec|Không loại stop words |Random forest | default | 0.99997 | 0.818 | Ovefitting |
|Word2Vec|Không loại stop words  |XGBoost | default | 0.961 | 0.833 | Ovefitting |
|Word2Vec|Không loại stop words |MLP | max_iter = 250, learning_rate_init = 0.0005 | 0.868 | 0.850 | -- |
|Word2Vec|Loại stop words |Decision tree | default | 0.976 | 0.751 | Overfitting |
|Word2Vec|Loại stop words |Logistic Regression | default | 0.850 | 0.848 | -- |
|Word2Vec|Loại stop words |Random forest | default | 0.999 | 0.830 | Overfitting |
|Word2Vec|Loại stop words  |XGBoost | default | 0.964 | 0.834 | Ovefitting |
|Word2Vec|Loại stop words |MLP | default | 0.938 | 0.816 | Overfitting |
