# Sentiment analysis

##  Set up

In [1]:
%pip install numpy pandas nltk sklearn

Collecting sklearn
  Using cached sklearn-0.0.post12.tar.gz (2.6 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'error'
Note: you may need to restart the kernel to use updated packages.


  error: subprocess-exited-with-error
  
  × Getting requirements to build wheel did not run successfully.
  │ exit code: 1
  ╰─> [15 lines of output]
      The 'sklearn' PyPI package is deprecated, use 'scikit-learn'
      rather than 'sklearn' for pip commands.
      
      Here is how to fix this error in the main use cases:
      - use 'pip install scikit-learn' rather than 'pip install sklearn'
      - replace 'sklearn' by 'scikit-learn' in your pip requirements files
        (requirements.txt, setup.py, setup.cfg, Pipfile, etc ...)
      - if the 'sklearn' package is used by one of your dependencies,
        it would be great if you take some time to track which package uses
        'sklearn' instead of 'scikit-learn' and report it to their issue tracker
      - as a last resort, set the environment variable
        SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True to avoid this error
      
      More information is available at
      https://github.com/scikit-learn/sklearn-

In [1]:
import numpy as np 
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string


## Load dataset

In [2]:
df = pd.read_csv(r'../resources/processed_data.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


### Chia tập dữ liệu

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
# Chia dữ liệu thành tập huấn luyện và tập kiểm tra
X_train1, X_test1, y_train1, y_test1 = train_test_split(df['review'], df['sentiment'], test_size=0.3, random_state=42)

In [18]:
# Chuyển đổi văn bản thành các đặc trưng số sử dụng TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=10000,)
X_train_tfidf = vectorizer.fit_transform(X_train1)
X_test_tfidf = vectorizer.transform(X_test1)

In [19]:
X_train_tfidf[0]

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 51 stored elements and shape (1, 10000)>

In [20]:
sentiment = y_train1.unique()


## Mô hình SVM

### Dùng siêu tham số

In [21]:
from sklearn.decomposition import TruncatedSVD
# 5. Giảm chiều với Truncated SVD
svd = TruncatedSVD(n_components=2500, random_state=42)
X_train_reduced = svd.fit_transform(X_train_tfidf)
X_test_reduced = svd.transform(X_test_tfidf)

In [22]:
from sklearn.svm import SVC
svm_classifier = SVC(
    C=50, 
    coef0=0.5, 
    gamma=0.01, 
    kernel='poly', 
    degree = 3,
    random_state=42
)

In [23]:
svm_classifier.fit(X_train_reduced,y_train1)

In [24]:
from sklearn.metrics import accuracy_score, classification_report

In [25]:
# Dự đoán trên tập kiểm tra
y_pred_train1 = svm_classifier.predict(X_train_reduced)

# Đánh giá mô hình
accuracy_train1 = accuracy_score(y_train1, y_pred_train1)
report_train1 = classification_report(y_train1, y_pred_train1)

print(f'Accuracy: {accuracy_train1}')
print('Classification Report:')
print(report_train1)

Accuracy: 0.9048857142857143
Classification Report:
              precision    recall  f1-score   support

          -1       0.91      0.89      0.90     17589
           1       0.90      0.92      0.91     17411

    accuracy                           0.90     35000
   macro avg       0.91      0.90      0.90     35000
weighted avg       0.91      0.90      0.90     35000



In [26]:
# Dự đoán trên tập kiểm tra
y_pred1 = svm_classifier.predict(X_test_reduced)

# Đánh giá mô hình
accuracy1 = accuracy_score(y_test1, y_pred1)
report1 = classification_report(y_test1, y_pred1)

print(f'Accuracy: {accuracy1}')
print('Classification Report:')
print(report1)

Accuracy: 0.8916666666666667
Classification Report:
              precision    recall  f1-score   support

          -1       0.90      0.88      0.89      7411
           1       0.88      0.91      0.89      7589

    accuracy                           0.89     15000
   macro avg       0.89      0.89      0.89     15000
weighted avg       0.89      0.89      0.89     15000



## Word2Vec SVM

### Load data

In [4]:
import numpy as np
import pickle
with open("../resources/X_w2v_train.pkl", "rb") as f:
    X_train = pickle.load(f)
with open("../resources/X_w2v_test.pkl", "rb") as f:
    X_test = pickle.load(f)
with open("../resources/y_train.pkl", "rb") as f:
    y_train = pickle.load(f)
with open("../resources/y_test.pkl", "rb") as f:
    y_test = pickle.load(f)

### Model

In [5]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [6]:
svm_classifier = SVC(
    random_state=42
)

In [7]:
svm_classifier.fit(X_train,y_train)

In [8]:

print(svm_classifier)

SVC(random_state=42)


In [9]:
# Dự đoán trên tập kiểm tra
y_pred_train = svm_classifier.predict(X_train)

# Đánh giá mô hình
accuracy_train = accuracy_score(y_train, y_pred_train)
report_train = classification_report(y_train, y_pred_train)

print(f'Accuracy: {accuracy_train}')
print('Classification Report:')
print(report_train)

Accuracy: 0.896
Classification Report:
              precision    recall  f1-score   support

    negative       0.90      0.89      0.90     17589
    positive       0.89      0.90      0.90     17411

    accuracy                           0.90     35000
   macro avg       0.90      0.90      0.90     35000
weighted avg       0.90      0.90      0.90     35000



In [33]:
# Dự đoán trên tập kiểm tra
y_pred = svm_classifier.predict(X_test)

# Đánh giá mô hình
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

Accuracy: 0.8911333333333333
Classification Report:
              precision    recall  f1-score   support

    negative       0.89      0.89      0.89      7411
    positive       0.89      0.89      0.89      7589

    accuracy                           0.89     15000
   macro avg       0.89      0.89      0.89     15000
weighted avg       0.89      0.89      0.89     15000



### Sử dụng tập bé để tìm ra các tham số thích hợp

In [3]:
# Lọc các dòng có nhãn 1 và -1
df_pos = df[df['sentiment'] == 'positive']
df_neg = df[df['sentiment'] == 'negative']

# Lấy ngẫu nhiên 2300 dòng có nhãn positive và 2500 dòng có nhãn negative
df_pos_sampled = df_pos.sample(n=2500, random_state=42)
df_neg_sampled = df_neg.sample(n=2300, random_state=42)

# Kết hợp lại thành tập dữ liệu mẫu
df_sampled = pd.concat([df_pos_sampled, df_neg_sampled])

# Kiểm tra kết quả
print(df_sampled['sentiment'].value_counts())

sentiment
positive    2500
negative    2300
Name: count, dtype: int64


In [4]:
#Xử lý dữ liệu như trên
df_sampled['review'] = df_sampled['review'].str.replace("<br />", "")
df_sampled['review'] = df_sampled['review'].str.translate(str.maketrans('', '', string.punctuation))
df_sampled['review'] = df_sampled['review'].str.strip()
df_sampled['review'] = df_sampled['review'].str.replace(r'\s+',' ', regex=True)
# Tải stop words từ nltk
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
def remove_stop_words(text):
    words = text.split()
    filter_words = [word for word in words if word not in stop_words]
    return ' '.join(filter_words)
df_sampled['review'] = df_sampled['review'].apply(remove_stop_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
#Chuyển nhã sang số
def sentiment_encode(text):
    if(text == "positive"):
        return 1
    return -1
df_sampled['sentiment'] = df_sampled['sentiment'].apply(sentiment_encode)

In [6]:
# Chia dữ liệu thành tập huấn luyện và tập kiểm tra
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

X_train2, X_test2, y_train2, y_test2 = train_test_split(df_sampled['review'], df_sampled['sentiment'], test_size=0.3, random_state=42)


In [7]:
# Chuyển đổi văn bản thành các đặc trưng số sử dụng TfidfVectorizer
vectorizer2 = TfidfVectorizer(max_features=500,)
X_train_tfidf2 = vectorizer2.fit_transform(X_train2)
X_test_tfidf2 = vectorizer2.transform(X_test2)

In [8]:
#SVM
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
param_grid = {
    'C': [10, 50, 100],  # Regularization parameter
    'gamma': [ 0.01, 0.1, 1.0],  # Kernel coefficient
    'kernel': ['linear', 'rbf', 'poly'],  # Kernel type
    'degree': [3, 5],  # Chỉ có hiệu lực với kernel 'poly' (bậc đa thức)
    'coef0': [0.1, 0.5],  # Chỉ có hiệu lực với kernel 'poly'
}

grid_search = GridSearchCV(
    SVC(random_state=42),  # Mô hình SVM
    param_grid,             # Tham số tìm kiếm
)

In [9]:
print("X_train_tfidf2 shape:", X_train_tfidf2.shape)
print("y_train2 shape:", y_train2.shape)

X_train_tfidf2 shape: (3360, 500)
y_train2 shape: (3360,)


In [10]:
print("Unique classes in y_train:", set(y_train2))
print("Class distribution:\n", pd.Series(y_train2).value_counts())

Unique classes in y_train: {1, -1}
Class distribution:
 sentiment
 1    1733
-1    1627
Name: count, dtype: int64


In [11]:
grid_search.fit(X_train_tfidf2, y_train2)


In [12]:
# Chuyển kết quả từ cv_results_ thành DataFrame
results = pd.DataFrame(grid_search.cv_results_)

# Chọn các cột quan trọng để hiển thị
results_summary = results[[
    'param_C', 
    'param_gamma', 
    'param_kernel', 
    'param_degree', 
    'param_coef0', 
    'mean_test_score', 
    'std_test_score', 
    'rank_test_score'
]]

# Sắp xếp kết quả theo rank_test_score (xếp hạng từ tốt nhất đến kém nhất)
results_summary = results_summary.sort_values(by='rank_test_score')

# In ra bảng kết quả
# Tăng độ rộng hiển thị
pd.set_option('display.max_colwidth', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.width', 0)
print("Bảng kết quả GridSearch:")
print(results_summary)

Bảng kết quả GridSearch:
    param_C  param_gamma param_kernel  param_degree  param_coef0  mean_test_score  std_test_score  rank_test_score
56       50         0.01         poly             3          0.5         0.821429        0.006980                1
77      100         0.10         poly             3          0.1         0.819940        0.005407                2
32       10         0.10         poly             5          0.5         0.818452        0.004514                3
35       10         1.00         poly             5          0.5         0.818155        0.008008                4
71       50         1.00         poly             5          0.5         0.818155        0.008008                4
..      ...          ...          ...           ...          ...              ...             ...              ...
11       10         0.01         poly             5          0.1         0.515774        0.000729              102
50       50         0.10         poly             5    