# Sentiment analysis

## Tf-idf

In [1]:
import pandas as pd 

# Load the data
data = pd.read_csv(r"../resources/processed_data.csv")
data.head()

Unnamed: 0,Processed_Review,sentiment
0,one review mention watch oz episod hook right ...,positive
1,wonder littl product film techniqu unassum old...,positive
2,thought wonder way spend time hot summer weeke...,positive
3,basic famili littl boy jake think zombi closet...,negative
4,petter mattei love time money visual stun film...,positive


In [2]:

label_counts = data['sentiment'].value_counts()

label_ratios = label_counts / len(data) * 100 

print("Tỷ lệ các nhãn:")
print(label_ratios)

Tỷ lệ các nhãn:
sentiment
positive    50.187568
negative    49.812432
Name: count, dtype: float64


### Chia tập dữ liệu

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [4]:
# Chuyển dữ liệu sang đặc trưng số sử dụng TF-IDF
tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1, 2)) 
X = tfidf.fit_transform(data['Processed_Review']).toarray()
# Chia data thành các tập train và test
y = data['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

## Word2Vec

In [5]:
import pickle 
with open("../resources/X_w2v_train.pkl", "rb") as f:
    X_w2v_train = pickle.load(f)
with open("../resources/X_w2v_test.pkl", "rb") as f:
    X_w2v_test = pickle.load(f)
with open("../resources/y_train.pkl", "rb") as f:
    y_w2v_train = pickle.load(f)
with open("../resources/y_test.pkl", "rb") as f:
    y_w2v_test = pickle.load(f)

## Mô hình

### Random forest (Không tham số - tfidf)

In [7]:
# Import các thư viện cần thiết
from sklearn.ensemble import RandomForestClassifier

# Huấn luyện mô hình Random Forest
rf_clf = RandomForestClassifier(
    random_state=42,
)
rf_clf.fit(X_train, y_train)


In [8]:
# In ra các thông số của mô hình
print(f'n_estimators: {rf_clf.n_estimators}')
print(f'max_depth: {rf_clf.max_depth}')
print(f'min_samples_leaf: {rf_clf.min_samples_leaf}')
print(f'max_features: {rf_clf.max_features}')
print(f'max_samples: {rf_clf.max_samples}')

for id, estimator in enumerate(rf_clf.estimators_):
    print(f'Tree {id}:')
    print(f'  Depth of the tree: {estimator.get_depth()}')
    print(f'  Number of leaves: {estimator.get_n_leaves()}')

depths = [estimator.get_depth() for estimator in rf_clf.estimators_]
n_leaves = [estimator.get_n_leaves() for estimator in rf_clf.estimators_]

print(f'Average depth of the trees: {sum(depths) / len(depths)}')
print(f'Average number of leaves: {sum(n_leaves) / len(n_leaves)}')

n_estimators: 100
max_depth: None
min_samples_leaf: 1
max_features: sqrt
max_samples: None
Tree 0:
  Depth of the tree: 167
  Number of leaves: 5151
Tree 1:
  Depth of the tree: 171
  Number of leaves: 5237
Tree 2:
  Depth of the tree: 228
  Number of leaves: 5411
Tree 3:
  Depth of the tree: 191
  Number of leaves: 5478
Tree 4:
  Depth of the tree: 159
  Number of leaves: 5611
Tree 5:
  Depth of the tree: 206
  Number of leaves: 5467
Tree 6:
  Depth of the tree: 212
  Number of leaves: 5416
Tree 7:
  Depth of the tree: 208
  Number of leaves: 5388
Tree 8:
  Depth of the tree: 180
  Number of leaves: 5211
Tree 9:
  Depth of the tree: 221
  Number of leaves: 5489
Tree 10:
  Depth of the tree: 184
  Number of leaves: 5296
Tree 11:
  Depth of the tree: 196
  Number of leaves: 5365
Tree 12:
  Depth of the tree: 215
  Number of leaves: 5501
Tree 13:
  Depth of the tree: 153
  Number of leaves: 5903
Tree 14:
  Depth of the tree: 176
  Number of leaves: 5463
Tree 15:
  Depth of the tree: 202


In [None]:
print(f'Number of features: {rf_clf.n_features_in_}')
print(f'Feature importances: {rf_clf.feature_importances_}')

Number of features: 10000
Feature importances: [1.78242988e-05 8.65192044e-05 1.03446717e-05 ... 3.42599039e-05
 5.32073926e-05 1.40712226e-05]


In [9]:
# Dự đoán trên tập train
y_pred_train = rf_clf.predict(X_train)

# Đánh giá mô hình
accuracy_train = accuracy_score(y_train, y_pred_train)
report_train = classification_report(y_train, y_pred_train)

print(f'Accuracy: {accuracy_train}')
print('Classification Report:')
print(report_train)

Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   support

    negative       1.00      1.00      1.00     17237
    positive       1.00      1.00      1.00     17470

    accuracy                           1.00     34707
   macro avg       1.00      1.00      1.00     34707
weighted avg       1.00      1.00      1.00     34707



In [10]:
# Dự đoán trên tập test
y_pred = rf_clf.predict(X_test)

# Đánh giá mô hình
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)



Accuracy: 0.8532436974789916
Classification Report:
              precision    recall  f1-score   support

    negative       0.85      0.85      0.85      7461
    positive       0.85      0.85      0.85      7414

    accuracy                           0.85     14875
   macro avg       0.85      0.85      0.85     14875
weighted avg       0.85      0.85      0.85     14875



### Random forest (Không tham số - word2vec)

In [21]:
# Import các thư viện cần thiết
from sklearn.ensemble import RandomForestClassifier

# Huấn luyện mô hình Random Forest
rf_clf = RandomForestClassifier(
    random_state=42,
)
rf_clf.fit(X_w2v_train, y_w2v_train)


In [22]:
# In ra các thông số của mô hình
print(f'n_estimators: {rf_clf.n_estimators}')
print(f'max_depth: {rf_clf.max_depth}')
print(f'min_samples_leaf: {rf_clf.min_samples_leaf}')
print(f'max_features: {rf_clf.max_features}')
print(f'max_samples: {rf_clf.max_samples}')

for id, estimator in enumerate(rf_clf.estimators_):
    print(f'Tree {id}:')
    print(f'  Depth of the tree: {estimator.get_depth()}')
    print(f'  Number of leaves: {estimator.get_n_leaves()}')

depths = [estimator.get_depth() for estimator in rf_clf.estimators_]
n_leaves = [estimator.get_n_leaves() for estimator in rf_clf.estimators_]

print(f'Average depth of the trees: {sum(depths) / len(depths)}')
print(f'Average number of leaves: {sum(n_leaves) / len(n_leaves)}')

n_estimators: 100
max_depth: None
min_samples_leaf: 1
max_features: sqrt
max_samples: None
Tree 0:
  Depth of the tree: 33
  Number of leaves: 2626
Tree 1:
  Depth of the tree: 40
  Number of leaves: 2710
Tree 2:
  Depth of the tree: 32
  Number of leaves: 2672
Tree 3:
  Depth of the tree: 33
  Number of leaves: 2656
Tree 4:
  Depth of the tree: 29
  Number of leaves: 2600
Tree 5:
  Depth of the tree: 31
  Number of leaves: 2646
Tree 6:
  Depth of the tree: 35
  Number of leaves: 2603
Tree 7:
  Depth of the tree: 33
  Number of leaves: 2634
Tree 8:
  Depth of the tree: 33
  Number of leaves: 2655
Tree 9:
  Depth of the tree: 34
  Number of leaves: 2638
Tree 10:
  Depth of the tree: 32
  Number of leaves: 2616
Tree 11:
  Depth of the tree: 30
  Number of leaves: 2729
Tree 12:
  Depth of the tree: 30
  Number of leaves: 2675
Tree 13:
  Depth of the tree: 27
  Number of leaves: 2646
Tree 14:
  Depth of the tree: 29
  Number of leaves: 2649
Tree 15:
  Depth of the tree: 36
  Number of leav

In [23]:
print(f'Number of features: {rf_clf.n_features_in_}')
print(f'Feature importances: {rf_clf.feature_importances_}')

Number of features: 400
Feature importances: [0.00250157 0.00108507 0.00124103 0.00101977 0.0009289  0.00116387
 0.00988061 0.01174773 0.00353146 0.00202011 0.0039592  0.00121446
 0.00112631 0.00308523 0.00121656 0.00126149 0.00109284 0.00115691
 0.00109636 0.00132356 0.00119286 0.00153817 0.00120562 0.001813
 0.00317641 0.00307448 0.00135237 0.00124751 0.00635582 0.00100653
 0.00864827 0.00934849 0.00118691 0.00151016 0.00311503 0.00589958
 0.00134518 0.00119834 0.00111537 0.00114217 0.001275   0.01380925
 0.01899741 0.00099133 0.00120015 0.00109478 0.0010649  0.00166735
 0.00200997 0.00126557 0.0137603  0.0012113  0.00204243 0.00115016
 0.00153908 0.00104006 0.00134308 0.00206127 0.00134187 0.00350981
 0.00100551 0.00103249 0.00153006 0.00106235 0.00606029 0.00098707
 0.01373141 0.00103794 0.00194688 0.00223084 0.00108835 0.00110091
 0.00114374 0.00212458 0.00393535 0.00106837 0.00109518 0.00327657
 0.00103284 0.00115274 0.00112219 0.00103832 0.00103661 0.00102024
 0.00923889 0.00216

In [24]:
# Dự đoán trên tập train
y_w2v_pred_train = rf_clf.predict(X_w2v_train)

# Đánh giá mô hình
accuracy_train = accuracy_score(y_w2v_train, y_w2v_pred_train)
report_train = classification_report(y_w2v_train, y_w2v_pred_train)

print(f'Accuracy: {accuracy_train}')
print('Classification Report:')
print(report_train)

Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   support

    negative       1.00      1.00      1.00     17589
    positive       1.00      1.00      1.00     17411

    accuracy                           1.00     35000
   macro avg       1.00      1.00      1.00     35000
weighted avg       1.00      1.00      1.00     35000



In [25]:
# Dự đoán trên tập test
y_w2v_pred = rf_clf.predict(X_w2v_test)

# Đánh giá mô hình
accuracy = accuracy_score(y_w2v_test, y_w2v_pred)
report = classification_report(y_w2v_test, y_w2v_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)



Accuracy: 0.8653333333333333
Classification Report:
              precision    recall  f1-score   support

    negative       0.87      0.85      0.86      7411
    positive       0.86      0.88      0.87      7589

    accuracy                           0.87     15000
   macro avg       0.87      0.87      0.87     15000
weighted avg       0.87      0.87      0.87     15000



### Áp dụng phương pháp GridSearch chọn các hyperparameter (tf-idf)

In [15]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [30, 50, None],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2'],
    'max_samples': [0.5, None],
}

grid_search = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=param_grid,
    cv=StratifiedKFold(n_splits=5),
)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)

{'max_depth': 20, 'max_features': 'sqrt', 'max_samples': None, 'min_samples_leaf': 1, 'n_estimators': 100}


#### Độ chính xác của mô hình với cách chọn tham số Grid Search (tf-idf)

In [16]:
from sklearn.metrics import accuracy_score, classification_report

# Predict on the small training set using the best estimator
y_pred_train = grid_search.best_estimator_.predict(X_train)

# Calculate accuracy and classification report
accuracy_train = accuracy_score(y_train, y_pred_train)
report_train = classification_report(y_train, y_pred_train) 

print(f'Accuracy: {accuracy_train}')
print('Classification Report:')
print(report_train)

Accuracy: 0.9234160255856168
Classification Report:
              precision    recall  f1-score   support

    negative       0.97      0.87      0.92     17237
    positive       0.89      0.97      0.93     17470

    accuracy                           0.92     34707
   macro avg       0.93      0.92      0.92     34707
weighted avg       0.93      0.92      0.92     34707



In [17]:
from sklearn.metrics import accuracy_score, classification_report

# Predict on the small test set using the best estimator
y_pred = grid_search.best_estimator_.predict(X_test)

# Calculate accuracy and classification report
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

Accuracy: 0.8372436974789916
Classification Report:
              precision    recall  f1-score   support

    negative       0.87      0.79      0.83      7461
    positive       0.81      0.88      0.84      7414

    accuracy                           0.84     14875
   macro avg       0.84      0.84      0.84     14875
weighted avg       0.84      0.84      0.84     14875



#### Kiểm tra các thuộc tính của cây 

In [None]:
best_clf = grid_search.best_estimator_

print(f'Depth of the tree: {best_clf.get_depth()}')
print(f'Number of leaves: {best_clf.get_n_leaves()}')
print(f'Number of features: {best_clf.n_features_in_}')
print(f'Feature importances: {best_clf.feature_importances_}')

### Áp dụng phương pháp GridSearch chọn các hyperparameter (word2vec)

In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [30, 50, None],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2'],
    'max_samples': [0.5, None],
}

grid_search = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=param_grid,
    cv=StratifiedKFold(n_splits=5),
)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)

#### Độ chính xác của mô hình với cách chọn tham số Grid Search (word2vec)

In [25]:
from sklearn.metrics import accuracy_score, classification_report

# Predict on the small training set using the best estimator
y_pred_train = grid_search.best_estimator_.predict(X_train)

# Calculate accuracy and classification report
accuracy_train = accuracy_score(y_train, y_pred_train)
report_train = classification_report(y_train, y_pred_train) 

print(f'Accuracy: {accuracy_train}')
print('Classification Report:')
print(report_train)

Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   support

    negative       1.00      1.00      1.00     17237
    positive       1.00      1.00      1.00     17470

    accuracy                           1.00     34707
   macro avg       1.00      1.00      1.00     34707
weighted avg       1.00      1.00      1.00     34707



In [26]:
from sklearn.metrics import accuracy_score, classification_report

# Predict on the small test set using the best estimator
y_pred = grid_search.best_estimator_.predict(X_test)

# Calculate accuracy and classification report
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

Accuracy: 0.8540504201680672
Classification Report:
              precision    recall  f1-score   support

    negative       0.86      0.84      0.85      7461
    positive       0.85      0.87      0.86      7414

    accuracy                           0.85     14875
   macro avg       0.85      0.85      0.85     14875
weighted avg       0.85      0.85      0.85     14875



#### Kiểm tra các thuộc tính của cây 

In [43]:
best_clf = grid_search.best_estimator_

print(f'Number of features: {best_clf.n_features_in_}')
print(f'Feature importances: {best_clf.feature_importances_}')

Number of features: 10000
Feature importances: [1.02874654e-06 5.47160727e-05 5.67187317e-06 ... 1.01276447e-05
 2.46950782e-05 0.00000000e+00]


### Bonus: Train trên dữ liệu nhỏ (tf-idf)

In [13]:
import pandas as pd
from scipy.sparse import csr_matrix

# Convert numpy arrays to sparse matrices
X_train_sparse = csr_matrix(X_train)
X_test_sparse = csr_matrix(X_test)

# Convert sparse matrices to dense DataFrames
X_train_df = pd.DataFrame.sparse.from_spmatrix(X_train_sparse)
X_test_df = pd.DataFrame.sparse.from_spmatrix(X_test_sparse)

# Reset indices to ensure alignment
X_train_df.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
X_test_df.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

# Sample from the dense DataFrames
small_X_train_df = X_train_df.sample(n=5000, random_state=42)
small_y_train = y_train.loc[small_X_train_df.index]

small_X_test_df = X_test_df.sample(n=1500 , random_state=42)
small_y_test = y_test.loc[small_X_test_df.index]

# Convert back to sparse matrices if needed
small_X_train = csr_matrix(small_X_train_df)
small_X_test = csr_matrix(small_X_test_df)

In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [30, 50, None],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2'],
    'max_samples': [0.5, None],
}

grid_search = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=param_grid,
    cv=StratifiedKFold(n_splits=5),
)
grid_search.fit(small_X_train, small_y_train)
print(grid_search.best_params_)

#### Độ chính xác của cách chọn tham số Grid Search trên tập dữ liệu nhỏ

In [29]:
from sklearn.metrics import accuracy_score, classification_report

# Predict on the small training set using the best estimator
small_y_pred_train = grid_search.best_estimator_.predict(small_X_train)

# Calculate accuracy and classification report
accuracy_train = accuracy_score(small_y_train, small_y_pred_train)
report_train = classification_report(small_y_train, small_y_pred_train)

print(f'Accuracy: {accuracy_train}')
print('Classification Report:')
print(report_train)

Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   support

          -1       1.00      1.00      1.00       512
           1       1.00      1.00      1.00       488

    accuracy                           1.00      1000
   macro avg       1.00      1.00      1.00      1000
weighted avg       1.00      1.00      1.00      1000



In [30]:
from sklearn.metrics import accuracy_score, classification_report

# Predict on the small test set using the best estimator
small_y_pred = grid_search.best_estimator_.predict(small_X_test)

# Calculate accuracy and classification report
accuracy = accuracy_score(small_y_test, small_y_pred)
report = classification_report(small_y_test, small_y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

Accuracy: 0.8175
Classification Report:
              precision    recall  f1-score   support

          -1       0.82      0.83      0.83       209
           1       0.81      0.80      0.81       191

    accuracy                           0.82       400
   macro avg       0.82      0.82      0.82       400
weighted avg       0.82      0.82      0.82       400



### Bonus: Train trên dữ liệu nhỏ (word2vec)

In [26]:
import pandas as pd
from scipy.sparse import csr_matrix

# Convert numpy arrays to sparse matrices
X_w2v_train_sparse = csr_matrix(X_w2v_train)
X_w2v_test_sparse = csr_matrix(X_w2v_test)

# Convert sparse matrices to dense DataFrames
X_w2v_train_df = pd.DataFrame.sparse.from_spmatrix(X_w2v_train_sparse)
X_w2v_test_df = pd.DataFrame.sparse.from_spmatrix(X_w2v_test_sparse)

# Reset indices to ensure alignment
X_w2v_train_df.reset_index(drop=True, inplace=True)
y_w2v_train.reset_index(drop=True, inplace=True)
X_w2v_test_df.reset_index(drop=True, inplace=True)
y_w2v_test.reset_index(drop=True, inplace=True)

# Sample from the dense DataFrames
small_X_w2v_train_df = X_w2v_train_df.sample(n=5000, random_state=42)
small_y_w2v_train = y_w2v_train.loc[small_X_w2v_train_df.index]

small_X_w2v_test_df = X_w2v_test_df.sample(n=1500 , random_state=42)
small_y_w2v_test = y_w2v_test.loc[small_X_w2v_test_df.index]

# Convert back to sparse matrices if needed
small_X_w2v_train = csr_matrix(small_X_w2v_train_df)
small_X_w2v_test = csr_matrix(small_X_w2v_test_df)

In [36]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [30, 50, None],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2'],
    'max_samples': [0.5, None],
}

grid_search = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=param_grid,
    cv=StratifiedKFold(n_splits=5),
)
grid_search.fit(small_X_w2v_train, small_y_w2v_train)
print(grid_search.best_params_)

{'max_depth': None, 'max_features': 'sqrt', 'max_samples': None, 'min_samples_leaf': 1, 'n_estimators': 100}


#### Độ chính xác của cách chọn tham số Grid Search trên tập dữ liệu nhỏ

In [37]:
from sklearn.metrics import accuracy_score, classification_report

# Predict on the small training set using the best estimator
small_y_w2v_pred_train = grid_search.best_estimator_.predict(small_X_w2v_train)

# Calculate accuracy and classification report
accuracy_train = accuracy_score(small_y_w2v_train, small_y_w2v_pred_train)
report_train = classification_report(small_y_w2v_train, small_y_w2v_pred_train)

print(f'Accuracy: {accuracy_train}')
print('Classification Report:')
print(report_train)

Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   support

    negative       1.00      1.00      1.00      2525
    positive       1.00      1.00      1.00      2475

    accuracy                           1.00      5000
   macro avg       1.00      1.00      1.00      5000
weighted avg       1.00      1.00      1.00      5000



In [38]:
from sklearn.metrics import accuracy_score, classification_report

# Predict on the small test set using the best estimator
small_y_w2v_pred = grid_search.best_estimator_.predict(small_X_w2v_test)

# Calculate accuracy and classification report
accuracy = accuracy_score(small_y_w2v_test, small_y_w2v_pred)
report = classification_report(small_y_w2v_test, small_y_w2v_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

Accuracy: 0.862
Classification Report:
              precision    recall  f1-score   support

    negative       0.88      0.84      0.86       740
    positive       0.85      0.88      0.87       760

    accuracy                           0.86      1500
   macro avg       0.86      0.86      0.86      1500
weighted avg       0.86      0.86      0.86      1500

