# Sentiment analysis

## Tf-idf

In [1]:
import pandas as pd 

# Load the data
data = pd.read_csv(r"../resources/processed_data.csv")
data.head()

Unnamed: 0,Processed_Review,sentiment
0,one review mention watch oz episod hook right ...,positive
1,wonder littl product film techniqu unassum old...,positive
2,thought wonder way spend time hot summer weeke...,positive
3,basic famili littl boy jake think zombi closet...,negative
4,petter mattei love time money visual stun film...,positive


In [2]:

label_counts = data['sentiment'].value_counts()

label_ratios = label_counts / len(data) * 100 

print("Tỷ lệ các nhãn:")
print(label_ratios)

Tỷ lệ các nhãn:
sentiment
positive    50.187568
negative    49.812432
Name: count, dtype: float64


### Chia tập dữ liệu

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [4]:
# Chuyển dữ liệu sang đặc trưng số sử dụng TF-IDF
tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1, 2)) 
X = tfidf.fit_transform(data['Processed_Review']).toarray()
# Chia data thành các tập train và test
y = data['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

## Word2Vec

In [5]:
import pickle 
with open("../resources/X_w2v_train.pkl", "rb") as f:
    X_w2v_train = pickle.load(f)
with open("../resources/X_w2v_test.pkl", "rb") as f:
    X_w2v_test = pickle.load(f)
with open("../resources/y_train.pkl", "rb") as f:
    y_w2v_train = pickle.load(f)
with open("../resources/y_test.pkl", "rb") as f:
    y_w2v_test = pickle.load(f)

## Mô hình

### Decision tree (Không tham số - tfidf)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Huấn luyện mô hình Decision Tree
clf = DecisionTreeClassifier(
    random_state=42, 
)
clf.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Dự đoán trên tập huấn luyện
y_pred_train = clf.predict(X_train)

# Tính toán độ chính xác và báo cáo phân loại cho tập huấn luyện
accuracy_train = accuracy_score(y_train, y_pred_train)
report_train = classification_report(y_train, y_pred_train)

# In kết quả cho tập huấn luyện
print(f'Accuracy on training set: {accuracy_train}')
print('Classification Report on training set:')
print(report_train)

# Dự đoán trên tập kiểm tra
y_pred_test = clf.predict(X_test)

# Tính toán độ chính xác và báo cáo phân loại cho tập kiểm tra
accuracy_test = accuracy_score(y_test, y_pred_test)
report_test = classification_report(y_test, y_pred_test)

# In kết quả cho tập kiểm tra
print(f'Accuracy on test set: {accuracy_test}')
print('Classification Report on test set:')
print(report_test)

Accuracy on training set: 1.0
Classification Report on training set:
              precision    recall  f1-score   support

    negative       1.00      1.00      1.00     17237
    positive       1.00      1.00      1.00     17470

    accuracy                           1.00     34707
   macro avg       1.00      1.00      1.00     34707
weighted avg       1.00      1.00      1.00     34707

Accuracy on test set: 0.7169747899159664
Classification Report on test set:
              precision    recall  f1-score   support

    negative       0.72      0.71      0.72      7461
    positive       0.71      0.72      0.72      7414

    accuracy                           0.72     14875
   macro avg       0.72      0.72      0.72     14875
weighted avg       0.72      0.72      0.72     14875



In [None]:
print(f'Depth of the tree: {clf.get_depth()}')
print(f'Number of leaves: {clf.get_n_leaves()}')
print(f'Number of features: {clf.n_features_in_}')
print(f'Feature importances: {clf.feature_importances_}')
print(f'max_depth: {clf.max_depth}')
print(f'min_samples_split: {clf.min_samples_split}')
print(f'min_samples_leaf: {clf.min_samples_leaf}')
print(f'criterion: {clf.criterion}')

Depth of the tree: 152
Number of leaves: 3162
Number of features: 10000
Feature importances: [0.         0.00043015 0.         ... 0.         0.         0.        ]
max_depth: None
min_samples_split: 2
min_samples_leaf: 1
criterion: gini


### Decision tree (Không tham số - word2vec)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Huấn luyện mô hình Decision Tree
clf = DecisionTreeClassifier(
    random_state=42,
)
clf.fit(X_w2v_train, y_w2v_train)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Dự đoán trên tập huấn luyện
y_w2v_pred_train = clf.predict(X_w2v_train)

# Tính toán độ chính xác và báo cáo phân loại cho tập huấn luyện
accuracy_train = accuracy_score(y_w2v_train, y_w2v_pred_train)
report_train = classification_report(y_w2v_train, y_w2v_pred_train)

# In kết quả cho tập huấn luyện
print(f'Accuracy on training set: {accuracy_train}')
print('Classification Report on training set:')
print(report_train)

# Dự đoán trên tập kiểm tra
y_w2v_pred_test = clf.predict(X_w2v_test)

# Tính toán độ chính xác và báo cáo phân loại cho tập kiểm tra
accuracy_test = accuracy_score(y_w2v_test, y_w2v_pred_test)
report_test = classification_report(y_w2v_test, y_w2v_pred_test)

# In kết quả cho tập kiểm tra
print(f'Accuracy on test set: {accuracy_test}')
print('Classification Report on test set:')
print(report_test)

Accuracy on training set: 0.8324857142857143
Classification Report on training set:
              precision    recall  f1-score   support

    negative       0.82      0.85      0.84     17589
    positive       0.84      0.82      0.83     17411

    accuracy                           0.83     35000
   macro avg       0.83      0.83      0.83     35000
weighted avg       0.83      0.83      0.83     35000

Accuracy on test set: 0.7961333333333334
Classification Report on test set:
              precision    recall  f1-score   support

    negative       0.78      0.81      0.80      7411
    positive       0.81      0.78      0.79      7589

    accuracy                           0.80     15000
   macro avg       0.80      0.80      0.80     15000
weighted avg       0.80      0.80      0.80     15000



In [None]:
print(f'Depth of the tree: {clf.get_depth()}')
print(f'Number of leaves: {clf.get_n_leaves()}')
print(f'Number of features: {clf.n_features_in_}')
print(f'Feature importances: {clf.feature_importances_}')
print(f'max_depth: {clf.max_depth}')
print(f'min_samples_split: {clf.min_samples_split}')
print(f'min_samples_leaf: {clf.min_samples_leaf}')
print(f'criterion: {clf.criterion}')

Depth of the tree: 7
Number of leaves: 127
Number of features: 400
Feature importances: [0.         0.         0.         0.         0.         0.
 0.03238466 0.00146859 0.01714262 0.         0.         0.
 0.00093593 0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.01193756 0.00789582 0.         0.         0.00191763 0.0043235
 0.         0.         0.         0.         0.         0.09599793
 0.00436029 0.         0.         0.         0.         0.
 0.         0.         0.00613223 0.         0.         0.
 0.         0.         0.00073107 0.         0.         0.
 0.         0.         0.         0.         0.00265088 0.
 0.02328013 0.         0.00052367 0.00087825 0.         0.
 0.         0.00549852 0.00112426 0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         

### Áp dụng phương pháp GridSearch chọn các hyperparameter (tf-idf)

In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier

param_grid = {
    'max_depth': [20, 30, 40],
    'min_samples_split': [2, 3, 5],
    'min_samples_leaf': [1, 2, 3],
    'criterion': ['gini', 'entropy'],
}

grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=param_grid,
    cv=StratifiedKFold(n_splits=5),
)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)


In [None]:
print(grid_search.best_estimator_)

DecisionTreeClassifier(max_depth=20, min_samples_leaf=2, min_samples_split=5)


#### Độ chính xác của mô hình với cách chọn tham số Grid Search (tf-idf)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Predict on the small training set using the best estimator
y_pred_train = grid_search.best_estimator_.predict(X_train)

# Calculate accuracy and classification report
accuracy_train = accuracy_score(y_train, y_pred_train)
report_train = classification_report(y_train, y_pred_train)

print(f'Accuracy: {accuracy_train}')
print('Classification Report:')
print(report_train)

Accuracy: 0.8352571428571428
Classification Report:
              precision    recall  f1-score   support

          -1       0.90      0.75      0.82     17589
           1       0.79      0.92      0.85     17411

    accuracy                           0.84     35000
   macro avg       0.84      0.84      0.83     35000
weighted avg       0.84      0.84      0.83     35000



In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Predict on the small test set using the best estimator
y_pred = grid_search.best_estimator_.predict(X_test)

# Calculate accuracy and classification report
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

Accuracy: 0.7463333333333333
Classification Report:
              precision    recall  f1-score   support

          -1       0.79      0.66      0.72      7411
           1       0.72      0.83      0.77      7589

    accuracy                           0.75     15000
   macro avg       0.75      0.75      0.74     15000
weighted avg       0.75      0.75      0.74     15000



#### Kiểm tra các thuộc tính của cây 

In [None]:
best_clf = grid_search.best_estimator_

print(f'Depth of the tree: {best_clf.get_depth()}')
print(f'Number of leaves: {best_clf.get_n_leaves()}')
print(f'Number of features: {best_clf.n_features_in_}')
print(f'Feature importances: {best_clf.feature_importances_}')

Depth of the tree: 20
Number of leaves: 879
Number of features: 5000
Feature importances: [0. 0. 0. ... 0. 0. 0.]


### Áp dụng phương pháp GridSearch chọn các hyperparameter (word2vec)

In [16]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier

param_grid = {
    'max_depth': [5, 7, 10],
    'min_samples_split': [2, 3],
    'min_samples_leaf': [1, 2],
    'criterion': ['gini', 'entropy'],
}

grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=param_grid,
    cv=StratifiedKFold(n_splits=5),
)
grid_search.fit(X_w2v_train, y_w2v_train)
print(grid_search.best_params_)


{'criterion': 'entropy', 'max_depth': 7, 'min_samples_leaf': 1, 'min_samples_split': 2}


In [17]:
print(grid_search.best_estimator_)

DecisionTreeClassifier(criterion='entropy', max_depth=7)


#### Độ chính xác của mô hình với cách chọn tham số Grid Search (word2vec)

In [19]:
from sklearn.metrics import accuracy_score, classification_report

# Predict on the small training set using the best estimator
y_w2v_pred_train = grid_search.best_estimator_.predict(X_w2v_train)

# Calculate accuracy and classification report
accuracy_train = accuracy_score(y_w2v_train, y_w2v_pred_train)
report_train = classification_report(y_w2v_train, y_w2v_pred_train)

print(f'Accuracy: {accuracy_train}')
print('Classification Report:')
print(report_train)

Accuracy: 0.8238
Classification Report:
              precision    recall  f1-score   support

    negative       0.83      0.82      0.82     17589
    positive       0.82      0.83      0.82     17411

    accuracy                           0.82     35000
   macro avg       0.82      0.82      0.82     35000
weighted avg       0.82      0.82      0.82     35000



In [21]:
from sklearn.metrics import accuracy_score, classification_report

# Predict on the small test set using the best estimator
y_w2v_pred = grid_search.best_estimator_.predict(X_w2v_test)

# Calculate accuracy and classification report
accuracy = accuracy_score(y_w2v_test, y_w2v_pred)
report = classification_report(y_w2v_test, y_w2v_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

Accuracy: 0.7933333333333333
Classification Report:
              precision    recall  f1-score   support

    negative       0.80      0.78      0.79      7411
    positive       0.79      0.81      0.80      7589

    accuracy                           0.79     15000
   macro avg       0.79      0.79      0.79     15000
weighted avg       0.79      0.79      0.79     15000



#### Kiểm tra các thuộc tính của cây 

In [None]:
best_clf = grid_search.best_estimator_

print(f'Depth of the tree: {best_clf.get_depth()}')
print(f'Number of leaves: {best_clf.get_n_leaves()}')
print(f'Number of features: {best_clf.n_features_in_}')
print(f'Feature importances: {best_clf.feature_importances_}')

Depth of the tree: 20
Number of leaves: 879
Number of features: 5000
Feature importances: [0. 0. 0. ... 0. 0. 0.]


### Bonus: Train trên dữ liệu nhỏ (tf-idf)

In [6]:
import pandas as pd
from scipy.sparse import csr_matrix

# Convert numpy arrays to sparse matrices
X_train_sparse = csr_matrix(X_train)
X_test_sparse = csr_matrix(X_test)

# Convert sparse matrices to dense DataFrames
X_train_df = pd.DataFrame.sparse.from_spmatrix(X_train_sparse)
X_test_df = pd.DataFrame.sparse.from_spmatrix(X_test_sparse)

# Reset indices to ensure alignment
X_train_df.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
X_test_df.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

# Sample from the dense DataFrames
small_X_train_df = X_train_df.sample(n=5000, random_state=42)
small_y_train = y_train.loc[small_X_train_df.index]

small_X_test_df = X_test_df.sample(n=1500 , random_state=42)
small_y_test = y_test.loc[small_X_test_df.index]

# Convert back to sparse matrices if needed
small_X_train = csr_matrix(small_X_train_df)
small_X_test = csr_matrix(small_X_test_df)

In [7]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier

param_grid = {
    'max_depth': [20, 30],
    'min_samples_split': [2, 3],
    'min_samples_leaf': [1, 2],
    'criterion': ['gini', 'entropy'],
}

grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=param_grid,
    cv=StratifiedKFold(n_splits=5),
)
grid_search.fit(small_X_train, small_y_train)
print(grid_search.best_params_)

{'criterion': 'gini', 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2}


In [8]:
print(grid_search.best_estimator_)

DecisionTreeClassifier(max_depth=20)


#### Độ chính xác của cách chọn tham số Grid Search trên tập dữ liệu nhỏ (tf-idf)

In [9]:
from sklearn.metrics import accuracy_score, classification_report

# Predict on the small training set using the best estimator
small_y_pred_train = grid_search.best_estimator_.predict(small_X_train)

# Calculate accuracy and classification report
accuracy_train = accuracy_score(small_y_train, small_y_pred_train)
report_train = classification_report(small_y_train, small_y_pred_train)

print(f'Accuracy: {accuracy_train}')
print('Classification Report:')
print(report_train)

Accuracy: 0.905
Classification Report:
              precision    recall  f1-score   support

    negative       0.98      0.83      0.90      2471
    positive       0.85      0.98      0.91      2529

    accuracy                           0.91      5000
   macro avg       0.92      0.90      0.90      5000
weighted avg       0.91      0.91      0.90      5000



In [10]:
from sklearn.metrics import accuracy_score, classification_report

# Predict on the small test set using the best estimator
small_y_pred = grid_search.best_estimator_.predict(small_X_test)

# Calculate accuracy and classification report
accuracy = accuracy_score(small_y_test, small_y_pred)
report = classification_report(small_y_test, small_y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

Accuracy: 0.692
Classification Report:
              precision    recall  f1-score   support

    negative       0.73      0.63      0.68       765
    positive       0.66      0.76      0.71       735

    accuracy                           0.69      1500
   macro avg       0.70      0.69      0.69      1500
weighted avg       0.70      0.69      0.69      1500



### Bonus: Train trên dữ liệu nhỏ (word2vec)

In [11]:
import pandas as pd
from scipy.sparse import csr_matrix

# Convert numpy arrays to sparse matrices
X_w2v_train_sparse = csr_matrix(X_w2v_train)
X_w2v_test_sparse = csr_matrix(X_w2v_test)

# Convert sparse matrices to dense DataFrames
X_w2v_train_df = pd.DataFrame.sparse.from_spmatrix(X_w2v_train_sparse)
X_w2v_test_df = pd.DataFrame.sparse.from_spmatrix(X_w2v_test_sparse)

# Reset indices to ensure alignment
X_w2v_train_df.reset_index(drop=True, inplace=True)
y_w2v_train.reset_index(drop=True, inplace=True)
X_w2v_test_df.reset_index(drop=True, inplace=True)
y_w2v_test.reset_index(drop=True, inplace=True)

# Sample from the dense DataFrames
small_X_w2v_train_df = X_w2v_train_df.sample(n=5000, random_state=42)
small_y_w2v_train = y_w2v_train.loc[small_X_w2v_train_df.index]

small_X_w2v_test_df = X_w2v_test_df.sample(n=1500 , random_state=42)
small_y_w2v_test = y_w2v_test.loc[small_X_w2v_test_df.index]

# Convert back to sparse matrices if needed
small_X_w2v_train = csr_matrix(small_X_w2v_train_df)
small_X_w2v_test = csr_matrix(small_X_w2v_test_df)

In [12]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier

param_grid = {
    'max_depth': [5, 7, 10],
    'min_samples_split': [2, 3],
    'min_samples_leaf': [1, 2],
    'criterion': ['gini', 'entropy'],
}

grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=param_grid,
    cv=StratifiedKFold(n_splits=5),
)
grid_search.fit(small_X_w2v_train, small_y_w2v_train)
print(grid_search.best_params_)

{'criterion': 'entropy', 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 3}


In [13]:
print(grid_search.best_estimator_)

DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_split=3)


#### Độ chính xác của cách chọn tham số Grid Search trên tập dữ liệu nhỏ (word2vec)

In [14]:
from sklearn.metrics import accuracy_score, classification_report

# Predict on the small training set using the best estimator
small_y_w2v_pred_train = grid_search.best_estimator_.predict(small_X_w2v_train)

# Calculate accuracy and classification report
accuracy_train = accuracy_score(small_y_w2v_train, small_y_w2v_pred_train)
report_train = classification_report(small_y_w2v_train, small_y_w2v_pred_train)

print(f'Accuracy: {accuracy_train}')
print('Classification Report:')
print(report_train)

Accuracy: 0.8218
Classification Report:
              precision    recall  f1-score   support

    negative       0.81      0.85      0.83      2525
    positive       0.84      0.79      0.81      2475

    accuracy                           0.82      5000
   macro avg       0.82      0.82      0.82      5000
weighted avg       0.82      0.82      0.82      5000



In [15]:
from sklearn.metrics import accuracy_score, classification_report

# Predict on the small test set using the best estimator
small_y_w2v_pred = grid_search.best_estimator_.predict(small_X_w2v_test)

# Calculate accuracy and classification report
accuracy = accuracy_score(small_y_w2v_test, small_y_w2v_pred)
report = classification_report(small_y_w2v_test, small_y_w2v_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

Accuracy: 0.7766666666666666
Classification Report:
              precision    recall  f1-score   support

    negative       0.76      0.81      0.78       740
    positive       0.80      0.75      0.77       760

    accuracy                           0.78      1500
   macro avg       0.78      0.78      0.78      1500
weighted avg       0.78      0.78      0.78      1500

