## Load data

In [1]:
import pandas as pd

train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')
test_true_labels = pd.read_csv('data/test_true_labels.csv')

In [2]:
train_df

Unnamed: 0,helpfulness_cat,imdb_user_review
0,1.0,It is hard to find such delightful and adorabl...
1,1.0,"They don't make films like this faded, hauntin..."
2,1.0,I first viewed this movie in 1924 at age 6 yrs...
3,1.0,I doubt that I'd ever seen anything resembling...
4,1.0,I was shocked to find myself riveted to this m...
...,...,...
10750,0.0,The makers of this movie really touched a sore...
10751,0.0,I Care A Lot is an exhilarating black comedy w...
10752,0.0,Really loved this. This film is masterful in t...
10753,0.0,"The story, direction and acting across the boa..."


## Encoding

### doc2vec

In [37]:
from nltk.tokenize import wordpunct_tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [60]:
# training data
train_tagged = [
    TaggedDocument(words=wordpunct_tokenize(text.lower()), tags=[str(i)])
    for i, text in enumerate(train_df["imdb_user_review"])
]

# testing data（no tags）
test_tagged = [
    wordpunct_tokenize(text.lower())
    for text in test_df["imdb_user_review"]
]


In [None]:
# train the doc2vec model
model_d2v = Doc2Vec(vector_size=128, window=3, min_count=1, workers=4, epochs=100)
model_d2v.build_vocab(train_tagged)
model_d2v.train(train_tagged, total_examples=model_d2v.corpus_count, epochs=model_d2v.epochs)


In [62]:
import numpy as np

# encode
X_train = np.array([model_d2v.dv[str(i)] for i in range(len(train_tagged))])
y_train = train_df["helpfulness_cat"].values

X_test = np.array([model_d2v.infer_vector(words) for words in test_tagged])
y_test = test_true_labels["helpfulness_cat"]

### Topic Model

In [18]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from gensim.utils import simple_preprocess

nltk.download("stopwords")
stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()

# Define preprocessing function:
# - lowercase and tokenize
# - remove stopwords
# - apply stemming
def preprocess(text):
    tokens = simple_preprocess(text, deacc=True)
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    return tokens

# Apply preprocessing to each review
X_train_tokens = train_df["imdb_user_review"].astype(str).apply(preprocess)
X_test_tokens = test_df["imdb_user_review"].astype(str).apply(preprocess)

[nltk_data] Downloading package stopwords to /Users/adam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
### Create dictionary and corpus (based only on the training set)

from gensim.corpora import Dictionary
from gensim.models import LdaModel

# Create a dictionary and bag-of-words corpus for the training set
dictionary = Dictionary(X_train_tokens)
train_corpus = [dictionary.doc2bow(tokens) for tokens in X_train_tokens]
test_corpus = [dictionary.doc2bow(tokens) for tokens in X_test_tokens]

In [None]:
### Train multiple LDA models with different numbers of topics

#topic_nums = [25]
topic_nums = [5, 10, 15, 20, 25, 30, 35, 40]
lda_models = {}

for num_topics in topic_nums:
    lda = LdaModel(corpus=train_corpus,
                   id2word=dictionary,
                   num_topics=num_topics,
                   passes=10,
                   random_state=38)
    lda_models[num_topics] = lda
    print(f"Finish training lda_models {num_topics}")

Finish training lda_models 25


In [21]:
### Convert each review to a topic distribution vector

import numpy as np

def get_topic_vector(model, corpus, num_topics):
    topic_vecs = []
    for doc_bow in corpus:
        doc_topics = model.get_document_topics(doc_bow, minimum_probability=0)
        vec = [prob for _, prob in sorted(doc_topics)]
        topic_vecs.append(vec)
    return np.array(topic_vecs)

In [72]:
### Train classifiers and evaluate performance for each topic model

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

results = []

for num_topics, lda_model in lda_models.items():
    X_train_vec = get_topic_vector(lda_model, train_corpus, num_topics)
    X_test_vec = get_topic_vector(lda_model, test_corpus, num_topics)
    
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train_vec, y_train)
    y_pred = clf.predict(X_test_vec)
    y_prob = clf.predict_proba(X_test_vec)[:, 1]

    results.append({
        "num_topics": num_topics,
        "accuracy": accuracy_score(y_test, y_pred),
        "f1_score": f1_score(y_test, y_pred),
        "auc": roc_auc_score(y_test, y_prob)
    })

# Show results sorted by F1-score
results_df = pd.DataFrame(results)
print(results_df.sort_values(by="f1_score", ascending=False))

   num_topics  accuracy  f1_score       auc
4          25  0.773615  0.871732  0.672734
1          10  0.772234  0.871424  0.651568
0           5  0.772037  0.871355  0.643560
3          20  0.772037  0.871355  0.658504
6          35  0.772037  0.871183  0.656774
2          15  0.772629  0.871101  0.656515
7          40  0.771248  0.870420  0.652376
5          30  0.768290  0.868847  0.652619


In [74]:
X_train = get_topic_vector(lda_models[25], train_corpus, num_topics)
X_test = get_topic_vector(lda_models[25], test_corpus, num_topics)
y_train = train_df["helpfulness_cat"]
y_test = test_true_labels["helpfulness_cat"]

### SentenceTransformer

In [None]:
from sentence_transformers import SentenceTransformer

#model = SentenceTransformer('all-MiniLM-L6-v2')
model = SentenceTransformer('paraphrase-mpnet-base-v2')

# encode
X_train = model.encode(train_df["imdb_user_review"].tolist(), show_progress_bar=True)
X_test = model.encode(test_df["imdb_user_review"].tolist(), show_progress_bar=True)
y_train = train_df["helpfulness_cat"]
y_test = test_true_labels["helpfulness_cat"]

Batches:   0%|          | 0/337 [00:00<?, ?it/s]

Batches:   0%|          | 0/159 [00:00<?, ?it/s]

In [4]:
X_train.shape

(10755, 768)

### Encoder fusion

#### using 'all-MiniLM-L6-v2' and 'paraphrase-mpnet-base-v2' model

In [3]:
from sentence_transformers import SentenceTransformer
import numpy as np

# set model
model1 = SentenceTransformer('all-MiniLM-L6-v2')
model2 = SentenceTransformer('paraphrase-mpnet-base-v2')

# encode
emb1_train = model1.encode(train_df["imdb_user_review"].tolist(), show_progress_bar=True)
emb1_test = model1.encode(test_df["imdb_user_review"].tolist(), show_progress_bar=True)
emb2_train = model2.encode(train_df["imdb_user_review"].tolist(), show_progress_bar=True)
emb2_test = model2.encode(test_df["imdb_user_review"].tolist(), show_progress_bar=True)

Batches:   0%|          | 0/337 [00:00<?, ?it/s]

Batches:   0%|          | 0/159 [00:00<?, ?it/s]

Batches:   0%|          | 0/337 [00:00<?, ?it/s]

Batches:   0%|          | 0/159 [00:00<?, ?it/s]

In [4]:
emb1_train.shape

(10755, 384)

#### topic modeling probability with 25 topics

In [5]:
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from gensim.utils import simple_preprocess
from gensim.corpora import Dictionary
from gensim.models import LdaModel

# Download NLTK stopwords
nltk.download("stopwords")

# Initialize stopword set and stemmer
stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()

# Preprocessing function:
# - lowercase and tokenize - remove stopwords - apply stemming
def preprocess(text):
    tokens = simple_preprocess(text, deacc=True)
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    return tokens

# Apply preprocessing to training and testing reviews
X_train_tokens = train_df["imdb_user_review"].astype(str).apply(preprocess)
X_test_tokens = test_df["imdb_user_review"].astype(str).apply(preprocess)

# Create dictionary and bag-of-words corpus
dictionary = Dictionary(X_train_tokens)
train_corpus = [dictionary.doc2bow(tokens) for tokens in X_train_tokens]
test_corpus = [dictionary.doc2bow(tokens) for tokens in X_test_tokens]

# Train LDA model with 25 topics
num_topics = 25
lda = LdaModel(corpus=train_corpus,
               id2word=dictionary,
               num_topics=num_topics,
               passes=10,
               random_state=38)

# Function to convert each document into a topic distribution vector
def get_topic_vector(model, corpus, num_topics):
    topic_vecs = []
    for doc_bow in corpus:
        doc_topics = model.get_document_topics(doc_bow, minimum_probability=0)
        vec = [prob for _, prob in sorted(doc_topics)]
        topic_vecs.append(vec)
    return np.array(topic_vecs)

# Get topic distribution vectors for training and testing sets
X_train_topic = get_topic_vector(lda, train_corpus, num_topics)
X_test_topic = get_topic_vector(lda, test_corpus, num_topics)

[nltk_data] Downloading package stopwords to /Users/adam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
X_train_topic.shape

(10755, 25)

#### TF-IDF encoding

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=100, stop_words='english')
X_train_tfidf = tfidf.fit_transform(train_df["imdb_user_review"])
X_test_tfidf = tfidf.transform(test_df["imdb_user_review"])

In [8]:
X_train_tfidf.shape

(10755, 100)

#### conbine vector

In [9]:
# conbined vector
X_train = np.concatenate([emb1_train, emb2_train, X_train_topic, X_train_tfidf.toarray()], axis=1)
X_test = np.concatenate([emb1_test, emb2_test, X_test_topic, X_test_tfidf.toarray()], axis=1)
y_train = train_df["helpfulness_cat"]
y_test = test_true_labels["helpfulness_cat"]

In [76]:
X_train.shape

(10755, 1277)

## Classifier

### Random forest

In [43]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

print("RF Accuracy:", accuracy_score(y_test, y_pred_rf))

RF Accuracy: 0.7809110629067245


### Neural Network classifier

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

model_nn = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

model_nn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_nn.fit(X_train, y_train, epochs=15, batch_size=32, validation_split=0.1)


Epoch 1/15


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m303/303[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7677 - loss: 0.5443 - val_accuracy: 0.7007 - val_loss: 0.5896
Epoch 2/15
[1m303/303[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 509us/step - accuracy: 0.7804 - loss: 0.4654 - val_accuracy: 0.7072 - val_loss: 0.5975
Epoch 3/15
[1m303/303[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 481us/step - accuracy: 0.7964 - loss: 0.4478 - val_accuracy: 0.7072 - val_loss: 0.6182
Epoch 4/15
[1m303/303[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 583us/step - accuracy: 0.8143 - loss: 0.4145 - val_accuracy: 0.7063 - val_loss: 0.6197
Epoch 5/15
[1m303/303[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 465us/step - accuracy: 0.8335 - loss: 0.3788 - val_accuracy: 0.7035 - val_loss: 0.6241
Epoch 6/15
[1m303/303[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 483us/step - accuracy: 0.8518 - loss: 0.3492 - val_accuracy: 0.6803 - val_loss: 0.6573
Epoch 7/15
[1m303/303[0m [

<keras.src.callbacks.history.History at 0x43c90ab40>

In [10]:
import tensorflow as tf
from tensorflow.keras.layers import Flatten, Dense, BatchNormalization, Dropout, Activation
from tensorflow.keras.regularizers import l2

input_shape=(X_train.shape[1],)

inputs = tf.keras.Input(shape=input_shape)
x = Dense(384, activation=None, kernel_regularizer=l2(0.01))(inputs)
x = BatchNormalization()(x)
x = Activation("relu")(x)

x = Dropout(0.5)(x)  # 50% dropout

x = Dense(192, activation=None, kernel_regularizer=l2(0.01))(x)
x = BatchNormalization()(x)
x = Activation("relu")(x)

x = Dropout(0.5)(x)  # 50% dropout

x = Dense(96, activation=None, kernel_regularizer=l2(0.01))(x)
x = BatchNormalization()(x)
x = Activation("relu")(x)

x = Dropout(0.5)(x)  # 50% dropout

outputs = Dense(1, activation="sigmoid")(x)
model = tf.keras.Model(inputs, outputs)

In [11]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.05)

Epoch 1/50
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.7131 - loss: 6.7923 - val_accuracy: 0.7268 - val_loss: 2.0354
Epoch 2/50
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7786 - loss: 1.5969 - val_accuracy: 0.7454 - val_loss: 1.0745
Epoch 3/50
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7856 - loss: 0.9316 - val_accuracy: 0.7212 - val_loss: 0.8954
Epoch 4/50
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7857 - loss: 0.7771 - val_accuracy: 0.7435 - val_loss: 0.8720
Epoch 5/50
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7841 - loss: 0.7706 - val_accuracy: 0.7305 - val_loss: 0.8638
Epoch 6/50
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7866 - loss: 0.7253 - val_accuracy: 0.6803 - val_loss: 0.8793
Epoch 7/50
[1m320/320[0m 

<keras.src.callbacks.history.History at 0x6e9e57ad0>

In [12]:
# predict the testing data
y_pred_nn = model.predict(X_test)

# convert probability to 0 and 1, using threshold = 0.5
y_pred_nn_binary = (y_pred_nn > 0.5).astype(int).flatten()


[1m159/159[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 701us/step


In [13]:
from sklearn.metrics import accuracy_score

print("NN Accuracy:", accuracy_score(y_test, y_pred_nn_binary))


NN Accuracy: 0.7990534411358706


In [14]:
from sklearn.metrics import classification_report

# Print classification report (includes precision, recall, f1-score, and support)
print(classification_report(y_test, y_pred_nn_binary))

              precision    recall  f1-score   support

         0.0       0.59      0.40      0.47      1156
         1.0       0.84      0.92      0.88      3915

    accuracy                           0.80      5071
   macro avg       0.71      0.66      0.68      5071
weighted avg       0.78      0.80      0.78      5071

