In [1]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from gensim.models.doc2vec import TaggedDocument
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import pandas as pd
import numpy as np

In [2]:
splits = {'train': 'plain_text/train-00000-of-00001.parquet', 'test': 'plain_text/test-00000-of-00001.parquet', 'unsupervised': 'plain_text/unsupervised-00000-of-00001.parquet'}
df = pd.read_parquet("hf://datasets/stanfordnlp/imdb/" + splits["train"])

In [3]:
df.head()

Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0


In [4]:
data = df['text'].tolist()
data[:3]

['I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between, e

In [5]:
s = pd.Series(data)
s[:3]

0    I rented I AM CURIOUS-YELLOW from my video sto...
1    "I Am Curious: Yellow" is a risible and preten...
2    If only to avoid making this type of film in t...
dtype: object

In [6]:
stop_words = set(stopwords.words('english'))
def preprocess(text):
    text = text.lower()
    text = ''.join([word for word in text if word not in string.punctuation])
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

In [7]:
s_upd = s.apply(preprocess)
s_upd[:3]

0    rented curiousyellow video store controversy s...
1    curious yellow risible pretentious steaming pi...
2    avoid making type film future film interesting...
dtype: object

In [8]:
clean_df = pd.DataFrame(s_upd, columns=['text'])
clean_df['label'] = df['label']
clean_df.head()

Unnamed: 0,text,label
0,rented curiousyellow video store controversy s...,0
1,curious yellow risible pretentious steaming pi...,0
2,avoid making type film future film interesting...,0
3,film probably inspired godards masculin fémini...,0
4,oh brotherafter hearing ridiculous film umptee...,0


In [9]:
%%time

stop_words = set(stopwords.words('english'))

tokenized_docs = [nltk.word_tokenize(text.lower()) for text in clean_df['text']]

preprocessed_docs = [
    [word for word in tokens if word.isalnum() and word not in stop_words]
    for tokens in tokenized_docs
]

CPU times: total: 7.23 s
Wall time: 7.23 s


In [10]:
X_train, X_test, y_train, y_test = train_test_split(preprocessed_docs, clean_df['label'], test_size=0.2, random_state=42)

In [11]:
train_tagged = [TaggedDocument(words=doc, tags=[str(i)]) for i, doc in enumerate(X_train)]

In [12]:
%%time

doc2vec_model = Doc2Vec(vector_size=100, window=5, min_count=1, workers=4, epochs=5)

doc2vec_model.build_vocab(train_tagged)

doc2vec_model.train(train_tagged, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)

CPU times: total: 28.5 s
Wall time: 11.2 s


In [13]:
%%time

train_vectors = [doc2vec_model.dv[tag] for tag in range(len(X_train))]
test_vectors = [doc2vec_model.infer_vector(doc) for doc in X_test]

CPU times: total: 5.03 s
Wall time: 5.03 s


In [14]:
classifier = LogisticRegression()
classifier.fit(train_vectors, y_train)

In [15]:
predictions = classifier.predict(test_vectors)

In [16]:
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, predictions))

Accuracy: 0.80

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.81      0.80      2515
           1       0.81      0.79      0.80      2485

    accuracy                           0.80      5000
   macro avg       0.80      0.80      0.80      5000
weighted avg       0.80      0.80      0.80      5000



In [17]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer

In [18]:
classifier = LogisticRegression()

In [19]:
cv_results = cross_validate(
    classifier,
    np.array(train_vectors),
    y_train,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    scoring={"accuracy": make_scorer(accuracy_score)},
    return_train_score=True,
)

In [20]:
print("Cross-validation results:")
print(f"Train Accuracy: {np.mean(cv_results['train_accuracy']):.2f}")
print(f"Test Accuracy: {np.mean(cv_results['test_accuracy']):.2f}")

Cross-validation results:
Train Accuracy: 0.81
Test Accuracy: 0.80
