In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [2]:
splits = {'train': 'plain_text/train-00000-of-00001.parquet', 'test': 'plain_text/test-00000-of-00001.parquet', 'unsupervised': 'plain_text/unsupervised-00000-of-00001.parquet'}
df = pd.read_parquet("hf://datasets/stanfordnlp/imdb/" + splits["train"])

In [3]:
df.head(10)

Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0
5,I would put this at the top of my list of film...,0
6,Whoever wrote the screenplay for this movie ob...,0
7,"When I first saw a glimpse of this movie, I qu...",0
8,"Who are these ""They""- the actors? the filmmake...",0
9,This is said to be a personal film for Peter B...,0


In [4]:
df['label'].value_counts()

label
0    12500
1    12500
Name: count, dtype: int64

In [5]:
X = df.text
y = df.label
print(X.shape)
print(y.shape)

(25000,)
(25000,)


In [6]:
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(20000,)
(5000,)
(20000,)
(5000,)


In [8]:
vect = CountVectorizer()

In [9]:
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)

In [10]:
X_train_dtm = vect.fit_transform(X_train)

In [11]:
X_train_dtm

<20000x68268 sparse matrix of type '<class 'numpy.int64'>'
	with 2752639 stored elements in Compressed Sparse Row format>

In [12]:
X_test_dtm = vect.transform(X_test)
X_test_dtm

<5000x68268 sparse matrix of type '<class 'numpy.int64'>'
	with 686126 stored elements in Compressed Sparse Row format>

In [13]:
nb = MultinomialNB()

In [14]:
%time nb.fit(X_train_dtm, y_train)

CPU times: total: 15.6 ms
Wall time: 16 ms


In [15]:
y_pred_class = nb.predict(X_test_dtm)

In [16]:
metrics.accuracy_score(y_test, y_pred_class)

0.846

In [17]:
metrics.confusion_matrix(y_test, y_pred_class)

array([[2202,  313],
       [ 457, 2028]], dtype=int64)

In [18]:
y_pred_prob = nb.predict_proba(X_test_dtm)[:, 1]
y_pred_prob[:10]

array([1.49172620e-05, 1.00000000e+00, 1.33330375e-10, 9.99999817e-01,
       1.00000000e+00, 3.74762189e-07, 2.07993798e-37, 1.00000000e+00,
       3.17273536e-09, 9.72775501e-02])

In [19]:
metrics.roc_auc_score(y_test, y_pred_prob)

0.9207510670384134

In [20]:
import torch
import torch.nn as nn
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import torch.nn.functional as F
import random

In [21]:
data = df['text'].tolist()
data[:10]

['I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between, e

In [22]:
tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(data)]

In [23]:
max_epochs = 5
vec_size = 20
alpha = 0.025

model = Doc2Vec(vector_size=vec_size, alpha=alpha, min_alpha=0.00025, min_count=1, dm=1)

model.build_vocab(tagged_data)

In [24]:
for epoch in range(max_epochs):
    print("iteration {0}".format(epoch))
    model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)
    model.alpha -= 0.0002
    model.min_alpha = model.alpha

model.save("d2v.model")
print("Model Saved")

iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
Model Saved


In [25]:
df1 = df[df['label'] == 0]    
df2 = df[df['label'] == 1] 

In [26]:
len(df1)

12500

In [27]:
len(df2)

12500

In [28]:
df1 = df1[:2000]
df2 = df2[:2000]

In [29]:
df = pd.concat([df1, df2], ignore_index=True)
df.head()

Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0


In [30]:
df['label'].value_counts()

label
0    2000
1    2000
Name: count, dtype: int64

In [31]:
data = df['text'].tolist()
data[:5]

['I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between, e

In [32]:
s = pd.Series(data)
s[:5]

0    I rented I AM CURIOUS-YELLOW from my video sto...
1    "I Am Curious: Yellow" is a risible and preten...
2    If only to avoid making this type of film in t...
3    This film was probably inspired by Godard's Ma...
4    Oh, brother...after hearing about this ridicul...
dtype: object

In [33]:
stop_words = set(stopwords.words('english'))
def preprocess(text):
    text = text.lower()
    text = ''.join([word for word in text if word not in string.punctuation])
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

In [34]:
s_upd = s.apply(preprocess)
s_upd[:5]

0    rented curiousyellow video store controversy s...
1    curious yellow risible pretentious steaming pi...
2    avoid making type film future film interesting...
3    film probably inspired godards masculin fémini...
4    oh brotherafter hearing ridiculous film umptee...
dtype: object

In [35]:
mylist = s_upd.tolist()
mylist[2]

'avoid making type film future film interesting experiment tells cogent storybr br one might feel virtuous sitting thru touches many important issues without discernable motive viewer comes away new perspectives unless one comes one ones mind wanders invariably pointless filmbr br one might better spend ones time staring window tree growingbr br'

In [36]:
data_0 = [[el] for el in mylist]
data_0[:5]

[['rented curiousyellow video store controversy surrounded first released 1967 also heard first seized us customs ever tried enter country therefore fan films considered controversial really see myselfbr br plot centered around young swedish drama student named lena wants learn everything life particular wants focus attentions making sort documentary average swede thought certain political issues vietnam war race issues united states asking politicians ordinary denizens stockholm opinions politics sex drama teacher classmates married menbr br kills curiousyellow 40 years ago considered pornographic really sex nudity scenes far even shot like cheaply made porno countrymen mind find shocking reality sex nudity major staple swedish cinema even ingmar bergman arguably answer good old boy john ford sex scenes filmsbr br commend filmmakers fact sex shown film shown artistic purposes rather shock people make money shown pornographic theaters america curiousyellow good film anyone wanting stud

In [37]:
newlist = []
for i in mylist:
    i = i.split()
    newlist.append(i)

newlist[:5]

[['rented',
  'curiousyellow',
  'video',
  'store',
  'controversy',
  'surrounded',
  'first',
  'released',
  '1967',
  'also',
  'heard',
  'first',
  'seized',
  'us',
  'customs',
  'ever',
  'tried',
  'enter',
  'country',
  'therefore',
  'fan',
  'films',
  'considered',
  'controversial',
  'really',
  'see',
  'myselfbr',
  'br',
  'plot',
  'centered',
  'around',
  'young',
  'swedish',
  'drama',
  'student',
  'named',
  'lena',
  'wants',
  'learn',
  'everything',
  'life',
  'particular',
  'wants',
  'focus',
  'attentions',
  'making',
  'sort',
  'documentary',
  'average',
  'swede',
  'thought',
  'certain',
  'political',
  'issues',
  'vietnam',
  'war',
  'race',
  'issues',
  'united',
  'states',
  'asking',
  'politicians',
  'ordinary',
  'denizens',
  'stockholm',
  'opinions',
  'politics',
  'sex',
  'drama',
  'teacher',
  'classmates',
  'married',
  'menbr',
  'br',
  'kills',
  'curiousyellow',
  '40',
  'years',
  'ago',
  'considered',
  'pornogr

In [38]:
add_lst = []
for i in newlist:
    add_lst.append(random.choice(i))

add_lst[:5]

['shock', 'insides', 'motive', 'subject', 'lots']

In [39]:
c = 0
while c < len(add_lst):
    for i in data_0:
        i.append(add_lst[c])
        c += 1
     
data_0[:5]

[['rented curiousyellow video store controversy surrounded first released 1967 also heard first seized us customs ever tried enter country therefore fan films considered controversial really see myselfbr br plot centered around young swedish drama student named lena wants learn everything life particular wants focus attentions making sort documentary average swede thought certain political issues vietnam war race issues united states asking politicians ordinary denizens stockholm opinions politics sex drama teacher classmates married menbr br kills curiousyellow 40 years ago considered pornographic really sex nudity scenes far even shot like cheaply made porno countrymen mind find shocking reality sex nudity major staple swedish cinema even ingmar bergman arguably answer good old boy john ford sex scenes filmsbr br commend filmmakers fact sex shown film shown artistic purposes rather shock people make money shown pornographic theaters america curiousyellow good film anyone wanting stud

In [40]:
res_lst = []
for l in data_0:
    res_lst.append(tuple(l))

res_lst[:5]

[('rented curiousyellow video store controversy surrounded first released 1967 also heard first seized us customs ever tried enter country therefore fan films considered controversial really see myselfbr br plot centered around young swedish drama student named lena wants learn everything life particular wants focus attentions making sort documentary average swede thought certain political issues vietnam war race issues united states asking politicians ordinary denizens stockholm opinions politics sex drama teacher classmates married menbr br kills curiousyellow 40 years ago considered pornographic really sex nudity scenes far even shot like cheaply made porno countrymen mind find shocking reality sex nudity major staple swedish cinema even ingmar bergman arguably answer good old boy john ford sex scenes filmsbr br commend filmmakers fact sex shown film shown artistic purposes rather shock people make money shown pornographic theaters america curiousyellow good film anyone wanting stud

In [41]:
vocab = set()
for context, target in res_lst:
    vocab.update(context.split())
    vocab.update([target])
word_to_ix = {word: i for i, word in enumerate(vocab)}

items = list(word_to_ix.items())
items[:5]

[('excerpts', 0),
 ('rajpals', 1),
 ('imported', 2),
 ('charming', 3),
 ('1998', 4)]

In [42]:
training_data = [(context.split(), target) for context, target in res_lst]
training_data[:3]

[(['rented',
   'curiousyellow',
   'video',
   'store',
   'controversy',
   'surrounded',
   'first',
   'released',
   '1967',
   'also',
   'heard',
   'first',
   'seized',
   'us',
   'customs',
   'ever',
   'tried',
   'enter',
   'country',
   'therefore',
   'fan',
   'films',
   'considered',
   'controversial',
   'really',
   'see',
   'myselfbr',
   'br',
   'plot',
   'centered',
   'around',
   'young',
   'swedish',
   'drama',
   'student',
   'named',
   'lena',
   'wants',
   'learn',
   'everything',
   'life',
   'particular',
   'wants',
   'focus',
   'attentions',
   'making',
   'sort',
   'documentary',
   'average',
   'swede',
   'thought',
   'certain',
   'political',
   'issues',
   'vietnam',
   'war',
   'race',
   'issues',
   'united',
   'states',
   'asking',
   'politicians',
   'ordinary',
   'denizens',
   'stockholm',
   'opinions',
   'politics',
   'sex',
   'drama',
   'teacher',
   'classmates',
   'married',
   'menbr',
   'br',
   'kills'

In [43]:
vocab_size = len(vocab)
embedding_dim = 7

In [44]:
class CBOW(nn.Module):
    def __init__(self, vocab_size, embed_size):
        super(CBOW, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size) 
        self.context = nn.Linear(embed_size, vocab_size) 
        
    def forward(self, inputs):                   
        embeds = self.embedding(inputs)          
        embeds_mean = torch.mean(embeds, dim=0)  
        out = self.context(embeds_mean)          
        log_probs = F.log_softmax(out, dim=0)    
        return log_probs

In [45]:
model = CBOW(len(vocab), embedding_dim)
word_emb = model.embedding(torch.tensor(word_to_ix["controversial"], dtype=torch.long))

print(word_emb.shape)

print(word_emb)
print(word_emb.mean())

x_1d = word_emb.unsqueeze(0).unsqueeze(0)
print(x_1d.shape)

cnn1d_1 = nn.Conv1d(in_channels=1, out_channels=1, kernel_size=2, bias=False)
print(cnn1d_1.weight)
print(cnn1d_1.bias)

y1 = cnn1d_1(x_1d)
print(y1)

torch.Size([7])
tensor([-0.4834, -2.0695, -0.2375,  0.5168,  0.0485,  0.4686, -0.2625],
       grad_fn=<EmbeddingBackward0>)
tensor(-0.2884, grad_fn=<MeanBackward0>)
torch.Size([1, 1, 7])
Parameter containing:
tensor([[[0.6491, 0.4304]]], requires_grad=True)
None
tensor([[[-1.2045, -1.4454,  0.0683,  0.3563,  0.2331,  0.1912]]],
       grad_fn=<ConvolutionBackward0>)


In [46]:
LEARNING_RATE = 0.09
EPOCHS = 5

torch.manual_seed(42)
model = CBOW(len(vocab), embedding_dim)

loss_function = nn.NLLLoss() 
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

for epoch in range(EPOCHS):
    total_loss = 0
    for context, target in training_data:
        context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long)
        target_idx = torch.tensor([word_to_ix[target]], dtype=torch.long)
        
        model.zero_grad()
        
        log_probs = model(context_idxs)
        
        loss = loss_function(log_probs.view(1, -1), target_idx)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    print(f'Epoch {epoch+1}, Loss: {total_loss}')

Epoch 1, Loss: 48917.68363058567
Epoch 2, Loss: 45231.88095923513
Epoch 3, Loss: 39526.51879732163
Epoch 4, Loss: 39979.52022563676
Epoch 5, Loss: 39585.4848060075


In [47]:
new_data = pd.DataFrame(s_upd, columns=['text'])
new_data[:5]

Unnamed: 0,text
0,rented curiousyellow video store controversy s...
1,curious yellow risible pretentious steaming pi...
2,avoid making type film future film interesting...
3,film probably inspired godards masculin fémini...
4,oh brotherafter hearing ridiculous film umptee...


In [48]:
new_data['label'] = df['label']
new_data[:5]

Unnamed: 0,text,label
0,rented curiousyellow video store controversy s...,0
1,curious yellow risible pretentious steaming pi...,0
2,avoid making type film future film interesting...,0
3,film probably inspired godards masculin fémini...,0
4,oh brotherafter hearing ridiculous film umptee...,0


In [49]:
len(new_data)

4000

In [50]:
train = new_data.text
test = new_data.label
print(train.shape)
print(test.shape)

(4000,)
(4000,)


In [51]:
X_train, X_test, y_train, y_test = train_test_split(train, test, test_size=0.2, random_state=42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(3200,)
(800,)
(3200,)
(800,)


In [52]:
new_vect = CountVectorizer()

In [53]:
new_vect.fit(X_train)
X_train_dtm_upd = new_vect.transform(X_train)

In [54]:
X_train_dtm_upd = new_vect.fit_transform(X_train)

In [55]:
X_train_dtm_upd

<3200x37705 sparse matrix of type '<class 'numpy.int64'>'
	with 323525 stored elements in Compressed Sparse Row format>

In [56]:
X_test_dtm_upd = new_vect.transform(X_test)
X_test_dtm_upd

<800x37705 sparse matrix of type '<class 'numpy.int64'>'
	with 76098 stored elements in Compressed Sparse Row format>

In [57]:
new_nb = MultinomialNB()

In [58]:
%time new_nb.fit(X_train_dtm_upd, y_train)

CPU times: total: 0 ns
Wall time: 4 ms


In [59]:
y_pred_class_new = new_nb.predict(X_test_dtm_upd)

In [60]:
metrics.accuracy_score(y_test, y_pred_class_new)

0.88125

In [61]:
metrics.confusion_matrix(y_test, y_pred_class_new)

array([[379,  43],
       [ 52, 326]], dtype=int64)

In [62]:
y_pred_prob_new = new_nb.predict_proba(X_test_dtm_upd)[:, 1]
y_pred_prob_new[:10]

array([1.28659440e-09, 1.00000000e+00, 5.81676743e-10, 1.24655543e-02,
       1.00000000e+00, 7.12552883e-03, 4.44752989e-05, 1.65171215e-02,
       1.30677982e-03, 2.19626591e-08])

In [63]:
metrics.roc_auc_score(y_test, y_pred_prob_new)

0.9384042980014544