In [1]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/word2vec-nlp-tutorial/testData.tsv.zip
/kaggle/input/word2vec-nlp-tutorial/sampleSubmission.csv
/kaggle/input/word2vec-nlp-tutorial/unlabeledTrainData.tsv.zip
/kaggle/input/word2vec-nlp-tutorial/labeledTrainData.tsv.zip


In [2]:
%%time
import numpy as np, pandas as pd, re, nltk, tqdm
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression

CPU times: user 14.4 s, sys: 1.21 s, total: 15.6 s
Wall time: 17.4 s


In [3]:
data_train = pd.read_csv('/kaggle/input/word2vec-nlp-tutorial/labeledTrainData.tsv.zip', compression='zip', sep='\t')
data_test = pd.read_csv('/kaggle/input/word2vec-nlp-tutorial/testData.tsv.zip', compression='zip', sep='\t')
data_train.head(2)

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."


In [4]:
data_test.head(2)

Unnamed: 0,id,review
0,12311_10,Naturally in a film who's main themes are of m...
1,8348_2,This movie is a disaster within a disaster fil...


In [5]:
tqdm.tqdm.pandas()

def preprocessing(text):
    text = re.sub(r'<.*?>', '', text)                
    text = re.sub(r'[^a-zA-Z]', ' ', text)           
    text = text.lower()
    words = text.split()
    words = [w for w in words if w not in stopwords.words("english")]
    return words

data_train['tokens'] = data_train['review'].progress_apply(preprocessing)
data_train.head(2)

100%|██████████| 25000/25000 [07:23<00:00, 56.40it/s]


Unnamed: 0,id,sentiment,review,tokens
0,5814_8,1,With all this stuff going down at the moment w...,"[stuff, going, moment, mj, started, listening,..."
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...","[classic, war, worlds, timothy, hines, enterta..."


In [6]:
data_test['tokens'] = data_test['review'].progress_apply(preprocessing)
data_test.head()

100%|██████████| 25000/25000 [07:13<00:00, 57.68it/s]


Unnamed: 0,id,review,tokens
0,12311_10,Naturally in a film who's main themes are of m...,"[naturally, film, main, themes, mortality, nos..."
1,8348_2,This movie is a disaster within a disaster fil...,"[movie, disaster, within, disaster, film, full..."
2,5828_4,"All in all, this is a movie for kids. We saw i...","[movie, kids, saw, tonight, child, loved, one,..."
3,7186_2,Afraid of the Dark left me with the impression...,"[afraid, dark, left, impression, several, diff..."
4,12128_7,A very accurate depiction of small time mob li...,"[accurate, depiction, small, time, mob, life, ..."


In [7]:
tqdm.tqdm.pandas()

model_emb = Word2Vec(
    sentences=data_train['tokens'],
    vector_size=200,  
    window=5,
    min_count=2,
    workers=4
)

def sentence_embedding(words, model_emb):
    valid_words = [word for word in words if word in model_emb.wv]
    if not valid_words:
        return np.zeros(model_emb.vector_size)
    return np.mean([model_emb.wv[word] for word in valid_words], axis=0)

data_train['embedding'] = data_train['tokens'].progress_apply(lambda x: sentence_embedding(x, model_emb))
data_train.head()

100%|██████████| 25000/25000 [00:05<00:00, 4671.95it/s]


Unnamed: 0,id,sentiment,review,tokens,embedding
0,5814_8,1,With all this stuff going down at the moment w...,"[stuff, going, moment, mj, started, listening,...","[0.07843066, -0.09221629, -0.30745858, 0.10427..."
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...","[classic, war, worlds, timothy, hines, enterta...","[-0.018323556, 0.028792864, -0.24601065, 0.055..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,"[film, starts, manager, nicholas, bell, giving...","[0.0404079, -0.106676795, -0.10827111, 0.05616..."
3,3630_4,0,It must be assumed that those who praised this...,"[must, assumed, praised, film, greatest, filme...","[0.12994325, -0.12376767, -0.18501484, -0.0199..."
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,"[superbly, trashy, wondrously, unpretentious, ...","[0.059991714, -0.20014022, -0.2583028, -0.0200..."


In [8]:
data_test['embedding'] = data_test['tokens'].progress_apply(lambda x : sentence_embedding(x, model_emb))
data_test.head()

100%|██████████| 25000/25000 [00:05<00:00, 4649.75it/s]


Unnamed: 0,id,review,tokens,embedding
0,12311_10,Naturally in a film who's main themes are of m...,"[naturally, film, main, themes, mortality, nos...","[0.10522094, -0.056526225, -0.40711153, -0.056..."
1,8348_2,This movie is a disaster within a disaster fil...,"[movie, disaster, within, disaster, film, full...","[0.016602568, -0.0726361, -0.3707933, 0.039762..."
2,5828_4,"All in all, this is a movie for kids. We saw i...","[movie, kids, saw, tonight, child, loved, one,...","[0.023037326, -0.11910039, -0.50863296, 0.1035..."
3,7186_2,Afraid of the Dark left me with the impression...,"[afraid, dark, left, impression, several, diff...","[-0.08999643, -0.22795954, -0.46400604, 0.1682..."
4,12128_7,A very accurate depiction of small time mob li...,"[accurate, depiction, small, time, mob, life, ...","[0.07328355, -0.120194644, -0.40261817, -0.060..."


In [9]:
X_train = np.stack(data_train['embedding'].values)
y_train = data_train['sentiment'].values
X_test = np.stack(data_test['embedding'].values)
print('done')

done


In [10]:
reg = LogisticRegression(max_iter=1000)
reg.fit(X_train, y_train)

In [11]:
y_pred = reg.predict(X_test)

In [14]:
submission = pd.DataFrame({
    'id': data_test['id'].values, 
    'sentiment': y_pred  
})
submission.to_csv('submission.csv', index=False)