In [1]:
import pandas as pd

# Load the training split. `lines=True` tells pandas the file is JSON Lines,
# i.e. one JSON record per line rather than a single JSON document.
df = pd.read_json(path_or_buf='sst2/train.jsonl', lines=True)

# Show the loaded frame (columns: text, label, label_text).
df

Unnamed: 0,text,label,label_text
0,"a stirring , funny and finally transporting re...",1,positive
1,apparently reassembled from the cutting-room f...,0,negative
2,they presume their audience wo n't sit still f...,0,negative
3,this is a visually stunning rumination on love...,1,positive
4,jonathan parker 's bartleby should have been t...,1,positive
...,...,...,...
6915,"painful , horrifying and oppressively tragic ,...",1,positive
6916,take care is nicely performed by a quintet of ...,0,negative
6917,"the script covers huge , heavy topics in a bla...",0,negative
6918,a seriously bad film with seriously warped log...,0,negative


# Preprocessing

In [2]:
import re
# Keep only runs of word characters (letters, digits, underscore) and re-join
# them with single spaces. Punctuation is dropped and also acts as a separator,
# so "n't" becomes "n t" and "cutting-room" becomes "cutting room".
# The pattern is a raw string so that `\w` reaches the regex engine untouched
# instead of being parsed as an (invalid) string escape, which newer Python
# versions warn about.
df.loc[:, "text"] = df.text.apply(lambda x: " ".join(re.findall(r'\w+', x)))
df

Unnamed: 0,text,label,label_text
0,a stirring funny and finally transporting re i...,1,positive
1,apparently reassembled from the cutting room f...,0,negative
2,they presume their audience wo n t sit still f...,0,negative
3,this is a visually stunning rumination on love...,1,positive
4,jonathan parker s bartleby should have been th...,1,positive
...,...,...,...
6915,painful horrifying and oppressively tragic thi...,1,positive
6916,take care is nicely performed by a quintet of ...,0,negative
6917,the script covers huge heavy topics in a bland...,0,negative
6918,a seriously bad film with seriously warped log...,0,negative


In [3]:
from stop_words import get_stop_words
# English stop words. Stored as a set because the collection is only used for
# membership tests, which are constant-time on a set.
stop_words = set(get_stop_words('en'))

def remove_stopWords(s, stopwords=None):
    """Return ``s`` with every stop word removed.

    ``s`` is split on whitespace, tokens found in the stop-word collection are
    dropped, and the surviving tokens are re-joined with single spaces (so any
    run of whitespace in the input collapses to one space).

    Parameters
    ----------
    s : str
        Text to filter. Matching is exact and case-sensitive.
    stopwords : collection of str, optional
        Words to drop. When omitted, the notebook-level ``stop_words`` is used,
        which is the behaviour existing one-argument callers rely on.

    Returns
    -------
    str
        The filtered text; an empty string if nothing survives.
    """
    if stopwords is None:
        # Fall back to the module-level collection built from get_stop_words.
        stopwords = stop_words
    return ' '.join(word for word in s.split() if word not in stopwords)

# Strip stop words from every review in the frame.
df.loc[:, "text"] = df["text"].apply(remove_stopWords)

# From here on `df` is rebound to a bare array of the cleaned strings.
# NOTE(review): the label columns are no longer reachable through `df`, and
# re-running this cell without reloading the data fails because `df` is no
# longer a DataFrame.
df = df["text"].values
df

array(['stirring funny finally transporting re imagining beauty beast 1930s horror films',
       'apparently reassembled cutting room floor given daytime soap',
       'presume audience wo n t sit still sociology lesson however entertainingly presented trot conventional science fiction elements bug eyed monsters futuristic women skimpy clothes',
       ...,
       'script covers huge heavy topics bland surfacey way n t offer insight instance good things happen bad people',
       'seriously bad film seriously warped logic writer director kurt wimmer screenplay level',
       'deliciously nonsensical comedy city coming apart seams'],
      dtype=object)

# Bag of Words

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag of Words (BoW): count how often each term occurs in each review.
# ngram_range=(1, 2) uses both unigrams and bigrams as terms; max_features
# caps the vocabulary at the 10000 most frequent terms across the corpus.
vectorizer = CountVectorizer(ngram_range=(1, 2), max_features=10000)

# `df` is the array of cleaned review strings at this point.
X_bow = vectorizer.fit_transform(df)

# Printing the sparse result lists its stored entries as "(row, column) count".
print(X_bow)

  (0, 8444)	1
  (0, 3591)	1
  (0, 3323)	1
  (0, 9128)	1
  (0, 7100)	1
  (0, 741)	1
  (0, 726)	1
  (0, 4196)	1
  (0, 3313)	1
  (0, 742)	1
  (0, 4198)	1
  (1, 449)	1
  (1, 1948)	1
  (1, 7478)	1
  (1, 3425)	1
  (1, 3706)	1
  (1, 2018)	1
  (1, 8146)	1
  (1, 1949)	1
  (1, 7479)	1
  (1, 2019)	1
  (2, 578)	1
  (2, 9832)	1
  (2, 8033)	1
  (2, 8428)	1
  :	:
  (6917, 4225)	1
  (6917, 6178)	1
  (6917, 1820)	1
  (6917, 3972)	1
  (6917, 9068)	1
  (6917, 8935)	1
  (6917, 4481)	1
  (6918, 3194)	1
  (6918, 643)	1
  (6918, 7681)	1
  (6918, 2281)	1
  (6918, 9928)	1
  (6918, 5016)	1
  (6918, 5166)	1
  (6918, 9930)	1
  (6918, 650)	1
  (6918, 7833)	2
  (6918, 4849)	1
  (6919, 1566)	1
  (6919, 444)	1
  (6919, 1426)	1
  (6919, 2102)	1
  (6919, 1599)	1
  (6919, 6082)	1
  (6919, 7706)	1


# TF*IDF

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF: same unigram + bigram vocabulary settings as the BoW cell, but each
# count is re-weighted by inverse document frequency.
# NOTE(review): this rebinds `vectorizer`, so the CountVectorizer fitted in the
# BoW cell is no longer reachable under that name.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=10000)

X_tfidf = vectorizer.fit_transform(df)

# Printing the sparse result lists its stored entries as "(row, column) weight".
print(X_tfidf)

  (0, 4198)	0.3739873028995298
  (0, 742)	0.36168251176103683
  (0, 3313)	0.22111024766658646
  (0, 4196)	0.2653898400016651
  (0, 726)	0.3377465211389492
  (0, 741)	0.2907564718841119
  (0, 7100)	0.22315715431006108
  (0, 9128)	0.3739873028995298
  (0, 3323)	0.28504504230953437
  (0, 3591)	0.19769132133306638
  (0, 8444)	0.31841412931499774
  (1, 2019)	0.3532158125064839
  (1, 7479)	0.3532158125064839
  (1, 1949)	0.3532158125064839
  (1, 8146)	0.2675643916435112
  (1, 2018)	0.33258019823412566
  (1, 3706)	0.2692134610980501
  (1, 3425)	0.3252150287036727
  (1, 7478)	0.27657863062850313
  (1, 1948)	0.3135936534163063
  (1, 449)	0.3007292079906677
  (2, 1062)	0.25647431286514016
  (2, 7649)	0.22424931351169652
  (2, 8034)	0.25647431286514016
  (2, 1490)	0.23614260196423162
  :	:
  (6917, 9635)	0.16783351379079345
  (6917, 3795)	0.29925545064847825
  (6917, 8933)	0.20004556745485566
  (6917, 869)	0.2330130664738384
  (6917, 4050)	0.2373519171392366
  (6917, 643)	0.1726826643402271
  (691

# Word2Vec

In [6]:
import gensim
VECTOR_SIZE = 100  # dimensionality of each word embedding

# Tokenise: one list of whitespace-separated unigrams per review.
lst_corpus = [sentence.split() for sentence in df]

# Fit Word2Vec (sg=1 selects skip-gram; any other value selects CBOW).
# min_count=1 keeps every token, including words that occur only once.
# workers=1: gensim documents that `seed` alone does not make a run
# reproducible -- training must also be limited to a single worker thread
# (and, across interpreter launches, PYTHONHASHSEED must be fixed as well).
model = gensim.models.word2vec.Word2Vec(
    lst_corpus,
    vector_size=VECTOR_SIZE,
    window=8,
    min_count=1,
    sg=1,
    seed=0,
    workers=1,
)

In [7]:
# Look up the learned embedding for the token 'funny'
# (a vector with VECTOR_SIZE components).
model.wv['funny']

array([ 0.02586458, -0.29192907, -0.07210118, -0.11194023,  0.31305358,
        0.06732993, -0.07566354, -0.2980053 ,  0.12715743,  0.112382  ,
        0.31278294,  0.2199297 ,  0.2016833 ,  0.35361734,  0.14256376,
       -0.18416218,  0.06981486, -0.13474175, -0.22682424,  0.11490394,
       -0.00442855,  0.20733055, -0.00165407, -0.15122357,  0.28087065,
        0.05662809,  0.06297065,  0.05603285, -0.12615006,  0.02428699,
       -0.00246318,  0.10985161,  0.329809  , -0.03493271, -0.28489682,
        0.00523466,  0.28881174, -0.05304515,  0.04194215,  0.23493198,
       -0.11176803, -0.32619745, -0.22386515, -0.03856778, -0.1076696 ,
       -0.34941316, -0.27522933, -0.20385996, -0.11895534,  0.0596831 ,
       -0.12157824,  0.02582511, -0.05987333,  0.47122777,  0.02611311,
       -0.27798212,  0.00471418,  0.05203452, -0.03038506,  0.08884189,
        0.06443454,  0.3796268 ,  0.1458289 , -0.15145631,  0.3785462 ,
       -0.02909073, -0.16436514,  0.06646606,  0.11534645,  0.03

In [8]:
# Ten vocabulary words whose embeddings are closest to 'funny'.
# NOTE(review): every similarity in the captured output is ~0.99, including
# for unrelated words such as 'audience' and 'anyone'; this looks like the
# embeddings have barely separated on this small corpus -- confirm before
# relying on these neighbours.
model.wv.most_similar('funny', topn=10)

[('original', 0.9934704899787903),
 ('certainly', 0.993450939655304),
 ('fun', 0.9932791590690613),
 ('interesting', 0.9930195808410645),
 ('anything', 0.9928457736968994),
 ('understand', 0.9927155375480652),
 ('audience', 0.9921032190322876),
 ('worth', 0.9917160272598267),
 ('anyone', 0.991519570350647),
 ('done', 0.9911305904388428)]