In [1]:
# imports
import pandas as pd
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec

In [2]:
# load the labelled news articles into a DataFrame and preview the first rows
# (absolute path; adjust DATA_PATH when running in a different environment)
DATA_PATH = '/content/fake_real_news_dataset.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,text,label
0,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,It's primary day in New York and front-runners...,REAL


In [3]:
# summary of the dataset: number of rows, column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2422 entries, 0 to 2421
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    2422 non-null   object
 1   label   2422 non-null   object
dtypes: object(2)
memory usage: 38.0+ KB


In [4]:
# inspect the full text of the first article (row label 0) with one explicit lookup
df.loc[0, 'text']



In [5]:
# preprocessing text: lower-casing and tokenization with gensim's simple_preprocess
# NOTE: this replaces the raw strings in df['text'] with token lists, so re-running
# this cell without reloading the CSV would pass lists (not strings) to simple_preprocess
tokenized = df['text'].map(simple_preprocess)
df['text'] = tokenized
df.head()

Unnamed: 0,text,label
0,"[daniel, greenfield, shillman, journalism, fel...",FAKE
1,"[google, pinterest, digg, linkedin, reddit, st...",FAKE
2,"[secretary, of, state, john, kerry, said, mond...",REAL
3,"[kaydee, king, kaydeeking, november, the, less...",FAKE
4,"[it, primary, day, in, new, york, and, front, ...",REAL


In [6]:
# model initialization; CBOW (sg=0), 50-dimensional vectors, context window of 10 words
# workers=1: a seed alone does not make training repeatable, because multi-threaded
# training consumes batches in a scheduler-dependent order. A single worker thread
# removes that jitter at the cost of slower training. Exact repeatability across
# interpreter launches additionally requires a fixed PYTHONHASHSEED.
model = Word2Vec(window=10, sg=0, vector_size=50, seed=42, workers=1)

In [7]:
# build the model's vocabulary from the tokenized articles
tokenized_docs = df['text']
model.build_vocab(corpus_iterable=tokenized_docs)

In [8]:
# train for 10 passes over the tokenized articles; the value returned by train()
# is left as the last expression so it is shown as the cell output
tokenized_docs = df['text']
model.train(
    corpus_iterable=tokenized_docs,
    total_examples=model.corpus_count,
    epochs=10,
)

(13478191, 17553900)

#### Another way to build the vocabulary and train the model (in a single call):
```python
model = Word2Vec(sentences=df['text'], epochs=10, window=10, sg=0, vector_size=50, seed=42)
```

In [9]:
# similarity score between the learned vectors of two vocabulary words
word_a, word_b = 'snake', 'cat'
model.wv.similarity(word_a, word_b)

0.020924063

In [10]:
# cosine distance between the same two words (the complement of their similarity)
word_a, word_b = 'snake', 'cat'
model.wv.distance(word_a, word_b)

0.9790759366005659

In [11]:
# ask the model which word in the list fits least well with the others
candidates = ['snake', 'cat', 'dog']
model.wv.doesnt_match(candidates)

'snake'

In [12]:
# saving model
#model.save('MyW2V.model')

In [13]:
# loading model
#loaded_model = Word2Vec.load('MyW2V.model')

In [14]:
# rechecking similarity between words, this time on the reloaded model
#loaded_model.wv.similarity('snake', 'cat')