# NLP: Text Classification Using spaCy Word Embeddings

# End-to-end project: classifying fake and real news

In [16]:
import pandas as pd


In [17]:
# Load the news dataset. The path is relative, so the CSV has to sit in the
# notebook's working directory.
df = pd.read_csv("fake_or_real_news.csv")


In [18]:
print(df.shape)

(6335, 4)


In [19]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [20]:
print(df['label'].unique())

['FAKE' 'REAL']


In [21]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [22]:
# Keep only the article body and its label.
# The explicit .copy() makes the result an independent DataFrame, so the
# columns added or overwritten in later cells do not raise pandas'
# SettingWithCopyWarning (or write into a temporary derived from the original).
df = df[['text', 'label']].copy()

In [23]:
df.head()

Unnamed: 0,text,label
0,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,It's primary day in New York and front-runners...,REAL


In [24]:
print(df.shape)

(6335, 2)


In [25]:
# Check for class imbalance:
# count how many rows carry each label value.
df['label'].value_counts()

label
REAL    3171
FAKE    3164
Name: count, dtype: int64

In [26]:
# Normalise the label casing so the mapping below does not depend on
# whether the raw data says 'FAKE' or 'fake'.
df['label'] = df['label'].str.lower()

# Encode the two classes as integers: fake -> 0, real -> 1.
label_to_num = {'fake': 0, 'real': 1}
df['label_num'] = df['label'].map(label_to_num)

# Series.map yields NaN for any value that is missing from the dictionary,
# which would silently turn the target into floats with gaps. Fail loudly
# instead of training on a corrupted target column.
unmapped = df['label_num'].isna()
assert not unmapped.any(), (
    f"Unexpected label values: {df.loc[unmapped, 'label'].unique().tolist()}"
)

# Verify the result
print(df.head())

                                                text label  label_num
0  Daniel Greenfield, a Shillman Journalism Fello...  fake          0
1  Google Pinterest Digg Linkedin Reddit Stumbleu...  fake          0
2  U.S. Secretary of State John F. Kerry said Mon...  real          1
3  — Kaydee King (@KaydeeKing) November 9, 2016 T...  fake          0
4  It's primary day in New York and front-runners...  real          1


In [27]:
# import spacy
# Run once in a terminal to download the model (the model name takes no trailing slash):
#   python -m spacy download en_core_web_lg
# nlp = spacy.load("en_core_web_lg")

In [28]:
import spacy

In [29]:
# Load the en_core_web_lg spaCy pipeline; the model package has to be
# installed beforehand, otherwise spacy.load cannot find it.
nlp = spacy.load("en_core_web_lg")

In [31]:
# Turn every article into a single document vector.
# For a static-vector model, Doc.vector is the average of the token vectors,
# so it depends only on tokenization. Temporarily disabling every pipeline
# component (tagger, parser, NER, ...) yields the same vectors while skipping
# the expensive per-document processing; the components are restored when the
# `with` block exits.
with nlp.select_pipes(disable=nlp.pipe_names):
    df['vector'] = df['text'].apply(lambda text: nlp(text).vector)

In [32]:
len(df)

6335

In [33]:
df.head()

Unnamed: 0,text,label,label_num,vector
0,"Daniel Greenfield, a Shillman Journalism Fello...",fake,0,"[-1.3751823, 1.3421849, -2.3666484, 0.12908486..."
1,Google Pinterest Digg Linkedin Reddit Stumbleu...,fake,0,"[-1.7449774, 0.93961924, -2.024867, 0.42536643..."
2,U.S. Secretary of State John F. Kerry said Mon...,real,1,"[-1.9426425, 1.0062195, -1.9992222, 0.20469022..."
3,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",fake,0,"[-1.9125352, -0.1481846, -1.1432766, 0.6861217..."
4,It's primary day in New York and front-runners...,real,1,"[-1.8516092, 1.3163909, -2.1726575, 1.2286776,..."


In [34]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['vector'].values,
    df['label_num'],
    test_size=0.3,
    random_state=42,
)

In [35]:
X_train.shape

(4434,)

In [37]:
X_test.shape

(1901,)

In [40]:
import numpy as np

# Each split is a 1-D array whose elements are the per-document vectors;
# stacking along axis 0 turns it into a regular 2-D matrix
# (one row per document) that scikit-learn estimators accept.
X_train_stack, X_test_stack = (
    np.stack(split, axis=0) for split in (X_train, X_test)
)



In [42]:
X_test_stack.shape

(1901, 300)

In [44]:
# from sklearn.naive_bayes import MultinomialNB

# model = MultinomialNB()

# model.fit(X_train,y_train)

## MultinomialNB raises an error here because the embedding vectors contain negative values, so the features must first be scaled to a non-negative range (e.g. with MinMaxScaler).