# NLP Project Fake News
## Data Preprocessing


In [1]:
import numpy as np
import pandas as pd
import re
import string
import math
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# We Will use 2 Column
df = pd.read_csv('news.csv', usecols=['text','label']) 

# Drop rows with any empty cells
df.dropna(
    axis=0,
    how='any',
    thresh=None,
    subset=None,
    inplace=True
)

# Encoding Label Column
df['class'] = np.where(df['label']=='FAKE',0,1)

#Removing Duplicate
df.drop_duplicates(inplace=True)

#Test Data
print(df['text'].head())

0    Daniel Greenfield, a Shillman Journalism Fello...
1    Google Pinterest Digg Linkedin Reddit Stumbleu...
2    U.S. Secretary of State John F. Kerry said Mon...
3    — Kaydee King (@KaydeeKing) November 9, 2016 T...
4    It's primary day in New York and front-runners...
Name: text, dtype: object


In [2]:
# Tokenization --> Expand Contractions
contractions = { 
"ain't": "am not / are not / is not / has not / have not",
"aren't": "are not / am not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he had / he would",
"he'd've": "he would have",
"he'll": "he shall / he will",
"he'll've": "he shall have / he will have",
"he's": "he has / he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how has / how is / how does",
"I'd": "I had / I would",
"I'd've": "I would have",
"I'll": "I shall / I will",
"I'll've": "I shall have / I will have",
"I'm": "I am",
"I've": "I have",
"isn't": "is not",
"it'd": "it had / it would",
"it'd've": "it would have",
"it'll": "it shall / it will",
"it'll've": "it shall have / it will have",
"it's": "it has / it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she had / she would",
"she'd've": "she would have",
"she'll": "she shall / she will",
"she'll've": "she shall have / she will have",
"she's": "she has / she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as / so is",
"that'd": "that would / that had",
"that'd've": "that would have",
"that's": "that has / that is",
"there'd": "there had / there would",
"there'd've": "there would have",
"there's": "there has / there is",
"they'd": "they had / they would",
"they'd've": "they would have",
"they'll": "they shall / they will",
"they'll've": "they shall have / they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we had / we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what shall / what will",
"what'll've": "what shall have / what will have",
"what're": "what are",
"what's": "what has / what is",
"what've": "what have",
"when's": "when has / when is",
"when've": "when have",
"where'd": "where did",
"where's": "where has / where is",
"where've": "where have",
"who'll": "who shall / who will",
"who'll've": "who shall have / who will have",
"who's": "who has / who is",
"who've": "who have",
"why's": "why has / why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you had / you would",
"you'd've": "you would have",
"you'll": "you shall / you will",
"you'll've": "you shall have / you will have",
"you're": "you are",
"you've": "you have"
} 

# Tokenization Regular expression To find contractions
contractions_RE = re.compile('(%s)' % '|'.join(contractions.keys()))

def ExpandContractions(text,contractions = contractions):
    def replace(match):
        return contractions[match.group(0)]
    return contractions_RE.sub(replace, text)

# Applay Expanding Contractions
df['text'] = df['text'].apply(lambda x:ExpandContractions(x))

In [3]:
# Remove punctuation
df['text'] = df['text'].apply(lambda x: re.sub('[%s]' % re.escape(string.punctuation), '' , x))

In [4]:
# Lower Case
df['text'] = df['text'].str.lower()

In [5]:
# Remove Numbers --> remove words containing digits
df['text'] = df['text'].apply(lambda x: re.sub('W*dw*','',x))

# Replace Any thing not Digite or char with spaces
df['text'] = (df['text'].str.replace('[^A-Za-z0-9\s]','', regex=True).str.replace('\n','', regex=True).str.replace('\s+',' ', regex=True)) 

In [6]:
# remove stopwords
stop_words = set(stopwords.words('english'))
stop_words.add('subject')

def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in stop_words])

df['text'] = df['text'].apply(lambda x: remove_stopwords(x))

In [7]:
# Rephrase text ==> url
df['text'] = df['text'].apply(lambda x:re.sub('(http[s]?S+)|(w+.[A-Za-z]{2,4}S*)', 'urladd', x))

In [8]:
#stemming
stemmer = PorterStemmer()

def stem_words(text):
    return " ".join([stemmer.stem(word) for word in text.split()])

df["text"] = df["text"].apply(lambda x: stem_words(x))

In [9]:
#Lemmatization
lemmatizer = WordNetLemmatizer()

def lemmatize_words(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

df["text"] = df["text"].apply(lambda text: lemmatize_words(text))

In [10]:
# Removing Extra Spaces
df["text"] = df["text"].apply(lambda text: re.sub(' +', ' ', text))

print(df["text"].head())
print(df["text"][0])

0    aniel greenfiel shillman journal fellourladdom...
1    googl pinterest igg linkein reit stumbleupon p...
2    u secretari state john f kerri sai monay stop ...
3    kaye king kayeek novemb 9 2016 lesson tonight ...
4    primari ay neurladd frontrunn hillari clinton ...
Name: text, dtype: object
aniel greenfiel shillman journal fellourladdom center neurladd urladd focus raical islam final stretch elect hillari roham clinton gone war fbi wor unpreceent thrown aroun often elect ought retir still unpreceent nomine major polit parti go war fbi that exactli hillari peopl one coma patient urladd urladdng hour cnn hospit be urladd assum fbi irector jame comey hillari oppon elect fbi uner attack everyon obama cnn hillari peopl circul letter attack comey current meia hit piec lambast target trump urladd surpris clinton alli start run attack fbi fbi leaership urladd entir lefturladd establish form lynch mob continu go hillari fbi creibil attack meia emocrat preemptiv hea result investig clin

In [11]:
print(df["label"].head())
print(df["class"].head())

#Removing Label Column
df = df.drop('label', axis=1)
print(df)

#Saving New CSV File
df.to_csv('new_news', index=False)

0    FAKE
1    FAKE
2    REAL
3    FAKE
4    REAL
Name: label, dtype: object
0    0
1    0
2    1
3    0
4    1
Name: class, dtype: int32
                                                   text  class
0     aniel greenfiel shillman journal fellourladdom...      0
1     googl pinterest igg linkein reit stumbleupon p...      0
2     u secretari state john f kerri sai monay stop ...      1
3     kaye king kayeek novemb 9 2016 lesson tonight ...      0
4     primari ay neurladd frontrunn hillari clinton ...      1
...                                                 ...    ...
6330  state epart tol republican nation committe cou...      1
6331  p pb shoul stan plutocrat pentagon post oct 27...      0
6332  antitrump protest tool oligarchi reform alurla...      0
6333  ai ababa ethiopia presient obama conven meet l...      1
6334  jeb bush suenli attack trump here mattersjeb b...      1

[6060 rows x 2 columns]
