# Preprocessing Data: DataFrame with Pandas

The data is split into 2 CSV files, true and fake news. We will first view the two files seperately and then merge in order to split into train and test datasets. 

In [None]:
from google.colab import files
uploaded = files.upload()

In [3]:
# Import dependencies
import io
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
#from sklearn.feature_extraction.text import CountVectorizer 
#from sklearn.preprocessing import StandardScaler
import pandas as pd
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix


In [2]:
# Spark dependencies 
import os
spark_version = 'spark-3.0.3'

In [4]:
#  Import and read the Fake.csv.
fake_news_df = pd.read_csv(io.BytesIO(uploaded['Fake.csv']))
fake_news_df.head()

NameError: ignored

In [4]:
fake_news_df.nunique()

title      17903
text       17455
subject        6
date        1681
dtype: int64

In [5]:
fake_news_df['subject'].value_counts()

News               9050
politics           6841
left-news          4459
Government News    1570
US_News             783
Middle-east         778
Name: subject, dtype: int64

In [10]:
#  Import and read the True.csv.
true_news_df = pd.read_csv(io.BytesIO(uploaded['True.csv']))
true_news_df.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [11]:
true_news_df.nunique()

title      20826
text       21192
subject        2
date         716
dtype: int64

In [12]:
true_news_df['subject'].value_counts()

politicsNews    11272
worldnews       10145
Name: subject, dtype: int64

In [13]:
# Label fake and real data 0 and 1 
true_news_df['label']=1
fake_news_df['label']=0

In [14]:
# Merge dfs
frames = [true_news_df, fake_news_df]
real_or_fake_df = pd.concat(frames)

# Drop subject columns as will skew the data 
real_or_fake_df=real_or_fake_df.drop(columns=['subject'],axis=1)
# Drop date column as data not needed 
real_or_fake_df=real_or_fake_df.drop(columns=['date'],axis=1)

Unnamed: 0,title,text,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,"December 29, 2017",1


In [15]:
real_or_fake_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44898 entries, 0 to 23480
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   44898 non-null  object
 1   text    44898 non-null  object
 2   date    44898 non-null  object
 3   label   44898 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 1.7+ MB


In [None]:
real_or_fake_df.duplicated().sum()

In [None]:
real_or_fake_df.isnull().sum()

In [None]:
real_or_fake_df.drop_duplicates()

In [None]:
real_or_fake_df.head()

In [None]:
# ANY VISUALISATIONS?
# plot time period
# plot subjects 
# word clouds?
# pie chart fake and true 

# Preprocessing Data: NLP with tokenize

In [None]:
# tokenize titles into words 
title_tokens = word_tokenize(real_or_fake_df['title'])
lower_title_tokens = [t.lower() for t in title_tokens]

# Frequency list of words in titles
freq_words = [len(title_tokens) for t in title_tokens]

In [None]:
# plot hist of sentence lengths for titles 
plt.hist(freq_words)
plt.show()

In [None]:
# alpha only and remove stop words 
alpha_only = [t for t in lower_title_tokens if t.isalpha()]
# no_stop_words = [t for t in alpha_only if t not in english_stops]

In [None]:
# Bag of words to see most common tokens (SHAN ONLY USE THIS IF DF DOESN'T WORK)
# instantiate WordNetLennatizer
wordnet_lemmatizer = WordNetLemmatizer()

# lemmatize into new list 
lemmatized = [wordnet_lemmatizer.lemmatize(t) for t in alpha_only]
bow = Counter(lemmatized)
print(bow.most_common(10))

# Train & Test Model

In [23]:
y = real_or_fake_df['label']
X = real_or_fake_df['title']
# shan change this to tokenized words 

In [24]:
# split into train and test 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# count vector - how many times does each word appear in titles?
# shan, go back and change count vc to TFIDF as it counts but also signifiers a higher importance of words 
count_vectorizer = CountVectorizer(stop_words='english')
count_train = count_vectorizer.fit_transform(X_train.values)
count_test = count_vectorizer.transform(X_test.values)

In [28]:
# Initalize TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.fit_transform(X_test)

In [29]:
# Convert into df
# order by top weighthed words 
tfidf_df = pd.DataFrame(tfidf_train.A, columns=tfidf_vectorizer.get_feature_names())
tfidf_df.head()

ValueError: ignored

In [None]:
# Classification Model: Naive Bayes Classifier
# Multinomial Naive Bayes classifier has been used as the data is discrete 
nb_classifier = MultinomialNB()

nb_classifier.fit(tfidf_train, y_train)

pred = nb_classifier.predict(tfidf_test)

score = metrics.accuracy_score(y_test, pred)
print(score)

In [None]:
# Confusion Matrix
confusion_matrix = metrics.confusion_matrix(y_test, pred, labels=['fake', 'real'])
print(confusion_matrix)

In [None]:
# sequential model or another model to compare?

# Model Optimisation