In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [None]:
# Data loading: read the fake-news and true-news CSV files from the data folder
path = "data"
fake, true = (pd.read_csv(path + csv_name) for csv_name in ("/Fake.csv", "/True.csv"))

# Report how many rows each source file contributed
print("Numbers of True news : ", len(true.index))
print("Numbers of Fake news : ", len(fake.index))

### Data cleaning and preparation
  
  * Concatenate True.csv and Fake.csv for data preparation
  * Drop unnecessary columns
  * Convert text to lowercase
  * Remove punctuation
  * Remove stopwords
  * Tokenize

In [None]:
# Add flag to track fake and true
true['target'] = 'true'
fake['target'] = 'fake'

# Concatenate dataframes
data = pd.concat([fake, true]).reset_index(drop = True)

# Shuffle the data. A fixed seed keeps the row order identical on every run,
# so results computed from `data` later can be reproduced after a kernel restart.
from sklearn.utils import shuffle
data = shuffle(data, random_state=42)
data = data.reset_index(drop=True)

# Drop the date (not used for detection) and the title (only the text is used).
# Reassigning instead of using inplace=True keeps the step a plain
# "new frame from old frame" transformation.
data = data.drop(columns=["date", "title"])

# Convert to lowercase. The vectorized .str accessor passes missing values
# through as NaN, whereas calling x.lower() on a NaN float raises AttributeError.
data['text'] = data['text'].str.lower()

# Remove punctuation

import string

def punctuation_removal(text):
    """Return ``text`` with every ASCII punctuation character removed.

    The characters removed are exactly those in ``string.punctuation``;
    letters, digits, whitespace and non-ASCII symbols are left untouched.

    Args:
        text: The string to clean.

    Returns:
        A new string containing the characters of ``text`` that are not
        in ``string.punctuation``, in their original order.
    """
    # A deletion table applied with str.translate strips the characters in one
    # pass, instead of testing each character against the punctuation string
    # in a Python-level loop.
    return text.translate(str.maketrans('', '', string.punctuation))

# Strip punctuation from every article
data['text'] = data['text'].apply(punctuation_removal)

# Removing stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')
# Membership is tested once per word of every article, so look words up in a
# set (constant time) rather than scanning the list; `stop` itself stays a list.
stop_set = set(stop)

data['text'] = data['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_set]))

# Tokenization
from nltk import tokenize

# Whitespace tokenizer instance; it is only created here, not applied in this cell
token_space = tokenize.WhitespaceTokenizer()

### Decision Tree



In [None]:
from sklearn.tree import DecisionTreeClassifier

# Pipeline: raw text -> token counts -> TF-IDF weights -> decision tree
# (entropy criterion, depth capped at 20, fixed seed).
pipe = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('model', DecisionTreeClassifier(criterion= 'entropy',
                                           max_depth = 20, 
                                           splitter='best', 
                                           random_state=42))])
# Fitting the model
# NOTE(review): X_train / y_train / X_test / y_test are not defined in the
# cells visible here - presumably a train/test split cell runs before this one.
model = pipe.fit(X_train, y_train)

# Accuracy: compute the percentage once and reuse it for the printout and for
# the results dict.
prediction = model.predict(X_test)
dt_accuracy = round(accuracy_score(y_test, prediction)*100,2)
print("accuracy: {}%".format(dt_accuracy))
# NOTE(review): `dct` must already exist as a dict-like object; confirm it is
# created in an earlier cell.
dct['Decision Tree'] = dt_accuracy

# NOTE(review): `metrics` and `plot_confusion_matrix` are expected from an
# earlier cell (presumably `from sklearn import metrics`) - confirm.
cm = metrics.confusion_matrix(y_test, prediction)
plot_confusion_matrix(cm, classes=['Fake', 'Real'])

import pickle 

# Save to file in the current working directory
# NOTE(review): the file is named "svm_model.pkl" but the object pickled is the
# decision-tree pipeline fitted above. The name is left unchanged in case other
# code loads this path; rename it if nothing depends on it.
pkl_filename = "svm_model.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(model, file)

### Random Forest