In [None]:
import pandas as pd

def read_data(file_path):
    """Read a CSV file into a DataFrame and print a short summary of it.

    Args:
        file_path: Location of the CSV file, passed straight to
            ``pd.read_csv``.

    Returns:
        pandas.DataFrame: The parsed file contents.
    """
    print("Opening dataset: ", file_path)
    frame = pd.read_csv(file_path)

    # Echo the basic layout so the reader can sanity-check the load.
    frame_shape, frame_columns = frame.shape, frame.columns
    print("Dataframe shape: ", frame_shape)
    print("Dataframe columns: ", frame_columns, "\n")

    return frame

In [None]:
#Load both CSV files and tag every row with its class label:
#1 for rows from True.csv, 0 for rows from Fake.csv
true_data = read_data("Datasets/True.csv").assign(label=1)

false_data = read_data("Datasets/Fake.csv").assign(label=0)

In [None]:
#Check if there is any null value
print("True data null values: ", true_data.isnull().sum())

#We actually don't need the date column for the classification
true_data.drop(columns=['date'])

In [None]:
#Check if there is any null value
print("False data null values: ", false_data.isnull().sum())

#The same goes for the false data
false_data.drop(columns=['date'])

In [None]:
#Stack the true and fake frames into a single dataset with a fresh index
dataset = pd.concat(objs=[true_data, false_data], ignore_index=True)
print("Dataset merged, resulting shape: ", dataset.shape)

#Sanity checks: no row was lost, and the column count matches both inputs
assert len(dataset) == len(true_data) + len(false_data)
assert len(dataset.columns) == len(true_data.columns) == len(false_data.columns)

In [None]:
#Bag of Words: CountVectorizer is created here with its default settings.
#The instance is only constructed in this cell; it is fitted on the training
#texts in a later cell and reused to transform the test texts and new inputs.
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer()

In [None]:
#Hold out 20% of the articles for evaluation; the fixed seed keeps the split repeatable
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    dataset["text"],
    dataset["label"],
    test_size=0.2,
    random_state=1,
)

print("X_train shape: ", X_train.shape)
print("X_test shape: ", X_test.shape)

In [None]:
#Fit the vectorizer on the training texts only and build their count matrix
cv_1 = cv.fit_transform(raw_documents=X_train)
print(cv_1.shape)

In [None]:
#Using Random Forest to predict test values
from sklearn.ensemble import RandomForestClassifier

#A random forest is stochastic: without a seed every Restart & Run All produces
#a different model and different metrics. Reuse the seed of the train/test split.
rf=RandomForestClassifier(random_state=1)
rf.fit(cv_1,y_train)

#The test texts go through the vectorizer that was fitted on X_train
#(transform only, no re-fit), so train and test share the same vocabulary.
y_pred_randomForest=rf.predict(cv.transform(X_test))

In [None]:
#Share of test articles for which the random forest predicted the right label
from sklearn.metrics import accuracy_score
print(accuracy_score(y_true=y_test, y_pred=y_pred_randomForest))

In [None]:
#Pie chart of the model accuracy versus its error share
import matplotlib.pyplot as plt

def plot_accuracy(y_test,y_pred_randomForest):
    """Show a pie chart with two slices: accuracy and its complement.

    Args:
        y_test: Ground-truth labels.
        y_pred_randomForest: Predicted labels to compare against ``y_test``.
    """
    plt.figure(figsize=(5, 5))
    hit_share = accuracy_score(y_test, y_pred_randomForest)
    slice_sizes = [hit_share, 1 - hit_share]
    plt.pie(slice_sizes, labels=['Accuracy', 'Error'], autopct='%1.3f%%')
    plt.show()

plot_accuracy(y_test, y_pred_randomForest)

In [None]:
#Confusion matrix of the predictions, drawn as an annotated heatmap
from sklearn.metrics import confusion_matrix
import seaborn as sns

def plot_confusion_matrix(y_test,y_pred_randomForest):
    """Draw the confusion matrix of the predictions as an annotated heatmap.

    Args:
        y_test: Ground-truth labels (labelled 'Truth' on the y axis).
        y_pred_randomForest: Predicted labels (labelled 'Predicted' on the x axis).
    """
    _, ax = plt.subplots(figsize=(5, 5))
    counts = confusion_matrix(y_test, y_pred_randomForest)
    # fmt='d' prints the cell counts as plain integers.
    sns.heatmap(counts, annot=True, fmt='d', ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Truth')
    plt.show()

plot_confusion_matrix(y_test, y_pred_randomForest)

In [None]:
#Calculate the false positive rate, true positive rate
from sklearn.metrics import roc_curve

def plot_roc_curve(y_test,y_pred_randomForest):
    """Plot the ROC curve (true positive rate against false positive rate).

    Args:
        y_test: Ground-truth binary labels.
        y_pred_randomForest: Scores for the positive class. Continuous scores
            such as predicted probabilities give a full curve; hard 0/1
            predictions are accepted but collapse the curve to a single
            operating point joined to the corners.
    """
    fpr,tpr,_=roc_curve(y_test,y_pred_randomForest)
    plt.figure(figsize=(5,5))
    plt.plot(fpr,tpr)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.show()

#Feed the curve with the forest's probability for class 1 instead of its hard
#predictions. predict_proba columns follow rf.classes_, so with the 0/1 labels
#used in this notebook column 1 is the probability of label 1.
plot_roc_curve(y_test,rf.predict_proba(cv.transform(X_test))[:,1])

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# NOTE(review): this second vectorizer is never fitted or used in the visible
# cells; the features below still come from `cv`. Confirm it can be removed.
count_vectorizer = CountVectorizer()

# k-nearest-neighbours (k=5) trained on the same count matrix as the forest.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X=cv_1, y=y_train)
y_pred_knn = knn.predict(cv.transform(X_test))

print(accuracy_score(y_true=y_test, y_pred=y_pred_knn))

In [None]:
#Input a news article to test the model

def test_news_input(text, classifier):
    news_cv = cv.transform([text])

    if classifier.predict(news_cv):
        print("The news is true")
        return 1
    else:
        print("The news is fake")
        return 0

test_news_input(text="Donald Trump", classifier=rf)