In [None]:
"""
We will first import all the libraries and functions we will need
"""

In [1]:
import nltk                                
import re                                 
import string                             
import numpy as np
import pandas as pd                         
from nltk.corpus import stopwords          
from nltk.stem import PorterStemmer        
from nltk.tokenize import TweetTokenizer   
from os.path import exists
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


In [None]:
"""
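This function cleans a single text string (one cell of the data frame): it strips tickers, retweet markers, links and hash signs, tokenizes the text, removes stopwords and punctuation, and stems the remaining words.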
"""

In [2]:
def clean(unclean):
    """Turn one raw text string into a list of stemmed tokens.

    Processing steps, in order:
      1. trim surrounding whitespace;
      2. drop ``$``-prefixed tokens (stock tickers such as ``$GE``);
      3. drop a leading old-style retweet marker ``RT``;
      4. drop hyperlinks (only the link itself, not the text after it);
      5. drop ``#`` signs while keeping the hashtag word;
      6. tokenize with ``TweetTokenizer`` (lower-cases, strips @handles,
         shortens long character repetitions);
      7. discard English stopwords and punctuation tokens;
      8. Porter-stem every remaining token.

    Parameters
    ----------
    unclean : str
        Raw text. Any non-string value (for example the NaN pandas uses for
        a missing cell) is treated as empty text.

    Returns
    -------
    list of str
        The stemmed tokens, in their original order.
    """
    # Missing cells arrive as float NaN when used with DataFrame.apply;
    # re.sub would raise TypeError on them.
    if not isinstance(unclean, str):
        return []

    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    # A set gives constant-time membership tests in the filter below.
    stopwords_english = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    # Trim first so that the '^RT' anchor below also matches text that
    # starts with whitespace.
    unclean = unclean.strip()
    # Remove stock market tickers like $GE
    unclean = re.sub(r'\$\w*', '', unclean)
    # Remove old style retweet text "RT"
    unclean = re.sub(r'^RT[\s]+', '', unclean)
    # Remove hyperlinks. \S+ stops at the first whitespace, so the words
    # following a link on the same line are kept.
    unclean = re.sub(r'https?://\S+', '', unclean)
    # Remove only the hash sign, keeping the hashtag word itself
    unclean = re.sub(r'#', '', unclean)

    return [
        stemmer.stem(word)
        for word in tokenizer.tokenize(unclean)
        if word not in stopwords_english and word not in string.punctuation
    ]

In [None]:
"""
This function checks whether a cleaned copy of the data already exists.
If it does, we simply load it; otherwise we clean the raw data and save the result, so the cleaning step is not repeated on later runs.
"""

In [3]:
def clean_text(raw_path='news.csv', cache_path='clean_text_news.csv'):
    """Return the news data frame with cleaned 'text' and 'title' columns.

    If ``cache_path`` exists it is loaded as-is. Otherwise ``raw_path`` is
    read, both text columns are passed through ``clean``, and the result is
    written to ``cache_path`` for later runs.

    Both branches return the text columns as plain strings, so the result can
    be fed to a text vectorizer whether or not the cache already existed.

    Parameters
    ----------
    raw_path : str, optional
        CSV file holding the uncleaned data; needs 'text' and 'title' columns.
    cache_path : str, optional
        CSV file used to store / reload the cleaned data.

    Returns
    -------
    pandas.DataFrame
    """
    text_columns = ('text', 'title')

    if exists(cache_path):
        df = pd.read_csv(cache_path)
    else:
        df = pd.read_csv(raw_path)
        for column in text_columns:
            # clean() returns a token list; join it so the column holds
            # strings here too, exactly as it will after a CSV round trip.
            df[column] = df[column].apply(lambda raw: ' '.join(clean(raw)))
        df.to_csv(cache_path, index=False)

    # read_csv turns empty fields into NaN; normalise them back to empty
    # strings so every cell in the text columns is a str.
    for column in text_columns:
        if column in df.columns:
            df[column] = df[column].fillna('')
    return df

In [None]:
"""
This is the Naïve Bayes classifier training, testing, and evaluation function.
"""

In [4]:
def multinomial(X_train_tf, X_test_tf, y_test, y_train):
    """Fit a multinomial Naïve Bayes model and print its test-set results.

    The model is trained on (X_train_tf, y_train) and evaluated on
    (X_test_tf, y_test). The accuracy and the classification report are
    printed; the function always returns 0.
    """
    classifier = MultinomialNB()
    classifier.fit(X_train_tf, y_train)
    predictions = classifier.predict(X_test_tf)
    accuracy = accuracy_score(y_test, predictions)

    report = classification_report(y_test, predictions)
    print("Naïve Bayes Classifier")
    print("Accuracy score is: ", accuracy)
    print(report)
    print("----------------------------------------------------------------------")
    return 0

In [None]:
"""
This is the logistic regression training, testing, and evaluation function.
"""

In [5]:
def logistic_regression(X_train_tf, X_test_tf, y_test, y_train):
    """Fit a logistic regression model and print its test-set results.

    The model is trained on (X_train_tf, y_train) and evaluated on
    (X_test_tf, y_test). The accuracy and the classification report are
    printed; the function always returns 0.
    """
    classifier = LogisticRegression()
    classifier.fit(X_train_tf, y_train)
    predictions = classifier.predict(X_test_tf)
    accuracy = classifier.score(X_test_tf, y_test)

    report = classification_report(y_test, predictions)
    print("Logistic regression")
    print("Accuracy score is: ", accuracy)
    print(report)
    print("----------------------------------------------------------------------")
    return 0


In [None]:
"""
This is the Support Vector Machines training, testing, and evaluation function.
"""

In [6]:
def support_vector_machines(X_train_tf, X_test_tf, y_test, y_train):
    """Fit a linear-kernel SVM and print its test-set results.

    The model is trained on (X_train_tf, y_train) and evaluated on
    (X_test_tf, y_test). The accuracy and the classification report are
    printed; the function always returns 0.
    """
    classifier = svm.SVC(kernel='linear')
    classifier.fit(X_train_tf, y_train)
    predictions = classifier.predict(X_test_tf)
    accuracy = classifier.score(X_test_tf, y_test)

    report = classification_report(y_test, predictions)
    print("Support Vector Machines")
    print("Accuracy score is: ", accuracy)
    print(report)
    print("----------------------------------------------------------------------")
    return 0

In [None]:
"""
This is the K-nearest-neighbors training, testing, and evaluation function, with k = 5.
"""

In [7]:
def k_neighbour(X_train_tf, X_test_tf, y_test, y_train):
    """Fit a 5-nearest-neighbours classifier and print its test-set results.

    The model is trained on (X_train_tf, y_train) and evaluated on
    (X_test_tf, y_test). The accuracy and the classification report are
    printed; the function always returns 0.
    """
    classifier = KNeighborsClassifier(n_neighbors=5)
    classifier.fit(X_train_tf, y_train)
    predictions = classifier.predict(X_test_tf)
    accuracy = classifier.score(X_test_tf, y_test)

    report = classification_report(y_test, predictions)
    print("K-nearest-neighbors with K = 5")
    print("Accuracy score is: ", accuracy)
    print(report)
    print("----------------------------------------------------------------------")
    return 0

In [None]:
"""
This is the main cell. We first split the data: 80% is used for training and the remaining 20% is the test data.
The split is randomized, so each run gives a different split with the same percentages.
We then perform feature extraction and test each of our models with the resulting train and test data.
"""

In [8]:
# Load the cleaned data set (read from the cache file when one exists).
df = clean_text()
# Features come from the 'text' column, targets from the 'label' column.
x = df['text']
y = df['label']
# Hold out 20% of the rows for testing. No random_state is passed, so the
# split (and therefore every score below) differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
# Fit the TF-IDF vectorizer on the training split only, then apply the same
# fitted vectorizer to the test split.
tf_vectorizer = TfidfVectorizer(use_idf=True) 
X_train_tf = tf_vectorizer.fit_transform(X_train)
X_test_tf = tf_vectorizer.transform(X_test)
# Train and evaluate every model on the same split; each call prints its own
# report and returns 0 (the last call's 0 is what the cell displays).
multinomial(X_train_tf,X_test_tf, y_test, y_train)
logistic_regression(X_train_tf,X_test_tf, y_test, y_train)
support_vector_machines(X_train_tf,X_test_tf, y_test, y_train)
k_neighbour(X_train_tf,X_test_tf, y_test, y_train)

Naïve Bayes Classifier
Accuracy score is:  0.8366219415943172
              precision    recall  f1-score   support

        FAKE       0.98      0.69      0.81       632
        REAL       0.76      0.99      0.86       635

    accuracy                           0.84      1267
   macro avg       0.87      0.84      0.83      1267
weighted avg       0.87      0.84      0.83      1267

----------------------------------------------------------------------
Logistic regression
Accuracy score is:  0.9226519337016574
              precision    recall  f1-score   support

        FAKE       0.90      0.95      0.92       632
        REAL       0.95      0.90      0.92       635

    accuracy                           0.92      1267
   macro avg       0.92      0.92      0.92      1267
weighted avg       0.92      0.92      0.92      1267

----------------------------------------------------------------------
Support Vector Machines
Accuracy score is:  0.9384372533543804
              precis

0