# Twitter fake news detection model 

In this project, we will develop fake news detection model using the pre-labeling fake news dataset provided by Polifact, a third-party non-profit organization.

By the dataset provided, we have two features: **news url** and **news title**, we will analyze the text from those two features to try to predict whether a twitter news is fake or not.

# Import and load data

In [1]:
#import package
import pyspark.sql.functions as F
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
from pyspark.sql.types import *
from pyspark.sql import SQLContext
from pyspark import SparkConf, SparkContext
from itertools import chain
from pyspark.sql.window import Window
from pyspark.sql.types import MapType, StringType, IntegerType
from collections import Counter
from pyspark.sql import Window
from pyspark.sql.functions import array_contains
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.feature import SQLTransformer, RegexTokenizer, StopWordsRemover, CountVectorizer, Imputer
from pyspark.ml.classification import RandomForestClassifier,LogisticRegression
import tweepy
import time
from kafka import KafkaConsumer,KafkaProducer

In [2]:
# initializing spark session
sc=SparkContext(appName='PySparkShell')
spark=SparkSession(sc)

In [3]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

In [4]:
# Read file 
# to read parquet file
data=spark.read.csv('combined_polifact.csv',inferSchema=True, header = True)
print(type(data))
data.show()

<class 'pyspark.sql.dataframe.DataFrame'>
+---------------+--------------------+--------------------+-----+
|             id|            news_url|               title|label|
+---------------+--------------------+--------------------+-----+
|politifact15014|speedtalk.com/for...|BREAKING: First N...|    1|
|politifact15156|politics2020.info...|Court Orders Obam...|    1|
|politifact14745|www.nscdscamps.or...|UPDATE: Second Ro...|    1|
|politifact14355|https://howafrica...|Oscar Pistorius A...|    1|
|politifact15371|http://washington...|Trump Votes For D...|    1|
|politifact14404|gloria.tv/video/y...|Putin says: ?�?Po...|    1|
|politifact13919|http://blogs.tren...|New York Man Want...|    1|
|politifact14795|https://web.archi...|Saudi Arabia to B...|    1|
|politifact14328|https://web.archi...|Malia Obama Fired...|    1|
|politifact13775|http://beforeitsn...|Target to Discont...|    1|
|politifact14678|http://yournewswi...|Youngest World Le...|    1|
|politifact14394|https://web.archi

In [6]:
# inspect for null value 
data = data.filter(data.news_url. isNotNull())

In [7]:
# change the column name for original label
data = data.withColumnRenamed("label", "fake") # change the original dataset label name to others,'label' will be used in later stage

#clean the news_url
new_data = data.withColumn("url_1",regexp_replace(col("news_url"), "(https|http):\/\/(www|web)*\.*", ""))

In [9]:
new_data.show()

+---------------+--------------------+--------------------+----+--------------------+
|             id|            news_url|               title|fake|               url_1|
+---------------+--------------------+--------------------+----+--------------------+
|politifact15014|speedtalk.com/for...|BREAKING: First N...|   1|speedtalk.com/for...|
|politifact15156|politics2020.info...|Court Orders Obam...|   1|politics2020.info...|
|politifact14745|www.nscdscamps.or...|UPDATE: Second Ro...|   1|www.nscdscamps.or...|
|politifact14355|https://howafrica...|Oscar Pistorius A...|   1|howafrica.com/osc...|
|politifact15371|http://washington...|Trump Votes For D...|   1|washingtonsources...|
|politifact14404|gloria.tv/video/y...|Putin says: ?�?Po...|   1|gloria.tv/video/y...|
|politifact13919|http://blogs.tren...|New York Man Want...|   1|blogs.trendolizer...|
|politifact14795|https://web.archi...|Saudi Arabia to B...|   1|archive.org/web/2...|
|politifact14328|https://web.archi...|Malia Obama Fire

# Machine Learning Pipeline

## Logistic Regression

Build pipeline.

In [12]:
StopWordsRemover.loadDefaultStopWords('english')

# #### For title #####
# 0. Extract tokens from title
tokenizer_title= RegexTokenizer(inputCol= 'url_1', outputCol= 'url_words',pattern= '\\W', toLowercase= True)

# 1. Remove stop words from title
title_sw_remover= StopWordsRemover(inputCol= 'url_words', outputCol= 'url_stop_removed')

# 2. Compute Term frequency from title
title_count_vectorizer= CountVectorizer(inputCol= 'url_stop_removed', outputCol= 'url_tf')

# 3. Compute Term frequency-inverse document frequency from title
title_tfidf= IDF(inputCol= 'url_tf', outputCol= 'tf_idf_url')

#### For title text ####
# 4. Extract tokens from text
text_tokenizer= RegexTokenizer(inputCol= 'title', outputCol= 'text_words',
                                pattern= '\\W', toLowercase= True)

# 5. Remove stop words from text
add_stopwords_title = ["http","https","amp","rt","www"] 
text_sw_remover= StopWordsRemover(inputCol= 'text_words', outputCol= 'text_sw_removed').setStopWords(add_stopwords_title)

# 6. Compute Term frequency from text (BagofWords Count)
text_count_vectorizer= CountVectorizer(inputCol= 'text_sw_removed', outputCol= 'tf_text')

# 7. Compute Term frequency-inverse document frequency text
text_tfidf= IDF(inputCol= 'tf_text', outputCol= 'tf_idf_text')

# 8. StringIndexer subject
labelIndexer= StringIndexer(inputCol= 'fake', outputCol= 'label')

# 9. VectorAssembler
vec_assembler= VectorAssembler(inputCols=['tf_idf_url', 'tf_idf_text'], outputCol= 'features')

# 10 logistic regression
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)

# Build the pipeline to deal with document
LRpipeline = Pipeline(stages=[tokenizer_title,
                            title_sw_remover,
                            title_count_vectorizer,
                            title_tfidf,
                            text_tokenizer, 
                            text_sw_remover, 
                            text_count_vectorizer,
                            text_tfidf,
                            vec_assembler,
                            labelIndexer,
                            lr])

# set seed for reproducibility
(trainingData, testData) = new_data.randomSplit([0.7, 0.3], seed = 100)


Fit the training set.

In [13]:
# Fit the pipeline to training documents.
pipelineFit = LRpipeline.fit(trainingData)

Predict and test on the test dataset.

In [14]:
predictions = pipelineFit.transform(testData)

Evaluate.

In [35]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.8435317154291192

In [17]:
# Function for evaluating classification model
from pyspark.ml.evaluation import  MulticlassClassificationEvaluator, BinaryClassificationEvaluator

accuracy= MulticlassClassificationEvaluator(labelCol= 'label', predictionCol= 'prediction', metricName= 'accuracy')
f1= MulticlassClassificationEvaluator(labelCol= 'label', predictionCol= 'prediction', metricName= 'f1')
areaUnderROC= BinaryClassificationEvaluator(labelCol= 'label', metricName= 'areaUnderROC')
# Since it's both important to avoid misinformation spread and to not incorrectly flag those correct information, we here focus on the f1 and AUC most.

def classification_evaluator(data_result):
    data_result.crosstab(col1= 'prediction', col2= 'label').show()
    print('f1:' ,f1.evaluate(data_result))
    print('areaUnderROC:' ,areaUnderROC.evaluate(data_result))

In [18]:
classification_evaluator(predictions)

+----------------+---+---+
|prediction_label|0.0|1.0|
+----------------+---+---+
|             1.0|  1| 88|
|             0.0|179| 46|
+----------------+---+---+

f1: 0.8435317154291192
areaUnderROC: 0.9737562189054717


The TF-IDF model with Logistic Regressor can give us AUC 0.97, which is a pretty good result.  

## Random Forest Classfier

Built pipeline

In [21]:
# Random Forest Classifier
rf= RandomForestClassifier(featuresCol= 'features', labelCol= 'label', predictionCol= 'prediction', maxDepth= 10, numTrees= 20)

# Build the pipeline to deal with document
RFpipeline = Pipeline(stages=[tokenizer_title,
                            title_sw_remover,
                            title_count_vectorizer,
                            title_tfidf,
                            text_tokenizer, 
                            text_sw_remover, 
                            text_count_vectorizer,
                            text_tfidf,
                            vec_assembler,
                            labelIndexer,
                            rf])

Fit the training set.

In [22]:
# Fit the pipeline to training documents.
RFpipelineFit = RFpipeline.fit(trainingData)

Predict and test on the test dataset.

In [23]:
RFpredictions = RFpipelineFit.transform(testData)

In [24]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.8435317154291192

In [26]:
# Function for evaluating classification model
from pyspark.ml.evaluation import  MulticlassClassificationEvaluator, BinaryClassificationEvaluator

accuracy= MulticlassClassificationEvaluator(labelCol= 'label', predictionCol= 'prediction', metricName= 'accuracy')
f1= MulticlassClassificationEvaluator(labelCol= 'label', predictionCol= 'prediction', metricName= 'f1')
areaUnderROC= BinaryClassificationEvaluator(labelCol= 'label', metricName= 'areaUnderROC')
# Since it's both important to avoid misinformation spread and to not incorrectly flag those correct information, we here focus on the f1 and AUC most.

def classification_evaluator(data_result):
    data_result.crosstab(col1= 'prediction', col2= 'label').show()
    print('f1:' ,f1.evaluate(data_result))
    print('areaUnderROC:' ,areaUnderROC.evaluate(data_result))

In [28]:
classification_evaluator(RFpredictions)

+----------------+---+---+
|prediction_label|0.0|1.0|
+----------------+---+---+
|             1.0| 10| 70|
|             0.0|170| 64|
+----------------+---+---+

f1: 0.7499670011413724
areaUnderROC: 0.9150704809286897


The Random Forest Classifier provides a inperior result than Logistic Classifier.