<a href="https://colab.research.google.com/github/Pranjalya/fake-news-twitter/blob/master/Fake_News_Countering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Using Spark

Installing dependencies (JAVA, Hadoop and Spark)

In [0]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!pip install -q findspark
!pip install -q pyspark
!pip install -q tweet-preprocessor

Creating Spark session using PySpark

In [0]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

## Loading data

In [27]:
import pandas as pd
import numpy as np
import os
import re
# Tweet preprocessor - https://github.com/s/preprocessor
import preprocessor as tp
import nltk
import string
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm import tqdm

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [15]:
path = '/content/drive/My Drive/twitter_dataset/'
first = True

for f in tqdm(os.listdir(path)):
    if(first):
        df = pd.read_json(path + f, compression='gzip', lines=True)
        first = True
    else:
        df = pd.concat([df, pd.read_json(path + f, compression='gzip', lines=True)])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15686 entries, 0 to 15685
Data columns (total 33 columns):
 #   Column                     Non-Null Count  Dtype              
---  ------                     --------------  -----              
 0   created_at                 15686 non-null  datetime64[ns, UTC]
 1   id                         15686 non-null  int64              
 2   id_str                     15686 non-null  int64              
 3   full_text                  15686 non-null  object             
 4   truncated                  15686 non-null  bool               
 5   display_text_range         15686 non-null  object             
 6   entities                   15686 non-null  object             
 7   source                     15686 non-null  object             
 8   in_reply_to_status_id      839 non-null    float64            
 9   in_reply_to_status_id_str  839 non-null    float64            
 10  in_reply_to_user_id        882 non-null    float64            
 11  in

In [16]:
df.head()

Unnamed: 0,created_at,id,id_str,full_text,truncated,display_text_range,entities,source,in_reply_to_status_id,in_reply_to_status_id_str,in_reply_to_user_id,in_reply_to_user_id_str,in_reply_to_screen_name,user,geo,coordinates,place,contributors,retweeted_status,is_quote_status,retweet_count,favorite_count,favorited,retweeted,lang,extended_entities,possibly_sensitive,quoted_status_id,quoted_status_id_str,quoted_status_permalink,quoted_status,withheld_scope,withheld_in_countries
0,2020-01-23 18:59:28+00:00,1220420803336048642,1220420803336048640,RT @ConflictsW: Roads in Wuhan on lockdown and...,False,"[0, 140]","{'hashtags': [{'text': 'China', 'indices': [99...","<a href=""http://twitter.com/download/iphone"" r...",,,,,,"{'id': 12537532, 'id_str': '12537532', 'name':...",,,,,{'created_at': 'Wed Jan 22 19:44:32 +0000 2020...,False,7443,0,False,False,en,,,,,,,,
1,2020-01-23 18:59:28+00:00,1220420805189931009,1220420805189931008,RT @BNODesk: Here's a current list of the coro...,False,"[0, 140]","{'hashtags': [], 'symbols': [], 'user_mentions...","<a href=""https://mobile.twitter.com"" rel=""nofo...",,,,,,"{'id': 624000012, 'id_str': '624000012', 'name...",,,,,{'created_at': 'Thu Jan 23 18:02:45 +0000 2020...,False,353,0,False,False,en,,,,,,,,
2,2020-01-23 18:59:29+00:00,1220420806104289286,1220420806104289280,RT @ScottPresler: Is anyone else following the...,False,"[0, 140]","{'hashtags': [], 'symbols': [], 'user_mentions...","<a href=""http://twitter.com/download/iphone"" r...",,,,,,"{'id': 39666288, 'id_str': '39666288', 'name':...",,,,,{'created_at': 'Thu Jan 23 18:32:48 +0000 2020...,False,1241,0,False,False,en,,,,,,,,
3,2020-01-23 18:59:29+00:00,1220420808985718785,1220420808985718784,RT @JackPosobiec: Huge \n\nWas just told by a ...,False,"[0, 139]","{'hashtags': [], 'symbols': [], 'user_mentions...","<a href=""http://twitter.com/download/iphone"" r...",,,,,,"{'id': 1080984630331957249, 'id_str': '1080984...",,,,,{'created_at': 'Thu Jan 23 18:22:14 +0000 2020...,False,1161,0,False,False,en,,,,,,,,
4,2020-01-23 18:59:30+00:00,1220420812123115520,1220420812123115520,RT @inteldotwav: Helpful diagram by the WHO sh...,False,"[0, 123]","{'hashtags': [{'text': 'coronavirus', 'indices...","<a href=""https://mobile.twitter.com"" rel=""nofo...",,,,,,"{'id': 43403644, 'id_str': '43403644', 'name':...",,,,,{'created_at': 'Thu Jan 23 18:52:04 +0000 2020...,False,141,0,False,False,en,"{'media': [{'id': 1220418936216412160, 'id_str...",0.0,,,,,,


## Data Preprocessing

Getting all the list of emoticons and other pictorials first.

In [0]:
emoticons_happy = set([
    ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
    ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
    '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
    'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
    '<3'
    ])

emoticons_sad = set([
    ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
    ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
    ':c', ':{', '>:\\', ';('
    ])

#Emoji patterns
emoji_pattern = re.compile("["
         u"\U0001F600-\U0001F64F"  # emoticons
         u"\U0001F300-\U0001F5FF"  # symbols & pictographs
         u"\U0001F680-\U0001F6FF"  # transport & map symbols
         u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
         u"\U00002702-\U000027B0"
         u"\U000024C2-\U0001F251"
         "]+", flags=re.UNICODE)

# All emoticons
emoticons = emoticons_happy.union(emoticons_sad)

Performing basic preprocessings.

In [0]:
def clean_tweets(tweet):
 
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(tweet)

    # Removing retweets and 
    tweet = re.sub(r'RT', '', tweet)
    tweet = re.sub(r':', '', tweet)
    tweet = re.sub(r'‚Ä¶', '', tweet)

    # Replace=ing consecutive non-ASCII characters with a space
    tweet = re.sub(r'[^\x00-\x7F]+',' ', tweet)

    # Removing emoticons from tweet
    tweet = emoji_pattern.sub(r'', tweet)

    # Filtering using NLTK library append it to a string
    filtered_tweet = [w for w in word_tokens if not w in stop_words]
    filtered_tweet = []

    for w in word_tokens:
        if w not in stop_words and w not in emoticons and w not in string.punctuation:
            filtered_tweet.append(w)
    
    return ' '.join(filtered_tweet)

Cleaning all the tweets.

In [0]:
# Cleaning using tweet-preprocessor tool first
df['full_text'] = df['full_text'].apply(tp.clean)

# Performing some extra custom cleaning steps
df['full_text'] = df['full_text'].apply(clean_tweets)

In [33]:
df.head()

Unnamed: 0,created_at,id,id_str,full_text,truncated,display_text_range,entities,source,in_reply_to_status_id,in_reply_to_status_id_str,in_reply_to_user_id,in_reply_to_user_id_str,in_reply_to_screen_name,user,geo,coordinates,place,contributors,retweeted_status,is_quote_status,retweet_count,favorite_count,favorited,retweeted,lang,extended_entities,possibly_sensitive,quoted_status_id,quoted_status_id_str,quoted_status_permalink,quoted_status,withheld_scope,withheld_in_countries
0,2020-01-23 18:59:28+00:00,1220420803336048642,1220420803336048640,Roads Wuhan lockdown blocked police city quara...,False,"[0, 140]","{'hashtags': [{'text': 'China', 'indices': [99...","<a href=""http://twitter.com/download/iphone"" r...",,,,,,"{'id': 12537532, 'id_str': '12537532', 'name':...",,,,,{'created_at': 'Wed Jan 22 19:44:32 +0000 2020...,False,7443,0,False,False,en,,,,,,,,
1,2020-01-23 18:59:28+00:00,1220420805189931009,1220420805189931008,Here 's current list coronavirus lockdowns Wuh...,False,"[0, 140]","{'hashtags': [], 'symbols': [], 'user_mentions...","<a href=""https://mobile.twitter.com"" rel=""nofo...",,,,,,"{'id': 624000012, 'id_str': '624000012', 'name...",,,,,{'created_at': 'Thu Jan 23 18:02:45 +0000 2020...,False,353,0,False,False,en,,,,,,,,
2,2020-01-23 18:59:29+00:00,1220420806104289286,1220420806104289280,Is anyone else following Coronavirus outbreak ...,False,"[0, 140]","{'hashtags': [], 'symbols': [], 'user_mentions...","<a href=""http://twitter.com/download/iphone"" r...",,,,,,"{'id': 39666288, 'id_str': '39666288', 'name':...",,,,,{'created_at': 'Thu Jan 23 18:32:48 +0000 2020...,False,1241,0,False,False,en,,,,,,,,
3,2020-01-23 18:59:29+00:00,1220420808985718785,1220420808985718784,Huge Was told Chinese reporter CCP authorities...,False,"[0, 139]","{'hashtags': [], 'symbols': [], 'user_mentions...","<a href=""http://twitter.com/download/iphone"" r...",,,,,,"{'id': 1080984630331957249, 'id_str': '1080984...",,,,,{'created_at': 'Thu Jan 23 18:22:14 +0000 2020...,False,1161,0,False,False,en,,,,,,,,
4,2020-01-23 18:59:30+00:00,1220420812123115520,1220420812123115520,Helpful diagram WHO showing various symptoms W...,False,"[0, 123]","{'hashtags': [{'text': 'coronavirus', 'indices...","<a href=""https://mobile.twitter.com"" rel=""nofo...",,,,,,"{'id': 43403644, 'id_str': '43403644', 'name':...",,,,,{'created_at': 'Thu Jan 23 18:52:04 +0000 2020...,False,141,0,False,False,en,"{'media': [{'id': 1220418936216412160, 'id_str...",0.0,,,,,,
