In [108]:
import json
import pandas as pd
import numpy as np # linear algebra
import os
import re
import emoji
import string
import nltk

In [74]:
def load_jsonl(input_path) -> list:
    """
    Read list of objects from a JSON lines file.
    """
    data = []
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            data.append(json.loads(line.rstrip('\n|\r')))
    print('Loaded {} records from {}'.format(len(data), input_path))
    return data

In [75]:
webpage_data = load_jsonl('data/trainingData/coronavirus-tweet-id-2020-06-01-00.jsonl')
db_data = []
db_cols = ['created_at', 'full_text', 'lang']
for d in webpage_data:
    db_data.append([])
    for col in db_cols:
        db_data[-1].append(d.get(col, float('nan')))
        
df = pd.DataFrame(db_data, columns=db_cols)



Loaded 36614 records from data/trainingData/coronavirus-tweet-id-2020-06-01-00.jsonl


## EDA

In [76]:
df.head()

Unnamed: 0,created_at,full_text,lang
0,Mon Jun 01 00:00:00 +0000 2020,RT @LissomeLight: Oh my god these people reall...,en
1,Mon Jun 01 00:00:00 +0000 2020,"RT @costareports: To some of the senators, Tru...",en
2,Mon Jun 01 00:00:00 +0000 2020,RT @AngrierWHStaff: The venn diagram of the pe...,en
3,Mon Jun 01 00:00:00 +0000 2020,RT @ArashKolahi: This country was infinity mor...,en
4,Mon Jun 01 00:00:00 +0000 2020,I SCREAMED 😂😂 https://t.co/CuIPpcFTTr,en


In [77]:
df.describe()

Unnamed: 0,created_at,full_text,lang
count,36614,36614,36614
unique,837,16924,47
top,Mon Jun 01 00:00:00 +0000 2020,RT @HelpfulKraken: They can't give you money t...,en
freq,72,1663,25334


In [78]:
def missing_value_of_data(data):
    total=data.isnull().sum().sort_values(ascending=False)
    percentage=round(total/data.shape[0]*100,2)
    return pd.concat([total,percentage],axis=1,keys=['Total','Percentage'])

In [79]:
missing_value_of_data(df)

Unnamed: 0,Total,Percentage
lang,0,0.0
full_text,0,0.0
created_at,0,0.0


In [80]:
def count_values_in_column(data,feature):
    total=data.loc[:,feature].value_counts(dropna=False)
    percentage=round(data.loc[:,feature].value_counts(dropna=False,normalize=True)*100,2)
    return pd.concat([total,percentage],axis=1,keys=['Total','Percentage'])

In [81]:
count_values_in_column(df,'full_text')

Unnamed: 0,Total,Percentage
"RT @HelpfulKraken: They can't give you money to survive a pandemic, they can't find schools, they can't fix water pipes but they can buy ev…",1663,4.54
RT @LEXartistes: Attending protests is not for everyone and that's ok. Whether you are continuing to quarantine in mid-pandemic or you simp…,816,2.23
RT @kissychalamets: la is closing covid-19 testing centers to punish protesters..... they're weaponizing this pandemic in an attempt to sil…,566,1.55
RT @brs2167: She got a walker. Can barely raise her voice. In the hot sun. AT 74. In the middle of a pandemic that threatens her more than…,520,1.42
RT @trythefish: Huntington Beach 2 weeks ago: Protest against stay-at-home order and to open the beaches on the left.…,457,1.25
...,...,...
RT @Antena2RCN: Colombia se acerca a los 30mil contagios y 1000 muertos por covid-19 https://t.co/Gfz0BqJ9Uc,1,0.00
RT @Lenin: ¡Las decisiones que tomamos son responsables y oportunas!Así lo reconoce la comunidad internacional. Recibimos el respaldo de @F…,1,0.00
"RT @critlegthinking: ... featuring, inter alia, @maariaris",1,0.00
"What happened to the 60s,70s peaceful protesters? Its’s all about the leadership and his Rhetoric promoting these horrific events and pandemic of ugliness in our Nation...",1,0.00


In [82]:
def unique_values_in_column(data,feature):
    unique_val=pd.Series(data.loc[:,feature].unique())
    return pd.concat([unique_val],axis=1,keys=['Unique Values'])

In [83]:
unique_values_in_column(df,'full_text')

Unnamed: 0,Unique Values
0,RT @LissomeLight: Oh my god these people reall...
1,"RT @costareports: To some of the senators, Tru..."
2,RT @AngrierWHStaff: The venn diagram of the pe...
3,RT @ArashKolahi: This country was infinity mor...
4,I SCREAMED 😂😂 https://t.co/CuIPpcFTTr
...,...
16919,“The lesson from this disaster is that the leg...
16920,RT @Debora_D_Diniz: Ministra Damares faz chaco...
16921,interesting how just last week we were calling...
16922,"Por um momento geral esqueceu do covid, fogo n..."


In [84]:
def duplicated_values_data(data):
    dup=[]
    columns=data.columns
    for i in data.columns:
        dup.append(sum(data[i].duplicated()))
    return pd.concat([pd.Series(columns),pd.Series(dup)],axis=1,keys=['Columns','Duplicate count'])

In [85]:
duplicated_values_data(df)

Unnamed: 0,Columns,Duplicate count
0,created_at,35777
1,full_text,19690
2,lang,36567


## URL 

In [86]:
 def find_url(string): 
    text = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',string)
    return "".join(text) # converting return value from list to string

In [87]:
df['url']=df['full_text'].apply(lambda x:find_url(x))

In [88]:
df.head()

Unnamed: 0,created_at,full_text,lang,url
0,Mon Jun 01 00:00:00 +0000 2020,RT @LissomeLight: Oh my god these people reall...,en,
1,Mon Jun 01 00:00:00 +0000 2020,"RT @costareports: To some of the senators, Tru...",en,
2,Mon Jun 01 00:00:00 +0000 2020,RT @AngrierWHStaff: The venn diagram of the pe...,en,
3,Mon Jun 01 00:00:00 +0000 2020,RT @ArashKolahi: This country was infinity mor...,en,
4,Mon Jun 01 00:00:00 +0000 2020,I SCREAMED 😂😂 https://t.co/CuIPpcFTTr,en,https://t.co/CuIPpcFTTr


## Emoticon

In [89]:
def find_emoji(text):
    emo_text=emoji.demojize(text)
    line=re.findall(r'\:(.*?)\:',emo_text)
    return line

In [90]:
df['emoji']=df['full_text'].apply(lambda x: find_emoji(x))

In [91]:
df.head()

Unnamed: 0,created_at,full_text,lang,url,emoji
0,Mon Jun 01 00:00:00 +0000 2020,RT @LissomeLight: Oh my god these people reall...,en,,[]
1,Mon Jun 01 00:00:00 +0000 2020,"RT @costareports: To some of the senators, Tru...",en,,[]
2,Mon Jun 01 00:00:00 +0000 2020,RT @AngrierWHStaff: The venn diagram of the pe...,en,,[]
3,Mon Jun 01 00:00:00 +0000 2020,RT @ArashKolahi: This country was infinity mor...,en,,[]
4,Mon Jun 01 00:00:00 +0000 2020,I SCREAMED 😂😂 https://t.co/CuIPpcFTTr,en,https://t.co/CuIPpcFTTr,"[face_with_tears_of_joy, face_with_tears_of_joy]"


In [92]:
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [93]:
df['full_text']=df['full_text'].apply(lambda x: remove_emoji(x))

## Email

In [94]:
def find_email(text):
    line = re.findall(r'[\w\.-]+@[\w\.-]+',str(text))
    return ",".join(line)

In [95]:
df['email']=df['full_text'].apply(lambda x: find_email(x))

In [96]:
df.head()

Unnamed: 0,created_at,full_text,lang,url,emoji,email
0,Mon Jun 01 00:00:00 +0000 2020,RT @LissomeLight: Oh my god these people reall...,en,,[],
1,Mon Jun 01 00:00:00 +0000 2020,"RT @costareports: To some of the senators, Tru...",en,,[],
2,Mon Jun 01 00:00:00 +0000 2020,RT @AngrierWHStaff: The venn diagram of the pe...,en,,[],
3,Mon Jun 01 00:00:00 +0000 2020,RT @ArashKolahi: This country was infinity mor...,en,,[],
4,Mon Jun 01 00:00:00 +0000 2020,I SCREAMED https://t.co/CuIPpcFTTr,en,https://t.co/CuIPpcFTTr,"[face_with_tears_of_joy, face_with_tears_of_joy]",


## Hashtags

In [97]:
def find_hash(text):
    line=re.findall(r'(?<=#)\w+',text)
    return " ".join(line)

In [98]:
df['hash']=df['full_text'].apply(lambda x: find_hash(x))
df.head()

Unnamed: 0,created_at,full_text,lang,url,emoji,email,hash
0,Mon Jun 01 00:00:00 +0000 2020,RT @LissomeLight: Oh my god these people reall...,en,,[],,
1,Mon Jun 01 00:00:00 +0000 2020,"RT @costareports: To some of the senators, Tru...",en,,[],,
2,Mon Jun 01 00:00:00 +0000 2020,RT @AngrierWHStaff: The venn diagram of the pe...,en,,[],,
3,Mon Jun 01 00:00:00 +0000 2020,RT @ArashKolahi: This country was infinity mor...,en,,[],,
4,Mon Jun 01 00:00:00 +0000 2020,I SCREAMED https://t.co/CuIPpcFTTr,en,https://t.co/CuIPpcFTTr,"[face_with_tears_of_joy, face_with_tears_of_joy]",,


## Mentions

In [99]:
def find_at(text):
    line=re.findall(r'(?<=@)\w+',text)
    return " ".join(line)

In [100]:
df['at_mention']=df['full_text'].apply(lambda x: find_at(x))

In [101]:
df.head()

Unnamed: 0,created_at,full_text,lang,url,emoji,email,hash,at_mention
0,Mon Jun 01 00:00:00 +0000 2020,RT @LissomeLight: Oh my god these people reall...,en,,[],,,LissomeLight
1,Mon Jun 01 00:00:00 +0000 2020,"RT @costareports: To some of the senators, Tru...",en,,[],,,costareports
2,Mon Jun 01 00:00:00 +0000 2020,RT @AngrierWHStaff: The venn diagram of the pe...,en,,[],,,AngrierWHStaff
3,Mon Jun 01 00:00:00 +0000 2020,RT @ArashKolahi: This country was infinity mor...,en,,[],,,ArashKolahi
4,Mon Jun 01 00:00:00 +0000 2020,I SCREAMED https://t.co/CuIPpcFTTr,en,https://t.co/CuIPpcFTTr,"[face_with_tears_of_joy, face_with_tears_of_joy]",,,


## Cleaning the corpus

In [102]:
def clean_text(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = str(text).lower()
    text = re.sub(r'(?<=@)\w+', '', text)
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

In [103]:
df['full_text'] = df['full_text'].apply(lambda x:clean_text(x))

In [104]:
df.head()

Unnamed: 0,created_at,full_text,lang,url,emoji,email,hash,at_mention
0,Mon Jun 01 00:00:00 +0000 2020,rt oh my god these people really went from “m...,en,,[],,,LissomeLight
1,Mon Jun 01 00:00:00 +0000 2020,rt to some of the senators trump seemed obses...,en,,[],,,costareports
2,Mon Jun 01 00:00:00 +0000 2020,rt the venn diagram of the people saying “jus...,en,,[],,,AngrierWHStaff
3,Mon Jun 01 00:00:00 +0000 2020,rt this country was infinity more prepared to...,en,,[],,,ArashKolahi
4,Mon Jun 01 00:00:00 +0000 2020,i screamed,en,https://t.co/CuIPpcFTTr,"[face_with_tears_of_joy, face_with_tears_of_joy]",,,


In [106]:
df['full_text'] = df['full_text'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

In [107]:
df.head()

Unnamed: 0,created_at,full_text,lang,url,emoji,email,hash,at_mention
0,Mon Jun 01 00:00:00 +0000 2020,these people really went from “making stay hom...,en,,[],,,LissomeLight
1,Mon Jun 01 00:00:00 +0000 2020,some senators trump seemed obsessed with savin...,en,,[],,,costareports
2,Mon Jun 01 00:00:00 +0000 2020,venn diagram people saying “just comply with p...,en,,[],,,AngrierWHStaff
3,Mon Jun 01 00:00:00 +0000 2020,this country infinity more prepared against pe...,en,,[],,,ArashKolahi
4,Mon Jun 01 00:00:00 +0000 2020,screamed,en,https://t.co/CuIPpcFTTr,"[face_with_tears_of_joy, face_with_tears_of_joy]",,,


## tokenization

In [109]:
def tokenization(text):
    text = re.split('\W+', text)
    return text

df['tokenized'] = df['full_text'].apply(lambda x: tokenization(x.lower()))
df.head()

Unnamed: 0,created_at,full_text,lang,url,emoji,email,hash,at_mention,tokenized
0,Mon Jun 01 00:00:00 +0000 2020,these people really went from “making stay hom...,en,,[],,,LissomeLight,"[these, people, really, went, from, making, st..."
1,Mon Jun 01 00:00:00 +0000 2020,some senators trump seemed obsessed with savin...,en,,[],,,costareports,"[some, senators, trump, seemed, obsessed, with..."
2,Mon Jun 01 00:00:00 +0000 2020,venn diagram people saying “just comply with p...,en,,[],,,AngrierWHStaff,"[venn, diagram, people, saying, just, comply, ..."
3,Mon Jun 01 00:00:00 +0000 2020,this country infinity more prepared against pe...,en,,[],,,ArashKolahi,"[this, country, infinity, more, prepared, agai..."
4,Mon Jun 01 00:00:00 +0000 2020,screamed,en,https://t.co/CuIPpcFTTr,"[face_with_tears_of_joy, face_with_tears_of_joy]",,,,[screamed]


## remove stop words

In [111]:
nltk.download('stopwords')
stopword = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\srjhanwa\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [112]:
def remove_stopwords(text):
    text = [word for word in text if word not in stopword]
    return text
    
df['stopwords_removed'] = df['tokenized'].apply(lambda x: remove_stopwords(x))
df.head(10)

Unnamed: 0,created_at,full_text,lang,url,emoji,email,hash,at_mention,tokenized,stopwords_removed
0,Mon Jun 01 00:00:00 +0000 2020,these people really went from “making stay hom...,en,,[],,,LissomeLight,"[these, people, really, went, from, making, st...","[people, really, went, making, stay, home, tyr..."
1,Mon Jun 01 00:00:00 +0000 2020,some senators trump seemed obsessed with savin...,en,,[],,,costareports,"[some, senators, trump, seemed, obsessed, with...","[senators, trump, seemed, obsessed, saving, li..."
2,Mon Jun 01 00:00:00 +0000 2020,venn diagram people saying “just comply with p...,en,,[],,,AngrierWHStaff,"[venn, diagram, people, saying, just, comply, ...","[venn, diagram, people, saying, comply, police..."
3,Mon Jun 01 00:00:00 +0000 2020,this country infinity more prepared against pe...,en,,[],,,ArashKolahi,"[this, country, infinity, more, prepared, agai...","[country, infinity, prepared, people, defend, ..."
4,Mon Jun 01 00:00:00 +0000 2020,screamed,en,https://t.co/CuIPpcFTTr,"[face_with_tears_of_joy, face_with_tears_of_joy]",,,,[screamed],[screamed]
5,Mon Jun 01 00:00:00 +0000 2020,continua envergonhando história diplomacia bra...,pt,https://t.co/2LdPbIpwMs,[],,,,"[continua, envergonhando, história, diplomacia...","[continua, envergonhando, história, diplomacia..."
6,Mon Jun 01 00:00:00 +0000 2020,icymi clintons have killed more than coronavirus,en,https://t.co/00dDDRIs4o,[],,,blaze0497,"[icymi, clintons, have, killed, more, than, co...","[icymi, clintons, killed, coronavirus]"
7,Mon Jun 01 00:00:00 +0000 2020,they cant give money survive pandemic they can...,en,,[],,,HelpfulKraken,"[they, cant, give, money, survive, pandemic, t...","[cant, give, money, survive, pandemic, cant, f..."
8,Mon Jun 01 00:00:00 +0000 2020,unique like purple within purple state watch w...,en,,[],,,KcNightfire,"[unique, like, purple, within, purple, state, ...","[unique, like, purple, within, purple, state, ..."
9,Mon Jun 01 00:00:00 +0000 2020,covid socialdistancing rulesdon’t stand close ...,en,,[],,,RichLowry,"[covid, socialdistancing, rulesdon, t, stand, ...","[covid, socialdistancing, rulesdon, stand, clo..."


## Stemming and Lemmatization

In [113]:
ps = nltk.PorterStemmer()

def stemming(text):
    text = [ps.stem(word) for word in text]
    return text

df['stemmed'] = df['stopwords_removed'].apply(lambda x: stemming(x))
df.head()

Unnamed: 0,created_at,full_text,lang,url,emoji,email,hash,at_mention,tokenized,stopwords_removed,stemmed
0,Mon Jun 01 00:00:00 +0000 2020,these people really went from “making stay hom...,en,,[],,,LissomeLight,"[these, people, really, went, from, making, st...","[people, really, went, making, stay, home, tyr...","[peopl, realli, went, make, stay, home, tyrann..."
1,Mon Jun 01 00:00:00 +0000 2020,some senators trump seemed obsessed with savin...,en,,[],,,costareports,"[some, senators, trump, seemed, obsessed, with...","[senators, trump, seemed, obsessed, saving, li...","[senat, trump, seem, obsess, save, live, appea..."
2,Mon Jun 01 00:00:00 +0000 2020,venn diagram people saying “just comply with p...,en,,[],,,AngrierWHStaff,"[venn, diagram, people, saying, just, comply, ...","[venn, diagram, people, saying, comply, police...","[venn, diagram, peopl, say, compli, polic, sta..."
3,Mon Jun 01 00:00:00 +0000 2020,this country infinity more prepared against pe...,en,,[],,,ArashKolahi,"[this, country, infinity, more, prepared, agai...","[country, infinity, prepared, people, defend, ...","[countri, infin, prepar, peopl, defend, peopl,..."
4,Mon Jun 01 00:00:00 +0000 2020,screamed,en,https://t.co/CuIPpcFTTr,"[face_with_tears_of_joy, face_with_tears_of_joy]",,,,[screamed],[screamed],[scream]


In [115]:
nltk.download('wordnet')
wn = nltk.WordNetLemmatizer()

def lemmatizer(text):
    text = [wn.lemmatize(word) for word in text]
    return text

df['lemmatized'] = df['stemmed'].apply(lambda x: lemmatizer(x))
df.head()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\srjhanwa\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


Unnamed: 0,created_at,full_text,lang,url,emoji,email,hash,at_mention,tokenized,stopwords_removed,stemmed,lemmatized
0,Mon Jun 01 00:00:00 +0000 2020,these people really went from “making stay hom...,en,,[],,,LissomeLight,"[these, people, really, went, from, making, st...","[people, really, went, making, stay, home, tyr...","[peopl, realli, went, make, stay, home, tyrann...","[peopl, realli, went, make, stay, home, tyrann..."
1,Mon Jun 01 00:00:00 +0000 2020,some senators trump seemed obsessed with savin...,en,,[],,,costareports,"[some, senators, trump, seemed, obsessed, with...","[senators, trump, seemed, obsessed, saving, li...","[senat, trump, seem, obsess, save, live, appea...","[senat, trump, seem, ob, save, live, appear, p..."
2,Mon Jun 01 00:00:00 +0000 2020,venn diagram people saying “just comply with p...,en,,[],,,AngrierWHStaff,"[venn, diagram, people, saying, just, comply, ...","[venn, diagram, people, saying, comply, police...","[venn, diagram, peopl, say, compli, polic, sta...","[venn, diagram, peopl, say, compli, polic, sta..."
3,Mon Jun 01 00:00:00 +0000 2020,this country infinity more prepared against pe...,en,,[],,,ArashKolahi,"[this, country, infinity, more, prepared, agai...","[country, infinity, prepared, people, defend, ...","[countri, infin, prepar, peopl, defend, peopl,...","[countri, infin, prepar, peopl, defend, peopl,..."
4,Mon Jun 01 00:00:00 +0000 2020,screamed,en,https://t.co/CuIPpcFTTr,"[face_with_tears_of_joy, face_with_tears_of_joy]",,,,[screamed],[screamed],[scream],[scream]
