In [39]:
import pandas as pd 
# Load the tweet export (CSV in the working directory) into a DataFrame.
raw = pd.read_csv("raw-all-7-17-21.csv")
# Show (rows, columns) as a quick sanity check on the load.
raw.shape


(105223, 38)

In [40]:
# List the available columns so the ones worth keeping can be selected.
raw.columns

Index(['tweet_id', 'conversation_id', 'created_at', 'date', 'timezone',
       'place', 'tweet', 'language', 'hashtags', 'cashtags', 'user_id',
       'user_id_str', 'username', 'name', 'day', 'hour', 'link', 'urls',
       'photos', 'video', 'thumbnail', 'retweet', 'nlikes', 'nreplies',
       'nretweets', 'quote_url', 'search', 'near', 'geo', 'source',
       'user_rt_id', 'user_rt', 'retweet_id', 'reply_to', 'retweet_date',
       'translate', 'trans_src', 'trans_dest'],
      dtype='object')

In [41]:
# Remove irrelevant features
columns_keep = ['date', 'tweet', 'hashtags', 'username',
                'nlikes', 'nreplies', 'nretweets', 'search', 'place', 'language']
raw = raw[columns_keep]

# Drop irrelevant tweets by detecting keywords in the tweet text.
# The keywords are joined into a single regex alternation; matching is
# case-sensitive and substring-based.
# na=False counts a missing tweet as "no match", so the mask is purely boolean
# and can always be inverted with `~` (a NaN in the mask would raise).
words = ['Johor', 'Malaysia', 'Sects', 'depression tribe', 'drkshdw', 'sects', 'valimai', 'myanmar']
raw = raw[~raw.tweet.str.contains('|'.join(words), na=False)]

# Drop irrelevant tweets by detecting keywords in the hashtags column
# (same substring matching; exact duplicate keywords removed from the list).
words = ['july6coup', 'dreadfulcovidamidmilitarycoup', 'whatshappeninginmyanmar',
         'july3revilestrike', 'july4coup',
         'june29coup', 'myanmar', 'june30coup', 'feb9coup', 'feb8coup', 'herethevoiceofmyanmar',
         'savemyanmar', 'myanmarsnsfreedom', 'coup7feb', 'weneeddemocracy', 'fightfordemocracy',
         "againstmyanmarmilitarycoup", "ambarazahrah"]
raw = raw[~raw.hashtags.str.contains('|'.join(words), na=False)]
raw.shape

(98674, 10)

In [42]:
# Remove duplicate rows
# Step 1: rows identical in every column are collapsed to a single copy.
# Step 2: keep=False discards *every* row whose tweet text still occurs more
#         than once -- no representative copy is kept.
# Results are reassigned rather than using inplace=True: `raw` was produced by
# boolean filtering, and mutating such a frame in place is what pandas may flag
# with SettingWithCopyWarning; reassignment also keeps the cell re-runnable.
raw = raw.drop_duplicates()
raw = raw.drop_duplicates(subset="tweet", keep=False)
raw.shape

(78378, 10)

In [43]:
def load_data(fp=raw):
    """Prepare the tweet frame and return its English-language rows.

    NOTE: ``fp`` is modified in place, deliberately. ``date`` is converted to
    datetime, ``latitude``/``longitude`` are added and ``place`` is dropped on
    the very object passed in; later cells read ``raw["date"].dt`` and drop
    ``latitude``/``longitude`` from ``raw``, so this must not operate on a copy.
    The default value is whatever ``raw`` refers to when this ``def`` executes.

    Parameters
    ----------
    fp : pandas.DataFrame
        Frame with at least ``date`` and ``language`` columns. If a ``place``
        column is present, each non-null value is parsed as a Python literal
        and its ``'coordinates'`` entry is read.

    Returns
    -------
    pandas.DataFrame
        The rows of ``fp`` whose ``language`` is ``'en'``, with ``language``
        renamed to ``lang`` and ``search`` to ``search_keyword`` (an ``id``
        column, if one exists, becomes ``tweet_id``).
    """
    import ast  # local import keeps the cell self-contained

    def _coordinate(place, position):
        # Missing places are passed through unchanged so they stay null.
        if pd.isnull(place):
            return place
        # literal_eval only accepts Python literals; the previous eval() would
        # have executed arbitrary code embedded in the CSV.
        return ast.literal_eval(place)['coordinates'][position]

    df = fp
    df['date'] = pd.to_datetime(df['date'])
    # Guarded so the function can be re-run after `place` has been consumed.
    if 'place' in df.columns:
        df['latitude'] = df['place'].map(lambda place: _coordinate(place, 0))
        df['longitude'] = df['place'].map(lambda place: _coordinate(place, 1))
        # `columns=` instead of a positional axis: pandas 2.0 no longer accepts
        # drop('place', 1). inplace is intentional here (see NOTE above).
        df.drop(columns='place', inplace=True)
    return (df.loc[df.language == 'en']
              .rename(columns=dict(language='lang', id='tweet_id',
                                   search='search_keyword')))
# Called for its side effects on `raw` (datetime `date`, added latitude/longitude,
# `place` dropped); the returned English-only frame is displayed but not stored.
load_data()

  


Unnamed: 0,date,tweet,hashtags,username,nlikes,nreplies,nretweets,search_keyword,lang,latitude,longitude
1,2019-12-29 13:54:00,The best depression killer is worshipping Jesu...,[],JesusDailyTwits,1,0,0,depression,en,,
2,2019-12-29 12:47:00,I hope with the blessings of the majlis my dep...,[],Aluvrendar,67,3,6,depression,en,,
3,2019-12-29 12:33:00,@kowey It feels like a theory of psychohistory...,[],mengwong,0,2,0,depression,en,,
4,2019-12-28 18:33:00,Post Project Depression. Cant Believe That 12 ...,[],haziqqqaaahzik,10,1,8,depression,en,,
8,2019-12-26 02:16:00,Depression thought ????,[],NASYRANN_,0,1,3,depression,en,,
...,...,...,...,...,...,...,...,...,...,...,...
105217,2021-01-28 07:44:00,STPI Honours Frontline Healthcare and Social W...,[],russelwongphoto,0,0,0,frontline,en,,
105218,2021-01-26 15:02:00,Your Covid care speech drove us to be frontlin...,[],ShaheerBirdieFC,7,1,0,frontline,en,,
105219,2021-01-23 12:41:00,"I know it's still early days, but India seems ...",[],rishabhm,0,0,0,frontline,en,,
105220,2021-01-16 15:50:00,PM gets emotional while speaking about invalua...,[],Erongodath,0,0,0,frontline,en,,


In [44]:
import pandas as pd
import numpy as np
import re
import string 
from stop_words import get_stop_words
from nltk.corpus import stopwords
# Build one combined English stop-word list from the `stop_words` package and
# NLTK; entries present in both sources appear twice in the list.
stop_words = list(get_stop_words('en'))
nltk_words = list(stopwords.words('english'))
stop_words.extend(nltk_words)
from contraction_map import CONTRACTION_MAP

In [46]:
# Clean thoroughly 
def remove_hyperlinks(text):
    """Remove URLs from ``text`` and return the remaining string.

    Three shapes are removed (case-insensitively):
      * ``http://`` / ``https://`` links, up to the next whitespace
        (so query strings go too),
      * links starting with ``www.``,
      * bare ``something.com`` domains with an optional ``/path``.

    Dots are matched literally and word boundaries are required, so ordinary
    words that merely contain "com" (``welcome``, ``overcome``, ``income``)
    are left intact.
    """
    ptn = (r'\bhttps?://\S+'
           r'|\bwww\.\S+'
           r'|\b[\w-]+(?:\.[\w-]+)*\.com\b(?:/\S*)?')
    return re.sub(ptn, '', text, flags=re.IGNORECASE)

def remove_mentions(text):
    """Remove ``@mentions`` (and stray ``@`` signs) from ``text``.

    The ``@``, the word characters that follow it and one trailing space (if
    present) are removed. This works wherever the mention sits -- start,
    middle, end of the text, or directly before punctuation -- so the user
    name is never left behind glued to the neighbouring word.
    """
    ptn = r'@\w* ?'
    return re.sub(ptn, '', text)

def remove_stopwords(text, stopword_list=None):
    """Drop stop words from ``text``.

    ``text`` is split on whitespace; tokens that exactly match (case-sensitive)
    an entry of the stop-word collection are discarded and the survivors are
    re-joined with single spaces.

    Parameters
    ----------
    text : str
        Text to filter.
    stopword_list : iterable of str, optional
        Stop words to remove. Defaults to the module-level ``stop_words``.

    Returns
    -------
    str
        The filtered text.
    """
    if stopword_list is None:
        stopword_list = stop_words
    # A set gives constant-time membership tests instead of scanning the whole
    # list once per token.
    blocked = set(stopword_list)
    return ' '.join(token for token in text.split() if token not in blocked)

def remove_punctuations(text):
    """Replace every character that is not an ASCII letter with a space.

    Punctuation, digits, whitespace other than the space itself and non-ASCII
    characters are all turned into single spaces; the length of the string is
    unchanged. Because nothing but letters and spaces remains afterwards, no
    separate punctuation-stripping pass is required.
    """
    letters = string.ascii_letters
    return ''.join(char if char in letters else ' ' for char in text)

def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    """Expand contractions in ``text`` and strip any apostrophes left over.

    Parameters
    ----------
    text : str
        Text to process.
    contraction_mapping : dict
        Maps a contraction to its expanded form. Keys are matched
        case-insensitively and as literal text.

    Returns
    -------
    str
        ``text`` with every matched contraction replaced by its expansion (the
        first character of the original match is kept, preserving its case)
        and with all remaining ``'`` characters removed.
    """
    # Longest keys first: regex alternation takes the first alternative that
    # matches, so a short key must not pre-empt a longer key it is a prefix of.
    # Empty keys are skipped; they would match at every position.
    keys = sorted((key for key in contraction_mapping if key), key=len, reverse=True)
    if keys:
        # Case-folded view of the mapping, used when the matched text differs
        # in case from every key (matching itself is case-insensitive).
        lowercase_lookup = {key.lower(): contraction_mapping[key] for key in keys}
        contractions_pattern = re.compile(
            '({})'.format('|'.join(re.escape(key) for key in keys)),
            flags=re.IGNORECASE | re.DOTALL)

        def expand_match(contraction):
            match = contraction.group(0)
            expanded_contraction = (contraction_mapping.get(match)
                                    or contraction_mapping.get(match.lower())
                                    or lowercase_lookup.get(match.lower()))
            if not expanded_contraction:
                # No usable expansion: leave the text as it was instead of
                # failing on a missing value.
                return match
            return match[0] + expanded_contraction[1:]

        text = contractions_pattern.sub(expand_match, text)
    return re.sub("'", "", text)
    
def remove_alphabets(text):
    """Delete stand-alone single ASCII letters from ``text``.

    A letter is removed only when it has a word boundary on both sides; the
    whitespace around it is left in place.
    """
    lone_letter = re.compile(r"\b[a-zA-Z]\b")
    return lone_letter.sub("", text)

def remove_digits(text):
    """Return ``text`` without any character for which ``str.isdigit`` is true."""
    kept = []
    for char in text:
        if char.isdigit():
            continue
        kept.append(char)
    return ''.join(kept)

def clean_text(text, hyperlink=True, mention=True, stopwords=True,
               punctuations=True, contractions=True, digits=True,
               lowercase=True, alphabets=True):
    """Run the tweet-cleaning steps on ``text`` and return the cleaned string.

    Each keyword argument switches one step on or off. Steps run in this order:
    lowercase -> hyperlinks -> mentions -> contractions -> punctuation ->
    stop words -> single letters -> digits.

    Contractions are expanded *before* punctuation is stripped: the punctuation
    step turns every apostrophe into a space, after which no contraction could
    be recognised any more.

    Note that the ``stopwords`` parameter shadows the ``nltk.corpus.stopwords``
    import inside this function; only its truthiness is used here.
    """
    if lowercase:  # Transform to lowercase
        text = text.lower()
    if hyperlink:  # Remove hyperlinks
        text = remove_hyperlinks(text)
    if mention:  # Remove mentions
        text = remove_mentions(text)
    if contractions:  # Expand contractions while apostrophes are still present
        text = expand_contractions(text)
    if punctuations:  # Remove punctuation (everything that is not a letter)
        text = remove_punctuations(text)
    if stopwords:  # Remove English stop words
        text = remove_stopwords(text)
    if alphabets:  # Remove single alphabets
        text = remove_alphabets(text)
    if digits:  # Remove all numbers
        text = remove_digits(text)
    return text

# Run the cleaning pipeline (all steps enabled, i.e. clean_text defaults) on
# every tweet and store the result in a new column.
raw['clean_tweet4'] = raw.tweet.map(clean_text)


In [71]:
# Keep English tweets only. .copy() makes the filtered frame independent, so
# the column assignments below do not write into a slice of the previous frame.
raw = raw.loc[raw.language == "en"].copy()

# Calendar features. The .dt accessor requires `date` to already be a datetime
# column (load_data converts it in place on `raw`).
raw["year"] = raw["date"].dt.year
raw["month"] = raw["date"].dt.month
raw

Unnamed: 0,date,tweet,hashtags,username,nlikes,nreplies,nretweets,search,language,latitude,longitude,clean_tweet4,year,month
1,2019-12-29 13:54:00,The best depression killer is worshipping Jesu...,[],JesusDailyTwits,1,0,0,depression,en,,,best depression killer worshipping jesus try v...,2019,12
2,2019-12-29 12:47:00,I hope with the blessings of the majlis my dep...,[],Aluvrendar,67,3,6,depression,en,,,hope blessings majlis depression never return ...,2019,12
3,2019-12-29 12:33:00,@kowey It feels like a theory of psychohistory...,[],mengwong,0,2,0,depression,en,,,feels like theory psychohistory ing together h...,2019,12
4,2019-12-28 18:33:00,Post Project Depression. Cant Believe That 12 ...,[],haziqqqaaahzik,10,1,8,depression,en,,,post project depression cant believe days ende...,2019,12
8,2019-12-26 02:16:00,Depression thought ????,[],NASYRANN_,0,1,3,depression,en,,,depression thought,2019,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105217,2021-01-28 07:44:00,STPI Honours Frontline Healthcare and Social W...,[],russelwongphoto,0,0,0,frontline,en,,,stpi honours frontline healthcare social worke...,2021,1
105218,2021-01-26 15:02:00,Your Covid care speech drove us to be frontlin...,[],ShaheerBirdieFC,7,1,0,frontline,en,,,covid care speech drove us frontline warriors ...,2021,1
105219,2021-01-23 12:41:00,"I know it's still early days, but India seems ...",[],rishabhm,0,0,0,frontline,en,,,know still early days india seems administerin...,2021,1
105220,2021-01-16 15:50:00,PM gets emotional while speaking about invalua...,[],Erongodath,0,0,0,frontline,en,,,pm gets emotional speaking invaluable contribu...,2021,1


In [75]:
# Drop the columns that are not part of the final dataset, then write the
# result to disk (the index is written too, as to_csv does by default).
unused_columns = ["latitude", "longitude", "language", "tweet"]
clean_data = raw.drop(columns=unused_columns)
clean_data.to_csv("final_clean_data.csv")

# Lemmatization

In [83]:
import spacy 
from nltk.tokenize.toktok import ToktokTokenizer
import nltk
#nlp = spacy.load('en_core')
#nlp_vec = spacy.load('en_vecs', parse = True, tag=True, entity=True)
# Toktok tokenizer instance; it is not referenced by the other cells visible here.
tokenizer = ToktokTokenizer()

# spaCy pipeline that lemmatize_text calls on each tweet.
nlp = spacy.load('en_core_web_sm')


In [84]:
def lemmatize_text(text):
    """Return ``text`` with each token replaced by its lemma, space-joined.

    The text is parsed with the module-level ``nlp`` pipeline. A token whose
    lemma is the ``'-PRON-'`` placeholder keeps its original surface form.
    """
    lemmas = []
    for token in nlp(text):
        lemma = token.lemma_
        lemmas.append(token.text if lemma == '-PRON-' else lemma)
    return ' '.join(lemmas)

# Lemmatize every cleaned tweet (one nlp() call per row) into a new column.
clean_data["parse_tweet"] = clean_data.clean_tweet4.map(lemmatize_text)

In [86]:
# Persist the lemmatized dataset; with to_csv defaults the index is written as
# the first, unnamed column.
clean_data.to_csv("final_lemmatized_data.csv")