In [1]:
import os 
import string
import re
from pathlib import Path
# numpy and pandas for data manipulation
import numpy as np
import pandas as pd 
pd.options.mode.chained_assignment = None # no warnings
# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns
# tqdm
from tqdm import tqdm
# nltk
import nltk
from nltk.corpus import stopwords
# sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity # to calculate the similarities between keyword candidates and text
# pycountry
import pycountry
# contractions
import contractions
# spacy - library for advanced natural languages processing
import spacy 
import importlib
import subprocess
import sys


def _load_spacy_model(name):
    """
        Description: Load a spaCy pipeline, installing it first when it is missing.

        :param name [string]: full pipeline package name (e.g. "en_core_web_sm");
                              the short aliases such as "en" are rejected by spaCy v3
        :return: the loaded spaCy pipeline
        :raises subprocess.CalledProcessError: when the download command fails
    """
    try:
        return spacy.load(name)
    except OSError:
        # spacy.load raises OSError when the package is not installed.
        # Download with the interpreter running this kernel (a bare `python`
        # may resolve to another environment) and fail loudly on errors.
        subprocess.run([sys.executable, "-m", "spacy", "download", name], check=True)
        # make the freshly installed package importable in this process
        importlib.invalidate_caches()
        return spacy.load(name)


nlp = _load_spacy_model("en_core_web_sm") # used for keywords filling
nlp_wk = _load_spacy_model("xx_ent_wiki_sm") # used for locations filling
# sentenceTransformer
from sentence_transformers import SentenceTransformer # For sentence embeddings
# import spellchecker
from spellchecker import SpellChecker
# import tensorflow, tensorflow_hub, keras
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow import keras
os.environ['TFHUB_CACHE_DIR'] = f'{str(Path.home())}/workspace/tf_cache' # path for downloading pre traind model
import queue
from threading import Thread

In [2]:
# Load the train and test CSV files with pandas.
# The `id` column is removed from the train set straight away because it is not
# needed for training the model; the test set keeps its `id` column.
train = pd.read_csv('./data/train.csv').drop(columns=['id'])
test = pd.read_csv('./data/test.csv')

In [3]:
# Embedding models already loaded by find_similarity, keyed by their TF-Hub handle,
# so that repeated calls do not reload the same model.
_EMBEDDING_MODELS = {}


def find_similarity(text_a,
                    text_b,
                    model="https://tfhub.dev/google/tf2-preview/nnlm-en-dim128/1",
                    nu_of_keywords=5):
    """
        Description: Find the candidate of text_b that is most similar to text_a,
                     using the cosine similarity of embeddings from a pre-trained model

        :param text_a [string]: reference text
        :param text_b [list]: candidate texts; entries that are not strings
                              (e.g. NaN) are ignored
        :param model [string]: TF-Hub handle of the embedding model
        :param nu_of_keywords [int]: size of the short-list of top candidates; kept for
                                     backward compatibility, the best candidate is the
                                     same whatever the short-list size
        :return: the most similar candidate, or None when text_a is not a string
                 or there is no usable candidate
    """

    # keep only candidates that can be embedded
    candidates = [candidate for candidate in (text_b if text_b is not None else [])
                  if isinstance(candidate, str)]
    if not isinstance(text_a, str) or not candidates:
        return None

    # load pretrained model for similarity (only once per handle)
    embed = _EMBEDDING_MODELS.get(model)
    if embed is None:
        embed = _EMBEDDING_MODELS[model] = hub.load(model)

    text_embedding = embed([text_a])
    results_embeddings = embed(candidates)

    # cosine similarity between the text and every candidate: a single row in
    # which a HIGHER value means a closer match
    similarities = cosine_similarity(text_embedding, results_embeddings)[0]

    # the closest candidate is the one with the highest similarity
    return candidates[int(np.argmax(similarities))]

In [4]:
# Shared SpellChecker, created the first time spelling correction is requested
# (building one for every tweet is needlessly expensive).
_SPELL_CHECKER = None


def _correct_spellings(text, spell=None):
    """
        Correct the misspelled words of a given tweet

        :param text: text to correct, split on whitespace
        :param spell: spell checker to use; defaults to the shared SpellChecker
    """
    global _SPELL_CHECKER

    if spell is None:
        if _SPELL_CHECKER is None:
            _SPELL_CHECKER = SpellChecker()
        spell = _SPELL_CHECKER

    words = text.split()
    misspelled = spell.unknown(words)
    # correction() can return None when no candidate is found (recent
    # pyspellchecker releases); keep the original word in that case
    result = ((spell.correction(word) or word) if word in misspelled else word
              for word in words)

    return " ".join(result)


def text_formatting(text,
                    correct_spelling=True,
                    remove_emojis=True,
                    remove_stop_words=True):
    """
        Description: Apply function to clean a given text.

        The text is lower-cased, then urls and html tags are removed, contractions
        are expanded and punctuation is stripped, before the optional steps below.

        :param text: text to clean
        :param correct_spelling: replace misspelled words with their suggested correction
        :param remove_emojis: drop every non-ASCII character (emojis included)
        :param remove_stop_words: drop the words found in the spaCy stop-word list

        Example: 
        input: 'Barbados #Bridgetown JAMAICA ÛÒ Two cars set ablaze: SANTA CRUZ ÛÓ Head of the St Elizabeth Police Superintende...  http://t.co/wDUEaj8Q4J'
        output: 'barbados bridgetown jamaica cars set ablaze santa cruz head elizabeth police superintend'
    """

    text = text.lower().strip()

    # remove urls
    text = re.compile(r'https?://\S+|www\.\S+').sub(r'', text)

    # remove html tags
    text = re.compile(r'<.*?>').sub(r'', text)

    # using contractions.fix to expand the shortened words
    text = contractions.fix(text).lower().strip()

    # remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # check for spelling errors
    if correct_spelling:
        text = _correct_spellings(text)

    # remove emojis if any (everything outside ASCII is dropped)
    if remove_emojis:
        text = text.encode('ascii', 'ignore').decode('utf8').strip()

    # collapse runs of spaces
    text = re.sub(' +', ' ', text)

    # remove stop words (Examples of stop words in English are "a", "the", "is", "are" and etc)
    if remove_stop_words:
        text = ' '.join([word for word in text.split(' ') if word not in nlp.Defaults.stop_words])

    return text

In [None]:
# Data preprocessing, in three steps:
# [1] Tweet: correct spelling, remove emojis, and remove stop words
# [2] Keyword: fill missing keywords for certain tweets following specific scenarios
# [3] Location: fill missing locations for certain tweets following specific scenarios

# Step [2] needs every distinct keyword value found in the training data
train_keywords = pd.unique(train['keyword'])

def _thread_func(q, result):
    """
    Threaded function for queue processing
    """
    while not q.empty():
        # Fetch new work from the Queue
        work = q.get()
        try:
            # index, keyword, location, tweet, target
            i, k, l, tw, ta = work[0], work[1], work[2], work[3], work[4]     

            # Step 1 - tweet text formatting
            tw = text_formatting(tw)

            # Step 2 - keyword formatting
            if pd.isnull(k):
                # keyword is empty, so search for a keyword within the tweet itself, 
                # if no keyword is found then fill with NaN
                keyword = extract_keyword(tweet=tw, keywords=train_keywords)
                k = keyword if keyword else "NaN" 

            # Step 3 - location formatting
            if pd.isnull(l):
                # location is empty, so search for a location within the tweet itself, 
                # if no location is found then fill with NaN 
                location = get_location(tw)
                l = location if location else "NaN"     
            else:
                # location is not empty, so first make sure there's no a location within the tweet itself
                location_tweet, location_legit = get_location(tw), get_location(l)
                # scenarios:
                # - location found within the tweet, so overwrite the value under location with it
                # - location is not found within the tweet, so make sure the given location not some garbage text
                # - location is not legit, replace it with NaN
                l = location_tweet if location_tweet else location_legit if location_legit else "NaN"

            # Store data back at correct index
            result[i] = (k, l, tw, ta)
        except:
            result[i] = {}
        
        # Signal to the queue that the task has been processed
        q.task_done()
    
    return True
 
# One row (a pandas Series) per training tweet, in the order of the frame
all_raw_tweets = [train.iloc[i] for i in range(len(train))]

# Set up a queue to hold all the tweets
q = queue.Queue(maxsize=0)

# Use many threads (50 max, or one for each tweet)
num_threads = min(50, len(all_raw_tweets))

# One result slot per tweet; the workers replace slot i with the processed row
# NOTE(review): the visible cells never copy `tweets` back into `train`, so the
# later cells appear to work on the unprocessed data — confirm this is intended.
tweets = [{} for _ in all_raw_tweets]

# Load up the queue with the raw text to get the formatted version of each one
for i, row in enumerate(all_raw_tweets):
    # extract keyword, location, tweet, and target, and put as a queue item with id.
    # .iloc makes the access explicitly positional: plain row[0] on a Series
    # indexed by column names is deprecated in recent pandas.
    k, l, tw, ta = row.iloc[0], row.iloc[1], row.iloc[2], row.iloc[3]
    q.put((i, k, l, tw, ta))

# Starting worker threads on queue processing
for i in range(num_threads):
    # Setting threads as "daemon" allows main program to exit eventually even if these dont finish correctly.
    # (the daemon argument replaces the deprecated Thread.setDaemon())
    worker = Thread(target=_thread_func, args=(q, tweets), daemon=True)
    # Start worker thread
    worker.start()

# Wait until the queue has been processed
q.join()

residents asked shelter place notified officers evacuation shelter place orders expected
emergency evacuation happening building street
flood disaster heavy rain causes flash flooding streets manitou colorado springs areas
got sent photo ruby alaska smoke wildfires pours school
deeds reason earthquake allah forgive
haha south tampa getting flooded hah wait second live south tampa going going fuck flooding
13000 people receive wildfires evacuation orders california
forest fire near la range ask canada
car fast
damage school bus 80 multi car crash breaking
flood ago myanmar arrived ago
hill fire woods
love fruits
afraid tornado coming area
london cool
people died heat wave far
wonderful day
ridiculous
wayi eat shit
summer lovely
ine office asia set ablaze
man
ny week
try bring heavy metal
outside ablaze alive dead inside
love skiing
crying set ablaze
plus look sky night ablaze
like pastalove girlfriend

end
cool
ablaze lord
rockyfire update california 20 closed directions lake county fir

INFO:absl:Using /home/guinzburg/workspace/tf_cache to cache modules.


wanted set chicago ablaze preaching hotel
awesome time visiting cfo head office ancor site ablaze thanks tita vida taking care
check
night retainers weird better wear single night year
bbcmtd wholesale markets ablaze
ooooooh
building perfect blacklist life leave streets ablaze
phdsquares muff built hype new acquisitions doubt set el ablaze season
barbados bridgetown jamaica cars set ablaze santa cruz head elizabeth police superintend
goooooooaaaaaal
soooo pumped ablaze southridgelife
santa cruz head elizabeth police superintendent sanford salmon
africanbaze breaking newsnigeria flag set ablaze aba
deputies man shot brighton home set ablaze
man wife years jail setting ablaze niece
sky ablaze tonight los angeles expecting ig fu filled sunset shots know peepsprogressive greetings month students set pens ablaze torch publications

accident knew gon happen
west burned thousands wildfires ablaze california climate energy
set hearts ablaze city gift skyline like kiss lips
kids cuz got bicycle

feared killed pakistani air ambulance helicopter crash reuters yovani
omg believe rip bro airplane accident jetengine turbojet boing
pakistan air ambulance helicopter crash kills
ambulance sprinter automatic frontline vehicle choice 14 lez compliant ebay
feared killed pakistani air ambulance helicopter crash
horrible accident man died wings airplane29072015
news feared killed pakistani air ambulance helicopter crash pillow dna
tanslash waiting ambulance
feared killed pakistani air ambulance helicopter crash
ambulance sprinter automatic frontline vehicle choice 14 lez compliant ebay
ambulance sprinter automatic frontline vehicle choice 14 lez compliant ebay
fouseytube ok need ambulance hahaha good
horrible accident man died wings airplane 29072015 wif cant believe eyes
reuters feared killed pakistani air ambulance helicopter crash
feared killed pakistani air ambulance helicopter crash
feared killed pakistani air ambulance helicopter crash
know way ambulance coming lilt
ambulance right o

phone spying hidden door nsa data mining software financial armageddon blog
plan temporary300 abyss armageddon kill flags fast reason
gods kingdom heavenly govt rule people earth armageddon
armageddon
katiekatcubs know shit goes world series armageddon
janenelson097 stephenscifi adaptation watch charlie humans apocalypse optioned film sciencefiction
ecker eep thought yesterday saw hella scary hail armageddon
latest bryansinger reveals storm queen apocalypse rural alexshipppp
news ben affleck know wifekids girls help loved armageddon eonlinechat
photo sketch based taste armageddon episode started tos
sadly windows 10 reveals microsoft ethics armageddon
karnythia niece gaining ability stand getting prepared toddler apocalypse armageddon
plan temporary300 russaky89 armageddon kill flags fast reasonchristians united israel cuff jews convert soon die armageddon

european fit till christmas armageddon
lets talk goof guild saunders come right stage
check preppers doomsday library collection a

credit pfannebeckers inspiring rediscover fantabulous tit
left hand diamond graveyard shift attack defend right handsome fucking idiot
israeli forces raid home alleged car attack suspect palestine
horrific attack wife muslim italy liveleak news
localarsonist lao real live
arsonist arrested setting fires watch tonights headlines nightbeat veronicadlcruz 2minutemixvolleyball attack ii volleyball training machine sets simulation

suspect latest theatre attack psychological issuesgoing lie kind ready attack senior year

stay vigilant civil liberties constant attack nativehuman religion
new post darkreading new sub relay attack steals user credentials internet
notes tactful direct response harpers attack albert's govt hell yeah premier able cannoli
strongly condemn attack ary news team karachi cowardly act simply trying job
nashville theater attack gun grabbers demand hatchet control
fact atomic bombs called little boy fat man says lot mentality went attack
caixxum5sos thanks damn heart att

fedex longer transport bioterror germs wake anthrax lab mishaps
house energy amparo commerce subcommittee hold 728 hearing cac oversight bioterror labs army anthrax mishaps htt
fedex longer transport bioterror germs wake anthrax lab mishaps usatoday
fedex longer ship potential bioterror pathogens fedex corp nose fix longer deliver packages
fedex longer ship potential bioterror pathogens atlbizchron
jax biz journal fedex stops shipping potential bioterror pathogens
thank fedex longer shipping live microbes department defense
fedex stops shipping potential bioterror pathogens trucking
lonepine remembered australia descendants grow 666canberra gallipoli wwe
mathis problem researchers fedex longer transport select agents usatoday
fedex longer transport bioterror germs usatoday
fedex longer transport bioterror germs wake anthrax lab mishaps
fedex longer transport bioterror germs wake anthrax lab mishaps
fight bioterrorism sir
biolab safety concerns grow fedex stops transporting certain spec

thinking stepped broken glass pun tak sedan feel pain bleeding shit
follow edwelchmusic check hit single unpacked man blazing
cute dinner date til cams nose starts bleeding
bleeding silence feel safer violence
funkylilshack mariaf30 want bitch slapping guns blazing cake throwing charles showdown worth waitdeadpool favourite marvel characters know wears red suit bad guys tell bleeding

coreyashe look broken bleedingdarrylb1979 yea heard whatnot coming 2017 2019 vampire bleeding

jaydennotjared help hope ok text need talk sending hugs way bleeding death allowed
kingnaruto long radar bleeding good
ear started bleeding
let good soccer bleeding yo face
ears bleeding hate stefano
jannet2208 fell hit head concrete bleeding shit
uptownjorge head like yo nose bleedingbad things happen reason wise words going stop te bleeding

artisteoftheweekfact conversations coast2coastdjs agree crystalblaz jiwonle hiphop clubbanger
sodamntrue know bleeding heart wannabe pickup artist
beckyfeigin defs stops b

sethalphaeus personal favorites include paramour muse green day royal blood sos
weekends bloody mary times summers new
ronda rouse close making floyd mayweather money 50 fights bloody elbow boxing
need life sin girlfriend ride till bloody end girlfriend
looooooooooool bloody hell
eh hello cover bloody thighs bloody clear eh hello expose cleavage
fantosex suck bloody getting means amends
bloody mary sink beet juice
meet bloody rse
itsmegss think bloody barking
black friday turns bloody shopping mystery
believe people mid high blood pressure life stressful decisionsondecisions
friday supposed happy day bloody friday hah jazz
mrtophypup bloody sexy drools
entered win entire set butterlondon lip crayons beams enter bloggers
marlon williams elvis presley marlon williams steel panther shuffle mode like bloody legend
lauradeholanda forrest version 83 bloody awful
slsandpet hey sally sorry emailed awol bloody work argh resigninshame
aunt marge blown heks
tradcatknight 1 russia played reason li

aubrey bodybagging meek
broseidonrex dapurplesharpie skimmed twitter missed body bagging
slikrickdarula drake body bagging peeps man bout
womens cross body messengers bags clutch small shoulders zippers bags white
mom 2015 summer contrast candy color topknot cross body tote shoulder bags homegirls mom zip po
foxes jengriffinfnc report dangerous activity tell body bags arrival
update va firefighter administrative leave facebook post calls people officers body bags
macdaddyleo caption needed freshman nigga stacey body bagging niggas
womens satchel lattice chain studded cross body multi colour shoulder bags blue
lab today ready body bags
deeeznvtzzz bring body bags tho
bitches takin pics bags bigger body
womens tote faux leather handbags shoulder cross body handle bags rose
basked body bags
womens buckle casual stylish shoulder handbags pockets cross body bags green
mzgraciebaby record jumpin window early got officialrealrap body bagging luck lol save file
womens handbags cross body geome

japan thursday marks anniversary atomic bombing hiroshima senior official washington schedule
giant cranes holding bridge collapse nearby homes
sioux city fire officials believe bridge collapse lead cement truck roll siouxland matters siouxlan
2 injured 1 missing bridge collapse central mexico fox news latino
japan marks anniversary hiroshima atomic bombing generalnewsgiant cranes holding bridge collapse nearby homes

setting4success bells toll hiroshima japan marks 70 years atomic bombing news smallbusiness entrepreneur
giant cranes holding bridge collapse nearby homes
australian ashes disaster collapse unfolded trent bridge telegraph
swiss kosher hotel bridge collapse injures people jewishness
giant cranes holding bridge collapse nearby homes
giant cranes holding bridge collapse nearby homes
knew relationship kansas city hyatt bridge collapse amp alias cote huffpostarts
ashes 2015 australia collapse trent bridge twitter reacted
listening blowers suffers aussie batting collapse trent 

chicken nuggets microwave 5 minutes instead 1 accident fucking burned
joked wood burned working innings cubital
metal cutting sparks brush fire brighton brush fire sparked landowner cutting metal burned 10
stars burning voice mind
bitches walking like hot shit amp got bed bugs amp burning
burned 129 calories 24 minutes walking 35 mah brisk pace myfitnesspal
look silver lining barn having burned ground moon minute
watch bad fool burned coverage year dat dude allure practice squad material
kennethbauer like coffee noodles burned
burned 202 calories 30 minutes walking 40 mah brisk pace myfitnesspal
bored life morning burning desire things goals lou holt
bulletproof black like funeral world burning cold
burning legion returned
leader zionism stop burning babies
band buildings fire playing bbcintroducing purpleturtlerdg wednesday giantgiantsound rigburning like neon lights

501 sky news tandem totteham going maddddd burning fed cars dem ting dere
raining outside burning favorite candle hot 

In [None]:
# # Preprocess data:
# # Steps:
# # [1] Format tweet: correct spelling, remove emojis, and remove stop_words
# # [2] Format keyword: fill missing keywords for certain tweets following specific scenarios
# # [3] Format location: fill missing locations for certain tweets following specific scenarios

# # get all available keywords from the data (unique values) for step [2]
# train_keywords = train['keyword'].unique()

# for i in range(len(train["text"])):
#     # Step 1 - tweet text formatting
#     train['text'].iloc[i] = text_formatting(train['text'].iloc[i])
    
#     # Step 2 - keyword formatting
#     if pd.isnull(train['keyword'].iloc[i]):
#         # keyword is empty, so search for a keyword within the tweet itself, if no keyword is found then fill with NaN
#         keyword = extract_keyword(tweet=train['text'].iloc[i], keywords=train_keywords)
#         train['keyword'].iloc[i] = keyword if keyword else "NaN" 
      
#     # Step 3 - location formatting
#     if pd.isnull(train['location'].iloc[i]):
#         # location is empty, so search for a location within the tweet itself, if no location is found then fill with NaN 
#         location = get_location(train['text'].iloc[i])
#         train['location'].iloc[i] = location if location else "NaN"     
#     else:
#         # location is not empty, so first make sure there's no a location within the tweet itself
#         location_tweet, location_legit = get_location(train['text'].iloc[i]), get_location(train['location'].iloc[i])
#         # scenarios:
#         # - location found within the tweet, so overwrite the value under location with it
#         # - location is not found within the tweet, so make sure the given location is a legit one and not some garbage text
#         # - location is not legit, replace it with NaN
#         train['location'].iloc[i] = location_tweet if location_tweet else location_legit if location_legit else "NaN"

In [None]:
# Pre-trained text-embedding layer loaded from TF-Hub; it is fed raw strings
hub_layer = hub.KerasLayer(
    "https://tfhub.dev/google/nnlm-en-dim50/2",
    input_shape=[],
    dtype=tf.string,
)

# Embedding layer -> 16-unit ReLU layer -> single sigmoid output unit
model = keras.Sequential([
    hub_layer,
    keras.layers.Dense(16, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid'),
])

model.summary()

In [None]:
# split the labelled data into a training set and a held-out test set (20 %)
X = train.drop('target', axis=1)
y = train['target']

# a fixed random_state makes the split, and every score computed from it,
# reproducible from one run to the next
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# NOTE(review): `lr` is not defined in any cell visible here (the model built
# above is the Keras `model`) — confirm where it comes from; unless another
# cell defines it, this cell raises NameError on a fresh kernel.
# Fit on the training split, then print the score obtained on the held-out split.
lr.fit(X_train, y_train)
print("model score: %.3f" % lr.score(X_test, y_test))

In [None]:
# Predict on the test set without its `id` column; `test` itself still holds the ids
test_no_id = test.drop(columns=['id'])
test_predictions = lr.predict(test_no_id)

In [None]:
# Submission table: each test tweet id next to its predicted target
tweet_id = test['id']
submission_df_1 = pd.DataFrame({"id": tweet_id, "target": test_predictions})

In [None]:
# Write the submission table to a CSV file, leaving out the DataFrame index
submission_df_1.to_csv('submission_1.csv', index=False)