In [1]:
import os 
import string
import re
from pathlib import Path
# numpy and pandas for data manipulation
import numpy as np
import pandas as pd 
pd.options.mode.chained_assignment = None # no warnings
# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns
# tqdm
from tqdm import tqdm
# nltk
import nltk
from nltk.corpus import stopwords
# sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity # to calculate the similarities between keyword candidates and text
# pycountry
import pycountry
# contractions
import contractions
# spacy - library for advanced natural languages processing
import spacy 
os.system('python -m spacy download en') # used for keywords filling
nlp = spacy.load("en_core_web_sm") 
os.system('python -m spacy download xx_ent_wiki_sm') # used for locations filling
nlp_wk = spacy.load("xx_ent_wiki_sm")
# sentenceTransformer
from sentence_transformers import SentenceTransformer # For sentence embeddings
# import spellchecker
from spellchecker import SpellChecker
# import tensorflow, tensforflow_hub, keras
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow import keras
os.environ['TFHUB_CACHE_DIR'] = f'{str(Path.home())}/workspace/tf_cache' # path for downloading pre traind model

In [2]:
# reading in the CSV files using the pandas read_csv function. we have also dropped the id column from the train set as we won’t need this for training the model.
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
train = train.drop('id', axis=1)

In [3]:
def find_similarity(text_a, 
                    text_b, 
                    model = "https://tfhub.dev/google/tf2-preview/nnlm-en-dim128/1",
                    no_of_keywords=5):
    """
    
    """
    
    # load pretraind model for similarity
    embed = hub.load(model)
    tweet_embedding = embed([text_a])
    results_embeddings = embed(text_b)

    # calculate the similarity between document and results embeddings
    distances = cosine_similarity(tweet_embedding, results_embeddings)

    # get the top similar keywords
    keywords = [text_b[index] for index in distances.argsort()[0][-no_of_keywords:]]
    
    # get the indices of minimum element in numpy array
    keyword = keywords[np.where(distances == np.amin(distances))[0].tolist()[0]]
    
    return keyword

In [4]:
def text_formatting(text,
                    correct_spelling=True, 
                    remove_emojis=True, 
                    remove_stop_words=True):
    """
        Description: Apply function to clean a given text.
        
        :param tweet:
        :param correct_spelling:
        :param remove_emojis:
        :param remove_stop_words:
        
        Example: 
        input: 'Barbados #Bridgetown JAMAICA ÛÒ Two cars set ablaze: SANTA CRUZ ÛÓ Head of the St Elizabeth Police Superintende...  http://t.co/wDUEaj8Q4J'
        output: 'barbados bridgetown jamaica  two cars set ablaze santa cruz  head elizabeth police superintend'
    """ 
    
    def correct_spellings(text, spell=SpellChecker()):   
        """
            Correct the missplled words of a given tweet
        """
    
        text = text.split()
        misspelled = spell.unknown(text)
        result = map(lambda word : spell.correction(word) if word in misspelled else word, text)

        return " ".join(result)
    
    text = text.lower().strip()
    
    # remove urls
    text = re.compile(r'https?://\S+|www\.\S+').sub(r'', text)
    
    # remove html tags
    text = re.compile(r'<.*?>').sub(r'', text)
   
    # using contractions.fix to expand the shotened words
    text = contractions.fix(text).lower().strip()
        
    # remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # check for spelling errors
    if correct_spelling: 
        text = correct_spellings(text)
    
    # remove emojis if any
    if remove_emojis:
        text = text.encode('ascii', 'ignore').decode('utf8').strip()
    
    # remove spaces
    text = re.sub(' +', ' ', text)

    # remove stop words (Examples of stop words in English are “a”, “the”, “is”, “are” and etc)
    if remove_stop_words:
        text = ' '.join([word for word in text.split(' ') if word not in nlp.Defaults.stop_words])
    print(text)
    return text

In [5]:
def extract_keyword(text,
                    keywords):
    """
        Description: Apply function to extract a keyword from a given text
        
        :param tweet:
        :param keywords: 
        :param model:
        :param no_of_keywords:
        
        Example: 
        input: ''
        output: ''
    """ 
    keyword = ""

    # check first if any word from the given text is already defined as a keyword somewhere else
    for k in text.split(' '):
        if k in keywords:
            keyword = k
    
    if not keyword:
        text_ = nlp(text)

        # custom list of parts-of-speech (pos) tags we are interested in
        pos_tag = ['VERB', 'NOUN", "ADJ", "PROPN']
        result = []

        # if the token pos tag matches one of the pos_tag, then add the text form of the token to result list
        for token in text_:
            if (token.pos_ in pos_tag):
                result.append(token.text)

        # find similarity between each possible keywords to the tweet itself by using a pre-traind model
        keyword = find_similarity(text, result)
    
    return keyword

In [6]:
def get_location(text):  
    """
        Description: Apply function to extract a location from a given text
        
        :param text:
        
        Example: 
        [1] input: 'tha kicks antiblight loan effort memphis'
            output: 'memphis'
        [2] input: 'mourning notices ny stabbing arson victims stir politics grief posters shira bank'
            output: 'New York'
        [3] input: 'mourning notices ny stabbing arson victims stir politics grief posters shira bank israel'
            output: 'Israel'
    """ 

    us_state_to_abbrev = {
        "Alabama": "AL",
        "Alaska": "AK",
        "Arizona": "AZ",
        "Arkansas": "AR",
        "California": "CA",
        "Colorado": "CO",
        "Connecticut": "CT",
        "Delaware": "DE",
        "Florida": "FL",
        "Georgia": "GA",
        "Hawaii": "HI",
        "Idaho": "ID",
        "Illinois": "IL",
        "Indiana": "IN",
        "Iowa": "IA",
        "Kansas": "KS",
        "Kentucky": "KY",
        "Louisiana": "LA",
        "Maine": "ME",
        "Maryland": "MD",
        "Massachusetts": "MA",
        "Michigan": "MI",
        "Minnesota": "MN",
        "Mississippi": "MS",
        "Missouri": "MO",
        "Montana": "MT",
        "Nebraska": "NE",
        "Nevada": "NV",
        "New Hampshire": "NH",
        "New Jersey": "NJ",
        "New Mexico": "NM",
        "New York": "NY",
        "North Carolina": "NC",
        "North Dakota": "ND",
        "Ohio": "OH",
        "Oklahoma": "OK",
        "Oregon": "OR",
        "Pennsylvania": "PA",
        "Rhode Island": "RI",
        "South Carolina": "SC",
        "South Dakota": "SD",
        "Tennessee": "TN",
        "Texas": "TX",
        "Utah": "UT",
        "Vermont": "VT",
        "Virginia": "VA",
        "Washington": "WA",
        "West Virginia": "WV",
        "Wisconsin": "WI",
        "Wyoming": "WY",
        "District of Columbia": "DC",
        "American Samoa": "AS",
        "Guam": "GU",
        "Northern Mariana Islands": "MP",
        "Puerto Rico": "PR",
        "United States Minor Outlying Islands": "UM",
        "U.S. Virgin Islands": "VI",
    }
    
    text_ = nlp_wk(text)
    
    loc = []
    
    for ent in text_.ents:
        if(ent.label_ == "LOC"):
            loc.append(ent.text)
            
    if not loc:
        abbrev_to_us_state = dict(map(reversed, us_state_to_abbrev.items()))
        for item in text.upper().split(' '):
            if item in abbrev_to_us_state: 
                loc.append(abbrev_to_us_state.get(item))
    
    if len(loc) >= 2:            
        # if more then 2 locations were found, then find similarity between each possible location to the text itself by using a pre-traind model
        location = find_similarity(text, loc)
    else:
        # else convert to string
        location = ''.join(loc)
        
    return location

In [None]:
import queue
from threading import Thread, Lock


def _thread_func(q, result):
    """
    Threaded function for queue processing
    """
    while not q.empty():
        # Fetch new work from the Queue
        work = q.get()
        try:      
            data = text_formatting(f"{work[1]}")
            # Store data back at correct index
            result[work[0]] = data
        except:
            result[work[0]] = {}
        # Signal to the queue that the task has been processed
        q.task_done()
    return True

   
all_raw_tweets = []

for i in range(len(train["text"])):
    all_raw_tweets.append(train['text'].iloc[i])

# Set up a queue to hold all the tweets
q = queue.Queue(maxsize=0)

# Use many threads (50 max, or one for each file)
num_threads = min(50, len(all_raw_tweets))

# Populating Queue with tasks
tweets = [{} for x in all_raw_tweets]

# Load up the queue with the raw text to get the format vesrion of each one
for i in range(len(all_raw_tweets)):
    # Need the index and the text in each queue item
    q.put((i,f"{all_raw_tweets[i]}"))

# Starting worker threads on queue processing
for i in range(num_threads):
    worker = Thread(target=_thread_func, args=(q, tweets))
    # Setting threads as "daemon" allows main program to exit eventually even if these dont finish correctly.
    worker.setDaemon(True)
    # Start worker thread
    worker.start()

# Wait until the queue has been processed
q.join()

print(raw_text)


got sent photo ruby alaska smoke wildfires pours school
emergency evacuation happening building street
13000 people receive wildfires evacuation orders california
deeds reason earthquake allah forgive
residents asked shelter place notified officers evacuation shelter place orders expected
damage school bus 80 multi car crash breaking
hill fire woodsafraid tornado coming area
flood ago myanmar arrived ago

people died heat wave far
ridiculousflood disaster heavy rain causes flash flooding streets manitou colorado springs areas

love skiing
car fast
haha south tampa getting flooded hah wait second live south tampa going going fuck flooding
forest fire near la range ask canada
london cool
man
love fruits
wayi eat shit
like pasta
ine office asia set ablaze
end
ny week
summer lovely
love girlfriend
rockyfire update california 20 closed directions lake county fire afire wildfires
wanted set chicago ablaze preaching hotel
gained 3 followers week know stats grow
try bring heavy metal
wonderful

heard church leaders kenya coming forward comment accident issue disciplinary measuresarrestpastorngangaalexalltimelow aww airplane accident going die cuties good job

omg horrible accident man died wings airplane
mickinyman theatlantic killed airplane accident night car wreck politics best
airplane accident
320 ir iceman aftershock djicemoon dubstep trapmusic dna ed dance ices
phone looks like car ship airplane accident terrible
shooting airplane accidentairplane crashes house colombia 12 people die accident

statistically risk getting killed cop dying airplane accident
feared killed pakistani air ambulance helicopter crash
drone airplane accident pilots worried use drones esp close vicinity airports
votejkt48id mbataweel rip unladen family members killed airplanes accident
leading emergency services boss welcomes new ambulance charity
feared killed pakistani air ambulance helicopter crash
new nanotech device able target destroy blood clotsomg believe rip bro airplane accident jetengi

short reading apocalypse 211023 spirit angel took enormous high mountainzonewolf123 liked youtube video minecraft night lucky block mod bob apocalypse wither 20 amp

ourmothermary short reading apocalypse 211023 spirit angel took enormous high mountain
minecraft night lucky block mod bob apocalypse wither 20 amp mod showcase popularmmos youtube
alexandrapullin apocalypse comes week know
red faction armageddon microsoft xbox 360 2011 read ebay
enjoyed liveaction attack titan time posters reminded freshly clean coiffed apocalypse
best movie seen armageddon
fittscott minecraft night lucky block mod bob apocalypse wither 20 amp mod showcase popularmmos view
oviedo peace love amp armageddon
long coat hand worn certainty armageddon bears sense occasion
bed time wake revolution armageddon start
going beat armageddon su hao got flawless try
hey az sign petition save wildhorses tantonationalforest rollingstone signaling order
phone spying hidden door nsa data mining software financial armageddo

alleged east bay serial arsonist arrested sanfrancisco
trusting iran stop terrorism like inviting arsonist join fire brigade telegraph
arsonists blamed blaze plastics recycling business adelaide pcaldicott7 reports 7newsadl
big burning true story arsonist missing girl
doofus diamorfiend join moves
delhi government provide free treatment acid attack victims private hospitals
israeli forces raid home alleged car attack suspect palestineliked youtube video slimebeast town salem win arsonistlocalarsonist diamorfiend legal system forgets


demi stans think heart attack sold 56 million copiessuspect latest theater attack psychological issues

spotlight paradise arsonist wniagospel arsonistmusiclocalarsonist guess shit thinking

breaking terror attack police post dhampirbreaking obama officials gave muslim terrorist weapon texas attack

dattomm funniest twitter feminists try attack head
localarsonist lao93 blasts accused yea yakub dies karachi heart attack mumbai

owner chicagoarea gay bar ad

detroit's interested win battle
fedex longer transport bioterror germs wake anthrax lab mishaps news phone apple mobile
happens battle block cbsbigbrother finally
fedex willing transport research specimens potential bioterror pathogens wake anthrax lab mishaps
ahl responds fedex longer transport bioterror germs wake anthrax lab mishaps
fedex stop transporting bioterror germs lab mishaps fedex stopped transporting certain research
fedex longer transport bioterror germs wake anthrax lab mishaps usatoday
world fedex longer transport bioterror germs wake anthrax lab mishaps
fedex longer transport bioterror germs usatoday
world fedex longer transport bioterror germs wake anthrax lab mishaps
kelly tomlinson mildmannered baseman great metropolitan team fights neverending battle hits fbi giants way
miss gary buses son plays dixie electronic green fiddle postbattle celebration sequence
sexydragonmagic come realization attention span mass battle games painting playing
house energy amparo commerc

baseballquotes1 32 inch dynasty
omgbethersss bethanymota haha love
slit throat apologize bleeding
cspanwj 90blksamp8whts colluded 2 usage auto hostageamp2 look bulk wbioterrorismampuse glory iris id
asukager magical bag blazing
nose bleeding like 10 years ago
apparently bleeding people look weird lol fine walking
eating takes rubbing eyes hands eyes bleeding tearsbleeding typewriter day far written bunch gunk

follow edwelchmusic check hit single unpacked man blazing
turn radios stoponesounds live airwaves amp 1079 stickynyc roots blazing hits
stab promise bleeding
officialtjonez lost words new fan fam crazy skills blessed blazing dude love respect
thinking stepped broken glass pun tak sedan feel pain bleeding shit
blazing elwood blazingelwoods bother drugs song tune
stationcdrkelly support sys 4 usage auto taken hostage bulk clergyforced 2 exist youngerampgrossly disfigured bioterrorism
hit foot toe bleeding
joe landline gel stop bleeding instantly arizona realestate
aphiabeta1907 ugl

blizzarddraco lonewolffur like link
man somebody got stop dude fuckin funny bloodwinds gypsy blood time

kind shit nasty blood pun intended
hoe blood
sexual revolutionblight women stories acebabes healthweekly1 amateurnesterinnocent blood sons daughters land polluted psalms 10638 help stop sin abortion

infected bloody ear piercings fun
se pone cantar crying lightning
scotto519 happy birthday young blood
anthony runs blood
rules welcome read free chapter new book encounters jesus hope
day excellent dangerousbeans porridge seriously people blood orange porridge phenomenal
big stab deep stab like blood overthe
private thirsty nights blood rock roll
add items everyday eating habits research blood
nightmare elm street getting remade
aggressive bloody aggressive
listening bloody jay
got try let bloody things suh
loboparanoico mad men
bloody hell day tired thought vaca help
bloody insomnia error insomnia
fantosex suck bloody getting means amends
know effects low amp fast son product acne cre

editaxohaze let bagging bodys begin lol cuffed bad
mad going tell body bagging
rhee1975 deliciousvomit saying lucky home families body bag
hobo hobo vintage shira convertible 23800 bestseller
child shoulder bags pic shoulder book bag cartoon cross body bags girls
bestseller fossil dawson mini cross body bag es 9800
womens buckle casual stylish shoulder handbags pockets cross body bags green
body bags 08072015 0730 fremont music hall charlotte concert
mophead instar johnson problem game body bagging niggas vuzuhustle
bags trunk body
womens buckle casual stylish shoulder handbags pockets cross body bags white
breaking fairfax county firefighter placed admin leave amid probe facebook post putting police body bags dept says
attention rich football players coffins body bags locker rooms grab tommorow going die
womens flower printed shoulder handbags cross body metal chain satchel bags blue
aubrey bodybagging meek
bomairinge elutranscendent straight body bagging
slikrickdarula drake body bag

japan marks anniversary hiroshima atomic bombing
christopherszen hunterlove1995 black yeah man movie bombed hard
like survive atomic bombing hiroshima
oh fuck bill clinton bombing fuck nato
country claiming moral high ground dropped atomic bombs hiroshima banthebomb
today day hiroshima got atomic bomb 70 years ago sanitised narrative hiroshima atomic bombing
japan marks anniversary hiroshima atomic bombing ap
nbcnews yea bombing pearlharbor good idea
snapharmony bells toll hiroshima japan marks 70 years atomic bombing
australian ashes disaster collapse unfolded trent bridge cricket
fuck going trent bridge reminds england collapse caribbeanjapan marks anniversary hiroshima atomic bombing bells tolled hiroshima thursday japan marked 70

2 injured 1 missing bridge collapse central mexico
marshall plan united states bambina mojo prison oped post
moscowghost sayedridha amitabh congratulations capturing besieged city 3 months indiscriminate bombing land amp air
giant cranes holding bridge co

look sports riots google couches burned got
look silver lining barn having burned ground moon minute
wildlionx3 burnednews reports missing buildings fire people shot etc videos ppsellsbabyparts

hgf52611 uh huh burned know fire hurts robbie ross throw home run derby star game reasonremembered mcdonalds burned coolest play ground amp new got shit video games

fire burning pendleton burned 300 acres smoke reported drifting temeculathreealarm fire destroys residential buildings car manchester sunday afternoon

burned twice flame
burned dog finds new home young burn victim
sure away fire fighting king stevie amp crusty photo
hurts eat burned tounge pepperoni yesterday
spent 15 minutes lifting weights 43 calories burned loveit
fire burns multiple buildings montgomery co tips childcare expensive amp rain join jennasjems patrickwsls
spent 17 minutes walking innkeeper 90 calories burned loveit
sister burned boyfriend clothes recorded amp sent
holy fuck uvc bitch got burned hard
burned today ov

dacherryontop13 oh bush fires spain like year time went swimming planes getting water fight
movie theater attackclose home time thankful casualties life allow evil win
catfighting robots reduce civilian casualties calling ban premature ieve spectrum
kshllcenterpri1 progress4ohio burning buildings mean like burnt black churches
johnfromcranber pleas global warming work california australia having catastrophic bush fires
afghanistan youn reports recordhigh levels civilian casualties news afghanistan united
catfighting robots reduce civilian casualties callingpublic health team traumatised bush fires appreciativeinquiry turn

civilian casualties afghanistan reach record high
libertarianluke honest people want rampage let use hands feet casualties
day passed thankfully central command confirmed new casualties pray
news wrap un warns female child casualties rise afghanistan pas iraq
revise death america scenario 500 american casualties iranian activity suspected
drought fuels bush fires jam

chemical brothers play armory tomorrow night emergency bay area ed announcement chem
illinois emergency units simulate chemical explosion nu havethis hazmat
alaskan wolves face catastrophe denali wolves population plummeted 48 savedenaliwolves tweetstorm
myvintagesoul british upper class manservant pampered wealthy brit causes catastrophic shift reversal
kessler syndrome catastrophic exponential proliferation space debris destruction satellites gravitymovie
emergency crews respond chemical spill downtown beaumont renews
google alert emergency units simulate chemical explosion nu
new job opening downtown emergency service center seattle chemical dependency counselor intern jobs
downtown emergency service center hiring chemical dependency counselor intern apply seattle jobs
senschumer want netanyahu leading united states catastrophic religious world war
know bad bees pressure short term profit obsessed chemical companies
bomb crash loot riot emergency pipe bomb nuclear chemical spill gas

stepped outside drink cigarette immediately locked eyes jogger worlds collide
stars moon collide want life
gorpuazikinak tongue collapsed cum puddle body covered
rokiieee game officially collapsed
decry wrong trusty actually considering spontaneously collapsed trusty
super awkward worlds collide
let collide untill fill space
remember driving singing collide
mgnafrica pin263789f4 correction tent collapse story correction tent collapse story wizkidayo
efs300 star wars star trek collide pluto moon charon pluto
sounds right building thunderstorm inside day air masses collide
pierce veil rubber bracelet wristband collide sky read ebay
organicallyrude 1rockstar62 wish mattingly amp bundy amp mcguire standing collapsed
marvel comics imagecomics darkhorsecomics idwpublishing enabling possible collapse industry
check new song collide live bowery electric
dont want touchdown want worlds collide
students collide frist register nlccollidethehighfessions friend came school blasted asked high said p

sit rant snapchat apparent fans 8000 followers hope train crash xoxo
man piking crash
motorcyclist bicyclist injured denver collision broadway kierannicholson
daewony0406 alright going crash exhausted
crash burn
let love crash burn
713pm mapleridge laughed eb closed 203rd dewey trunk red collision eto 800 900
panel crash course course
photoshop csi crash course course
night going come crash party invited feelin2 fav worlds collided thanks lennonparham jessicastclair found gilmoreguysshow podcast ihave44episodesofgg notjoke

fatal crash reported johns island
police respond crash find suspected heroin
bought meinlcymbals 18 medium crash hey meinlcymbals endorsement starting expensive
photoshop tools crash course complete photoshop tool guide course
kinetic typography crash course effects video course
dat liable fuck crash
allah world news cop pulls man car avoid
son sleep night finally laid bed crashed
crash test trailer paul cheer amp rob rebels comedy special recorded
bin laden family 

cyclone derives powers calm center person norman vincent peace
new tropical cyclone forming near guam formed called mojave
cyclone komen devastates families myanmar week need help today
daviskawalya know mauryn143 saying final goodbyes grandpa seen news rip open ideas ve curfew
excited cyclone football
bank manager asks tom interview cyclone tom loan given purchase bicycle
news need plants pacific cyclone seasons help
like cyclone imperialism spins globe militarism crushes peoples sucks blood like
thecomedyquote 50shadezofgrey thirst curfew p45perez
future vaatu cyclone pam encouraged traditional ways living
ayekoradio play brushwork agency crushed shaken radio relabel electronicmusic listen music
1970 mercury cyclone hood moulding nice core cobra jet 429cj
talent misdirection cyclone pass ignite pass vanishing drive phantom shot kurokobot
great condition aston cyclone softball bat fastpitch 9 2920 sk398
come asia radio stations bangladesh broadcasting programs address upcoming cyclone

carsonrex spaceangelseven check rockin preview claytonbryant danger zone coming soon artistsunitedway seemeth right unto man end thereof ways death

feel like death
silent0siris awesome norse landscapes loads atmosphere life boringdead snotgreen wastelands
kellkane thanks narrowly averted death fun right
year later ferguson sees change asks real
liked youtube video jeromekem hazard death sport
hate talking grandma mean love death talk damn ssshhheeesshh
ass judge orders texas recognize spouse siamese death certificate
ted cruz bashes obama comparison gop iranians shouting death america
going starve death
turnedonfetaboo hsjb215 check rockin preview claytonbryant danger zone coming soon artistsunited
atchisonsean dead
cyprus news analysis mullah mars death split talibans ranks mars demise certainly lead
new crime knowing rights punishable death
going gainesville death
banning sake presuppose reborn attained individual rightsgenerally court law forbids killing dead person
papiichampoo en

wrinkled face deluge decayed
deluge eulogies cecilthelion whatsapp getthis tormented soul matias xavier
uk deluge canadian themed tops aroundthe timing perfect believe mad
fiendnikki deluge awesome word idea like
joshsternberg feed deluge twice week fantastic
glimpses hyderabad deluged heavy rainfall
tomorrow internet day 2 months look forward deluge stuff avoiding downright lieifl playing deflategate perfectly deluge incremental stories bored world caring wanting away

search powerful content improve business frustrated deluge quantity
getting tons telemarketing calls cell phone deluged
businesses deluged invoices stand colour shape likely rise pay pile
showman save big easy deluge supernatural monstrosities read free
theburnageblue yes man having bad week far events deluge fans turned right
businesses deluged invoices stand colour shape likely ris tothe pay pile
historic applied deluge recently exposed childsexabuse truly historic scale coverup
photo forrestmankins colorado camping
s

In [None]:
# Preprocess data:
# Steps:
# [1] Format tweet: correct spelling, remove emojis, and remove stop_words
# [2] Format keyword: fill missing keywords for certain tweets following specific scenarios
# [3] Format location: fill missing locations for certain tweets following specific scenarios

# get all available keywords from the the data (unique values) for step [2]
train_keywords = train['keyword'].unique()

# for i in tqdm(range(len(train["text"]))):
for i in range(len(train["text"])):
    # Step 1 - tweet text formatting
    train['text'].iloc[i] = text_formatting(train['text'].iloc[i])
    # Step 2 - keyword formatting
    if pd.isnull(train['keyword'].iloc[i]):
        # keyword is empty, so search for a keyword within the tweet itself, if no keyword is found then fill with NaN
        try:
            train['keyword'].iloc[i] = extract_keyword(tweet=train['text'].iloc[i], keywords=train_keywords)
        except:
            train['keyword'].iloc[i] = "NaN"   
    # Step 3 - location formatting
    if pd.isnull(train['location'].iloc[i]):
        # location is empty, so search for a location within the tweet itself, if no location is found then fill with NaN 
        location = get_location(train['text'].iloc[i])
        if location:
            # location found within the tweet
            train['location'].iloc[i] = location
        else:
            # location is not found within the tweet
            train['location'].iloc[i] = "NaN"
    else:
        # location is not empty, so first make sure there's no a location within the tweet itself
        location = get_location(train['text'].iloc[i])
        if location:
            # location found within the tweet, so overwrite the value under location with it
            train['location'].iloc[i] = location
        else:
            # location is not found within the tweet, so make sure the given location is a legit one and not some garbage text
            location = get_location(train['location'].iloc[i])
            if not location:
                # location is not legit, replace it with NaN
                train['location'].iloc[i] = "NaN" 

In [None]:
hub_layer = hub.KerasLayer("https://tfhub.dev/google/nnlm-en-dim50/2",
                           input_shape=[], dtype=tf.string)

model = keras.Sequential()
model.add(hub_layer)
model.add(keras.layers.Dense(16, activation='relu'))
model.add(keras.layers.Dense(1, activation='sigmoid'))

model.summary()

In [None]:
# split the train data into a training and test set
X = train.drop('target', axis=1)
y = train['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
### LR
# The below code creates a logistic regression model that performs the defined transformations before fitting or predicting.
lr = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression(solver='lbfgs'))])
# model score: 0.731
    
### KNN
# from sklearn.neighbors import KNeighborsClassifier
# lr = Pipeline(steps=[('preprocessor', preprocessor),
#                       ('classifier', KNeighborsClassifier(3))])
# model score: 0.668

### DecisionTreeClassifier
# from sklearn.tree import DecisionTreeClassifier
# lr = Pipeline(steps=[('preprocessor', preprocessor),
#                       ('classifier', DecisionTreeClassifier(max_depth=5))])
# model score: 0.587

### RandomForestClassifier
# from sklearn.ensemble import RandomForestClassifier
# lr = Pipeline(steps=[('preprocessor', preprocessor),
#                       ('classifier', RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1))])
# model score: 0.563

### SVC
# from sklearn.svm import SVC
# lr = Pipeline(steps=[('preprocessor', preprocessor),
#                       ('classifier', SVC(kernel="linear", C=0.025))])
# model score: 0.582

### LinearDiscriminantAnalysis
# from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# lr = Pipeline(steps=[('preprocessor', preprocessor),
#                       ('classifier', LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto'))])

In [None]:
lr.fit(X_train, y_train)
print("model score: %.3f" % lr.score(X_test, y_test))

In [None]:
test_no_id = test.drop('id', axis=1)
test_predictions = lr.predict(test_no_id)

In [None]:
tweet_id = test['id']
submission_df_1 = pd.DataFrame({
                  "id": tweet_id, 
                  "target": test_predictions})

In [None]:
submission_df_1.to_csv('submission_1.csv', index=False)