In [236]:
import collections
import numpy as np
import pandas as pd
import re

from argparse import Namespace

In [237]:
args = Namespace(
    train_dataset_csv="train.csv",
    test_dataset_csv="test.csv",
    train_proportion=0.7,
    val_proportion=0.3,
    train_split_csv="train_with_splits.csv",
    seed=523
)

In [238]:
train=pd.read_csv(args.train_dataset_csv)
train

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [239]:
train.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [240]:
train[train.keyword.notnull()]

Unnamed: 0,id,keyword,location,text,target
31,48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0
33,50,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1
34,52,ablaze,"Philadelphia, PA",Crying out for more! Set me ablaze,0
35,53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0
...,...,...,...,...,...
7578,10830,wrecked,,@jt_ruff23 @cameronhacker and I wrecked you both,0
7579,10831,wrecked,"Vancouver, Canada",Three days off from work and they've pretty mu...,0
7580,10832,wrecked,London,#FX #forex #trading Cramer: Iger's 3 words tha...,0
7581,10833,wrecked,Lincoln,@engineshed Great atmosphere at the British Li...,0


In [241]:
train.keyword.nunique()

221

In [242]:
train.target.sum()/len(train)

0.4296597924602653

In [243]:
train.location.nunique()/train.location.notnull().sum()

0.6576771653543307

In [244]:
# Splitting train by target
# Create dict
by_target = collections.defaultdict(list)
for _, row in train.iterrows():
    by_target[row.target].append(row.to_dict())

In [245]:
by_target

defaultdict(list,
            {1: [{'id': 1,
               'keyword': nan,
               'location': nan,
               'text': 'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all',
               'target': 1},
              {'id': 4,
               'keyword': nan,
               'location': nan,
               'text': 'Forest fire near La Ronge Sask. Canada',
               'target': 1},
              {'id': 5,
               'keyword': nan,
               'location': nan,
               'text': "All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected",
               'target': 1},
              {'id': 6,
               'keyword': nan,
               'location': nan,
               'text': '13,000 people receive #wildfires evacuation orders in California ',
               'target': 1},
              {'id': 7,
               'keyword': nan,
               'location': nan,
               

In [246]:
# Create split data
final_list = []
np.random.seed(args.seed)

for _, item_list in sorted(by_target.items()):

    np.random.shuffle(item_list)
    
    n_total = len(item_list)
    n_train = int(args.train_proportion * n_total)
    n_val = int(args.val_proportion * n_total)
    
    # Give data point a split attribute
    for item in item_list[:n_train]:
        item['split'] = 'train'
    
    #for item in item_list[n_train:n_train+n_val]:
    for item in item_list[n_train:]:
        item['split'] = 'val'

    # Add to final list
    final_list.extend(item_list)

In [247]:
train_split_df=pd.DataFrame(final_list)
train_split_df

Unnamed: 0,id,keyword,location,text,target,split
0,9815,trauma,LOCAL ATLANTA NEWS 4/28/00 - 4/28/15 FREELANCER,@RaabChar_28 @DrPhil @MorganLawGrp How do you ...,0,train
1,4527,emergency,"Anchorage, AK",#Anchorage #Jobs Emergency Medicine - Nurse Pr...,0,train
2,6540,injury,,JOBOOZOSO: USAT usatoday_nfl Michael Floyd's h...,0,train
3,1456,body%20bagging,,WWE 2k15 MyCareer EP18 Tyrone body bagging dud...,0,train
4,10566,windstorm,she/her/your majesty/empress,I like the weird ones like Rain of Mystical or...,0,train
...,...,...,...,...,...,...
7608,9206,suicide%20bombing,,Suicide bombing is just the fear of dying alone,1,val
7609,2540,collision,"North Highlands, CA",Traffic Collision - Ambulance Enroute: Elkhorn...,1,val
7610,4747,evacuate,Nashville,@ahhtheenikki And from what I can tell- they r...,1,val
7611,3505,derailment,India,Madhya Pradesh Train Derailment: Village Youth...,1,val


In [248]:
def preprocess_text(text):
    if type(text) == float:
        print(text)
    text = text.lower()
    text = re.sub(r"([!,.?:;()-,+\~^|/*'=<>])", r" \1 ", text)
    text = re.sub(r"([$%&#])", r"", text)
    #text = re.sub(r"[^a-zA-Z0-9.,! ?;:-]+", r"&", text) #Replace all special characters with '&'
    text = re.sub(r"[0-9]+", r"9", text) #Replace all numbers with 9
    return text

In [249]:
text='IUIU, #klkllk. lk; jkjkk! lkklkl? jkjkjk:kjkj mm n@# -po_op 989 ghgh& ghhgh%/8*'

In [250]:
preprocess_text(text)

'iuiu ,  klkllk .  lk ;  jkjkk !  lkklkl ?  jkjkjk : kjkj mm n@ -po_op 9 ghgh ghhgh / 9 * '

In [251]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [252]:
train_split_df['text']=train_split_df.text.apply(preprocess_text)

In [253]:
train_split_df

Unnamed: 0,id,keyword,location,text,target,split
0,9815,trauma,LOCAL ATLANTA NEWS 4/28/00 - 4/28/15 FREELANCER,@raabchar_9 @drphil @morganlawgrp how do you s...,0,train
1,4527,emergency,"Anchorage, AK",anchorage jobs emergency medicine - nurse prac...,0,train
2,6540,injury,,joboozoso : usat usatoday_nfl michael floyd '...,0,train
3,1456,body%20bagging,,wwe 9k9 mycareer ep9 tyrone body bagging dudes...,0,train
4,10566,windstorm,she/her/your majesty/empress,i like the weird ones like rain of mystical or...,0,train
...,...,...,...,...,...,...
7608,9206,suicide%20bombing,,suicide bombing is just the fear of dying alone,1,val
7609,2540,collision,"North Highlands, CA",traffic collision - ambulance enroute : elkho...,1,val
7610,4747,evacuate,Nashville,@ahhtheenikki and from what i can tell- they r...,1,val
7611,3505,derailment,India,madhya pradesh train derailment : village you...,1,val


In [254]:
duble=train_split_df[train_split_df[['text']].duplicated(keep=False)].sort_values(by='text')
duble

Unnamed: 0,id,keyword,location,text,target,split
5500,9225,suicide%20bombing,,' suicide bombing at [location named] . . ....,1,train
7075,9207,suicide%20bombing,,' suicide bombing at [location named] . . ....,1,val
4631,10080,typhoon,REPUBLICA DOMINICANA,( losdelsonido ) obama declares disaster for...,1,train
5582,3905,devastated,REPUBLICA DOMINICANA,( losdelsonido ) obama declares disaster for...,1,train
3934,4076,displaced,Pedophile hunting ground,. potus strategicpatience is a strategy for g...,0,val
...,...,...,...,...,...,...
6908,8018,refugees,,wowo-- = = = 9 nigerian refugees repatriate...,1,val
300,8044,refugees,,wowo-- = = = 9 nigerian refugees repatriate...,0,train
7225,10754,wreckage,Punjab,wreckage ' conclusively confirmed ' as from ...,1,val
6427,10776,wreckage,India,wreckage ' conclusively confirmed ' as from ...,1,train


In [255]:
train_split_df.text.str.split(' ', expand=False)

0       [@raabchar_9, @drphil, @morganlawgrp, how, do,...
1       [anchorage, jobs, emergency, medicine, -, nurs...
2       [joboozoso, :, , usat, usatoday_nfl, michael, ...
3       [wwe, 9k9, mycareer, ep9, tyrone, body, baggin...
4       [i, like, the, weird, ones, like, rain, of, my...
                              ...                        
7608    [suicide, bombing, is, just, the, fear, of, dy...
7609    [traffic, collision, -, ambulance, enroute, :,...
7610    [@ahhtheenikki, and, from, what, i, can, tell-...
7611    [madhya, pradesh, train, derailment, :, , vill...
7612    [ah-mazing, story, of, the, power, animal, res...
Name: text, Length: 7613, dtype: object

In [256]:
train_split_df['keyword']=train_split_df['keyword'].fillna('_NaN_')

In [257]:
keyword_=pd.DataFrame(train_split_df.keyword.str.split('%20', expand=False))
keyword_

Unnamed: 0,keyword
0,[trauma]
1,[emergency]
2,[injury]
3,"[body, bagging]"
4,[windstorm]
...,...
7608,"[suicide, bombing]"
7609,[collision]
7610,[evacuate]
7611,[derailment]


In [258]:
keyword_.isnull().sum()

keyword    0
dtype: int64

In [259]:
keyword_['len_']=keyword_.keyword.apply(lambda x: len(x))

In [260]:
keyword_['len_'].max()

3

In [261]:
keyword_[keyword_['len_']==2]

Unnamed: 0,keyword,len_
3,"[body, bagging]",2
14,"[bridge, collapse]",2
22,"[body, bag]",2
23,"[blew, up]",2
25,"[mass, murderer]",2
...,...,...
7591,"[loud, bang]",2
7594,"[war, zone]",2
7598,"[nuclear, reactor]",2
7601,"[heat, wave]",2


In [262]:
def context (text: list, keyword: list, context_len: int):
    indices_all=[]
    for word in keyword:
        indices=[i for i, x in enumerate(text) if ((word in x) or (x in word))]
        indices_all.extend(indices)
        
    if len(indices_all)>0:
        min_i=min(indices_all)-context_len
        max_i=max(indices_all)+context_len+1
        if min_i<0:
            min_i=0
                    
        return text[min_i:max_i]
    
    else:
        
        return text
               
    
    

In [263]:
text=['ui', 'hu','trep', 'per', 'trepi', 'ytr', 'try', 'per', 'ytro', 'io', 'po', 'op', 'po']
keyword=['uiu']
context(text, keyword, 2)

['ui', 'hu', 'trep']

In [264]:
context_df=pd.DataFrame()

In [265]:
context_df['keyword']=train_split_df.keyword.str.split('%20', expand=False)
context_df['text']=train_split_df.text.str.split(' ', expand=False)
context_df

Unnamed: 0,keyword,text
0,[trauma],"[@raabchar_9, @drphil, @morganlawgrp, how, do,..."
1,[emergency],"[anchorage, jobs, emergency, medicine, -, nurs..."
2,[injury],"[joboozoso, :, , usat, usatoday_nfl, michael, ..."
3,"[body, bagging]","[wwe, 9k9, mycareer, ep9, tyrone, body, baggin..."
4,[windstorm],"[i, like, the, weird, ones, like, rain, of, my..."
...,...,...
7608,"[suicide, bombing]","[suicide, bombing, is, just, the, fear, of, dy..."
7609,[collision],"[traffic, collision, -, ambulance, enroute, :,..."
7610,[evacuate],"[@ahhtheenikki, and, from, what, i, can, tell-..."
7611,[derailment],"[madhya, pradesh, train, derailment, :, , vill..."


In [266]:
context_df['context']=context_df.apply(lambda x: context(x.text, x.keyword, 3), axis=1)
context_df['len']=context_df.context.apply(lambda x: len(x))

In [267]:
context_df

Unnamed: 0,keyword,text,context,len
0,[trauma],"[@raabchar_9, @drphil, @morganlawgrp, how, do,...","[do, you, self-inflict, a, wound, to, your, si...",21
1,[emergency],"[anchorage, jobs, emergency, medicine, -, nurs...","[anchorage, jobs, emergency, medicine, -, nurs...",34
2,[injury],"[joboozoso, :, , usat, usatoday_nfl, michael, ...","[joboozoso, :, , usat, usatoday_nfl, michael, ...",26
3,"[body, bagging]","[wwe, 9k9, mycareer, ep9, tyrone, body, baggin...","[mycareer, ep9, tyrone, body, bagging, dudes, ...",16
4,[windstorm],"[i, like, the, weird, ones, like, rain, of, my...","[i, like, the, weird, ones, like, rain, of, my...",24
...,...,...,...,...
7608,"[suicide, bombing]","[suicide, bombing, is, just, the, fear, of, dy...","[suicide, bombing, is, just, the]",5
7609,[collision],"[traffic, collision, -, ambulance, enroute, :,...","[traffic, collision, -, ambulance, enroute, :,...",24
7610,[evacuate],"[@ahhtheenikki, and, from, what, i, can, tell-...","[were, able, to, evacuate, all, the, ppl, so, ...",14
7611,[derailment],"[madhya, pradesh, train, derailment, :, , vill...","[madhya, pradesh, train, derailment, :, , vill...",9


In [268]:
context_df['len_text']=context_df.text.apply(lambda x: len(x))

In [269]:
context_df

Unnamed: 0,keyword,text,context,len,len_text
0,[trauma],"[@raabchar_9, @drphil, @morganlawgrp, how, do,...","[do, you, self-inflict, a, wound, to, your, si...",21,25
1,[emergency],"[anchorage, jobs, emergency, medicine, -, nurs...","[anchorage, jobs, emergency, medicine, -, nurs...",34,37
2,[injury],"[joboozoso, :, , usat, usatoday_nfl, michael, ...","[joboozoso, :, , usat, usatoday_nfl, michael, ...",26,29
3,"[body, bagging]","[wwe, 9k9, mycareer, ep9, tyrone, body, baggin...","[mycareer, ep9, tyrone, body, bagging, dudes, ...",16,23
4,[windstorm],"[i, like, the, weird, ones, like, rain, of, my...","[i, like, the, weird, ones, like, rain, of, my...",24,25
...,...,...,...,...,...
7608,"[suicide, bombing]","[suicide, bombing, is, just, the, fear, of, dy...","[suicide, bombing, is, just, the]",5,9
7609,[collision],"[traffic, collision, -, ambulance, enroute, :,...","[traffic, collision, -, ambulance, enroute, :,...",24,24
7610,[evacuate],"[@ahhtheenikki, and, from, what, i, can, tell-...","[were, able, to, evacuate, all, the, ppl, so, ...",14,30
7611,[derailment],"[madhya, pradesh, train, derailment, :, , vill...","[madhya, pradesh, train, derailment, :, , vill...",9,11


In [270]:
count_dog = collections.defaultdict(int)
for _, row in context_df.iterrows():
    text=row.text
    for word in text:
        if '@' in word:
            count_dog[word]+=1
{k: v for k, v in sorted(count_dog.items(), key=lambda x: x[1], reverse=True)}

{'@youtube': 83,
 '@': 37,
 '@arianagrande': 11,
 '@potus': 9,
 '@foxnews': 9,
 '@usatoday': 9,
 '@change': 9,
 '@emmerdale': 8,
 '@justinbieber': 7,
 '@djicemoon': 7,
 '@stretcher': 6,
 '@towel': 6,
 '@mikeparractor': 6,
 '@invalid': 5,
 '@viralspell': 5,
 '@youngheroesid': 5,
 '@usagov': 5,
 '@ap': 5,
 '@raynbowaffair': 4,
 '@diamondkesawn': 4,
 '@gop': 4,
 '@grazed': 4,
 '@rexyy': 4,
 '@lonewolffur': 4,
 '@local_arsonist': 4,
 '@realdonaldtrump': 4,
 '@michael9sos': 4,
 '@reuters': 4,
 '@kurtschlichter': 4,
 '@unsuckdcmetro': 4,
 '@business': 3,
 '@worldnetdaily': 3,
 '@dannyonpc': 3,
 '@jimmyfallon': 3,
 '@wired': 3,
 '@zak_bagans': 3,
 '@tinyjecht': 3,
 '@un': 3,
 '@refugees': 3,
 '@apollobrown': 3,
 '@nickcannon': 3,
 '@realmandyrain': 3,
 '@itunesmusic': 3,
 '@itunes': 3,
 '@guardian': 3,
 '@trubgme': 3,
 '@witter': 3,
 '@calum9sos': 3,
 '@weathernetwork': 3,
 '@claytonbryant': 3,
 '@accionempresa': 3,
 '@gerenciatodos': 3,
 '@davidvonderhaar': 3,
 '@barackobama': 3,
 '@spinning

In [271]:
''''count_dig = collections.defaultdict(int)
for _, row in context_df.iterrows():
    text=row.text
    for word in text:
        if '9' in word:
            count_dig[word]+=1
{k: v for k, v in sorted(count_dig.items(), key=lambda x: x[1], reverse=True)}'''

"'count_dig = collections.defaultdict(int)\nfor _, row in context_df.iterrows():\n    text=row.text\n    for word in text:\n        if '9' in word:\n            count_dig[word]+=1\n{k: v for k, v in sorted(count_dig.items(), key=lambda x: x[1], reverse=True)}"

In [272]:
context_df['dif_len']=context_df.len_text-context_df.len
context_df['dif_len'].mean()

4.142782083278602

In [273]:
train_split_not_dubl=train_split_df.drop_duplicates(subset=['text'],keep='first')
train_split_not_dubl

Unnamed: 0,id,keyword,location,text,target,split
0,9815,trauma,LOCAL ATLANTA NEWS 4/28/00 - 4/28/15 FREELANCER,@raabchar_9 @drphil @morganlawgrp how do you s...,0,train
1,4527,emergency,"Anchorage, AK",anchorage jobs emergency medicine - nurse prac...,0,train
2,6540,injury,,joboozoso : usat usatoday_nfl michael floyd '...,0,train
3,1456,body%20bagging,,wwe 9k9 mycareer ep9 tyrone body bagging dudes...,0,train
4,10566,windstorm,she/her/your majesty/empress,i like the weird ones like rain of mystical or...,0,train
...,...,...,...,...,...,...
7607,4742,evacuate,,sooo police dispatch said there was a person t...,1,val
7608,9206,suicide%20bombing,,suicide bombing is just the fear of dying alone,1,val
7609,2540,collision,"North Highlands, CA",traffic collision - ambulance enroute : elkho...,1,val
7610,4747,evacuate,Nashville,@ahhtheenikki and from what i can tell- they r...,1,val


In [274]:
train_split_not_dubl.to_csv(args.train_split_csv, index=False)

In [275]:
pd.read_csv(args.train_split_csv)

Unnamed: 0,id,keyword,location,text,target,split
0,9815,trauma,LOCAL ATLANTA NEWS 4/28/00 - 4/28/15 FREELANCER,@raabchar_9 @drphil @morganlawgrp how do you s...,0,train
1,4527,emergency,"Anchorage, AK",anchorage jobs emergency medicine - nurse prac...,0,train
2,6540,injury,,joboozoso : usat usatoday_nfl michael floyd '...,0,train
3,1456,body%20bagging,,wwe 9k9 mycareer ep9 tyrone body bagging dudes...,0,train
4,10566,windstorm,she/her/your majesty/empress,i like the weird ones like rain of mystical or...,0,train
...,...,...,...,...,...,...
7478,4742,evacuate,,sooo police dispatch said there was a person t...,1,val
7479,9206,suicide%20bombing,,suicide bombing is just the fear of dying alone,1,val
7480,2540,collision,"North Highlands, CA",traffic collision - ambulance enroute : elkho...,1,val
7481,4747,evacuate,Nashville,@ahhtheenikki and from what i can tell- they r...,1,val


In [276]:
s='tytyty,<'
'9', 't' not in s

('9', False)

In [277]:
c=0
for p in string.punctuation:
    if p in s:
        c+=1
c

2

In [278]:
train_split_not_dubl.target.sum()/len(train_split_not_dubl)

0.4256314312441534