# Pulling Training Data




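The training data comes from the CrisisLex collections (the CrisisLexT6 files for the tornado and hurricane, and the CrisisLexT26 files for the Colorado floods and wildfires):
http://crisislex.org/data-collections.html#CrisisLexT6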

In [1]:
import pandas as pd

### Oklahoma Tornado

In [2]:
# Download the on-topic/off-topic Oklahoma tornado tweets from the CrisisLex GitHub repo
ok_tornado_url = "https://raw.githubusercontent.com/sajao/CrisisLex/master/data/CrisisLexT6/2013_Oklahoma_Tornado/2013_Oklahoma_Tornado-ontopic_offtopic.csv"
ok_tornado_df = pd.read_csv(ok_tornado_url)

In [3]:
# Tag every row with its disaster type so the source stays identifiable once the frames are combined
ok_tornado_df = ok_tornado_df.assign(disaster='tornado')

In [4]:
# Preview the first five rows to check the parsed columns and the new 'disaster' tag
ok_tornado_df.head(n=5)

Unnamed: 0,tweet id,tweet,label,disaster
0,'336908711324962817',@HeatleyJheat44 its barley even raining where ...,on-topic,tornado
1,'337052158035890176',Sorry I can't do anything right.,off-topic,tornado
2,'339338021751103488',@mrwendell29: @BradSowderWX says we have the ...,on-topic,tornado
3,'336339509077762051',#honestyhour I like to wear half split running...,off-topic,tornado
4,'337734129972035584',I'm too stressed to have a good summer,off-topic,tornado


In [37]:
# Normalize the header names by stripping stray whitespace instead of
# overwriting them by position: a positional assignment silently mislabels
# columns if the source order ever changes and only works once the
# 'disaster' column has been added. Stripping by name is order-independent
# and safe to re-run.
# NOTE(review): assumes the raw headers differ from the target names only by
# surrounding whitespace; the assertion below fails loudly if that is wrong.
ok_tornado_df = ok_tornado_df.rename(columns=str.strip)
assert {'tweet id', 'tweet', 'label'} <= set(ok_tornado_df.columns), \
    f"unexpected tornado columns: {list(ok_tornado_df.columns)}"

In [38]:
# Write the tornado tweets to the datasets folder, leaving out the row index
ok_tornado_path = '../datasets/train_ok_tornado.csv'
ok_tornado_df.to_csv(ok_tornado_path, index=False)

### Hurricane Sandy

In [6]:
# Download the on-topic/off-topic Hurricane Sandy tweets from the CrisisLex GitHub repo
hurricane_sandy_url = "https://raw.githubusercontent.com/sajao/CrisisLex/master/data/CrisisLexT6/2012_Sandy_Hurricane/2012_Sandy_Hurricane-ontopic_offtopic.csv"
hurricane_sandy_df = pd.read_csv(hurricane_sandy_url)

In [7]:
# Tag every row with its disaster type so the source stays identifiable once the frames are combined
hurricane_sandy_df = hurricane_sandy_df.assign(disaster='hurricane')

In [8]:
# Preview the first five rows to check the parsed columns and the new 'disaster' tag
hurricane_sandy_df.head(n=5)

Unnamed: 0,tweet id,tweet,label,disaster
0,'262596552399396864',I've got enough candles to supply a Mexican fa...,off-topic,hurricane
1,'263044104500420609',Sandy be soooo mad that she be shattering our ...,on-topic,hurricane
2,'263309629973491712',@ibexgirl thankfully Hurricane Waugh played it...,off-topic,hurricane
3,'263422851133079552',@taos you never got that magnificent case of B...,off-topic,hurricane
4,'262404311223504896',"I'm at Mad River Bar &amp; Grille (New York, N...",off-topic,hurricane


In [39]:
# Normalize the header names by stripping stray whitespace instead of
# overwriting them by position: a positional assignment silently mislabels
# columns if the source order ever changes and only works once the
# 'disaster' column has been added. Stripping by name is order-independent
# and safe to re-run.
# NOTE(review): assumes the raw headers differ from the target names only by
# surrounding whitespace; the assertion below fails loudly if that is wrong.
hurricane_sandy_df = hurricane_sandy_df.rename(columns=str.strip)
assert {'tweet id', 'tweet', 'label'} <= set(hurricane_sandy_df.columns), \
    f"unexpected hurricane columns: {list(hurricane_sandy_df.columns)}"

In [9]:
# Write the hurricane tweets to the datasets folder, leaving out the row index
hurricane_sandy_path = '../datasets/train_hurricane_sandy.csv'
hurricane_sandy_df.to_csv(hurricane_sandy_path, index=False)

### Colorado Floods

In [10]:
# Download the labeled Colorado flood tweets (CrisisLexT26) from the CrisisLex GitHub repo
co_flood_url = "https://raw.githubusercontent.com/sajao/CrisisLex/master/data/CrisisLexT26/2013_Colorado_floods/2013_Colorado_floods-tweets_labeled.csv"
co_flood_df = pd.read_csv(co_flood_url)

In [11]:
# Inspect the first five rows of the raw flood annotations
co_flood_df.head(n=5)

Unnamed: 0,Tweet ID,Tweet Text,Information Source,Information Type,Informativeness
0,376843697943769088,#Longmont #CO The Tiny Tim Center is now #hiri...,Not labeled,Not labeled,Not related
1,378011169883037697,"RT @dlfluegge: Crazy Flooding in Boulder, Colo...",Media,Sympathy and support,Related - but not informative
2,378020179214491649,Here's the #boulderflood video that's circulat...,Outsiders,Other Useful Information,Related and informative
3,378026101588496385,RT @passantino: Video: Severe flooding hits ne...,Media,Other Useful Information,Related and informative
4,378029784204206080,"Crazy Flooding in Boulder, Colorado http://t.c...",Media,Other Useful Information,Related and informative


In [12]:
# Count how many tweets fall under each informativeness annotation
# (the column name carries a leading space in this file)
flood_informativeness = co_flood_df[' Informativeness']
flood_informativeness.value_counts()

Related and informative          768
Related - but not informative    157
Not related                       70
Not applicable                     5
Name:  Informativeness, dtype: int64

In [13]:
# Create a new column for 'label'.
# Each informativeness annotation is collapsed into the binary
# on-topic / off-topic scheme; only 'Related and informative' counts as on-topic.
# NOTE(review): 'Related - but not informative' is treated as off-topic — confirm this is intended.
flood_label_map = {
    'Related and informative': 'on-topic',
    'Related - but not informative': 'off-topic',
    'Not related': 'off-topic',
    'Not applicable': 'off-topic',
}
co_flood_df['label'] = co_flood_df[' Informativeness'].replace(flood_label_map)

In [14]:
# Remove the three annotation columns; only the id, the text and the derived 'label' are kept
flood_unused_columns = [' Information Source', ' Information Type', ' Informativeness']
co_flood_df = co_flood_df.drop(columns=flood_unused_columns)

In [15]:
# rename columns so that it's consistent with the other disaster files.
# Columns are matched by name (after stripping header whitespace) rather than
# overwritten by position: a positional assignment of three names raises a
# length-mismatch error when this cell is re-run after 'disaster' has been
# added, and would silently mislabel columns if their order ever changed.
co_flood_df = co_flood_df.rename(columns=str.strip).rename(
    columns={'Tweet ID': 'tweet id', 'Tweet Text': 'tweet'}
)
# Fail loudly if the source headers were not the expected ones
assert {'tweet id', 'tweet', 'label'} <= set(co_flood_df.columns), \
    f"unexpected flood columns: {list(co_flood_df.columns)}"

In [16]:
# Tag every row with its disaster type so the source stays identifiable once the frames are combined
co_flood_df = co_flood_df.assign(disaster='flood')

In [17]:
# Preview the first five rows to confirm the cleaned layout: tweet id, tweet, label, disaster
co_flood_df.head(n=5)

Unnamed: 0,tweet id,tweet,label,disaster
0,376843697943769088,#Longmont #CO The Tiny Tim Center is now #hiri...,off-topic,flood
1,378011169883037697,"RT @dlfluegge: Crazy Flooding in Boulder, Colo...",off-topic,flood
2,378020179214491649,Here's the #boulderflood video that's circulat...,on-topic,flood
3,378026101588496385,RT @passantino: Video: Severe flooding hits ne...,on-topic,flood
4,378029784204206080,"Crazy Flooding in Boulder, Colorado http://t.c...",on-topic,flood


In [18]:
# Write the flood tweets to the datasets folder, leaving out the row index
co_flood_path = '../datasets/train_co_flood.csv'
co_flood_df.to_csv(co_flood_path, index=False)

### Colorado Fires

In [19]:
# Download the labeled Colorado wildfire tweets (CrisisLexT26) from the CrisisLex GitHub repo
co_fire_url = "https://raw.githubusercontent.com/sajao/CrisisLex/master/data/CrisisLexT26/2012_Colorado_wildfires/2012_Colorado_wildfires-tweets_labeled.csv"
co_fire_df = pd.read_csv(co_fire_url)

In [20]:
# Inspect the first five rows of the raw wildfire annotations
co_fire_df.head(n=5)

Unnamed: 0,Tweet ID,Tweet Text,Information Source,Information Type,Informativeness
0,211040709124440064,#Intern #US #TATTOO #Wisconsin #Ohio #NC #PA #...,Not labeled,Not labeled,Not related
1,211111710294163457,RT @Jack4Ward: Get in on the fun every Thursda...,Not labeled,Not labeled,Not related
2,211157222699433985,Welcome to our newest STUDENTathlete- Reagan B...,Not labeled,Not labeled,Not related
3,211162553659830272,Denver Post: #Colorado governor signs bill cre...,Not labeled,Not labeled,Not related
4,211216962162933761,Pretty sure I'm going to live in Manitou Sprin...,Not labeled,Not labeled,Not related


In [21]:
# Count how many tweets fall under each informativeness annotation
# (the column name carries a leading space in this file)
fire_informativeness = co_fire_df[' Informativeness']
fire_informativeness.value_counts()

Related and informative          685
Related - but not informative    268
Not related                      238
Not applicable                     9
Name:  Informativeness, dtype: int64

In [22]:
# Create a new column for 'label'.
# Each informativeness annotation is collapsed into the binary
# on-topic / off-topic scheme; only 'Related and informative' counts as on-topic.
# NOTE(review): 'Related - but not informative' is treated as off-topic — confirm this is intended.
fire_label_map = {
    'Related and informative': 'on-topic',
    'Related - but not informative': 'off-topic',
    'Not related': 'off-topic',
    'Not applicable': 'off-topic',
}
co_fire_df['label'] = co_fire_df[' Informativeness'].replace(fire_label_map)

In [23]:
# Remove the three annotation columns; only the id, the text and the derived 'label' are kept
fire_unused_columns = [' Information Source', ' Information Type', ' Informativeness']
co_fire_df = co_fire_df.drop(columns=fire_unused_columns)

In [24]:
# rename columns so that it's consistent with the other disaster files.
# Columns are matched by name (after stripping header whitespace) rather than
# overwritten by position: a positional assignment of three names raises a
# length-mismatch error when this cell is re-run after 'disaster' has been
# added, and would silently mislabel columns if their order ever changed.
co_fire_df = co_fire_df.rename(columns=str.strip).rename(
    columns={'Tweet ID': 'tweet id', 'Tweet Text': 'tweet'}
)
# Fail loudly if the source headers were not the expected ones
assert {'tweet id', 'tweet', 'label'} <= set(co_fire_df.columns), \
    f"unexpected fire columns: {list(co_fire_df.columns)}"

In [25]:
# Tag every row with its disaster type so the source stays identifiable once the frames are combined
co_fire_df = co_fire_df.assign(disaster='fire')

In [26]:
# Preview the first five rows to confirm the cleaned layout: tweet id, tweet, label, disaster
co_fire_df.head(n=5)

Unnamed: 0,tweet id,tweet,label,disaster
0,211040709124440064,#Intern #US #TATTOO #Wisconsin #Ohio #NC #PA #...,off-topic,fire
1,211111710294163457,RT @Jack4Ward: Get in on the fun every Thursda...,off-topic,fire
2,211157222699433985,Welcome to our newest STUDENTathlete- Reagan B...,off-topic,fire
3,211162553659830272,Denver Post: #Colorado governor signs bill cre...,off-topic,fire
4,211216962162933761,Pretty sure I'm going to live in Manitou Sprin...,off-topic,fire


In [27]:
# Write the wildfire tweets to the datasets folder, leaving out the row index
co_fire_path = '../datasets/train_co_fire.csv'
co_fire_df.to_csv(co_fire_path, index=False)

### Combine the DataFrames

In [40]:
# Stack the four per-disaster frames into one training set with a fresh 0..n-1 index.
# NOTE(review): the previews above show tweet ids as quote-wrapped strings for the
# tornado/hurricane files but as bare integers for the flood/fire files, so the
# combined 'tweet id' column mixes formats — confirm nothing downstream relies on it.
disaster_frames = [ok_tornado_df, hurricane_sandy_df, co_fire_df, co_flood_df]
disasters_df = pd.concat(disaster_frames, ignore_index=True)

# (rows, columns) of the combined frame
disasters_df.shape

(22200, 4)

In [44]:
# Preview the first five rows of the combined training set
disasters_df.head(n=5)

Unnamed: 0,tweet id,tweet,label,disaster
0,'336908711324962817',@HeatleyJheat44 its barley even raining where ...,on-topic,tornado
1,'337052158035890176',Sorry I can't do anything right.,off-topic,tornado
2,'339338021751103488',@mrwendell29: @BradSowderWX says we have the ...,on-topic,tornado
3,'336339509077762051',#honestyhour I like to wear half split running...,off-topic,tornado
4,'337734129972035584',I'm too stressed to have a good summer,off-topic,tornado


In [43]:
# Write the combined training set to the datasets folder, leaving out the row index
disasters_path = '../datasets/train_disasters.csv'
disasters_df.to_csv(disasters_path, index=False)