This notebook relates to the preparation of the Categories dataframe

Importing the libraries needed and the dataframe

In [1]:
import numpy as np
import pandas as pd
import re
from nltk.tokenize import word_tokenize

# Load the raw categories CSV; the path is relative to the notebook's working directory.
categories_df = pd.read_csv('data/disaster_categories.csv')

# Preview the first rows to see how the category labels are encoded.
categories_df.head()

Unnamed: 0,id,categories
0,2,related-1;request-0;offer-0;aid_related-0;medi...
1,7,related-1;request-0;offer-0;aid_related-1;medi...
2,8,related-1;request-0;offer-0;aid_related-0;medi...
3,9,related-1;request-1;offer-0;aid_related-1;medi...
4,12,related-1;request-0;offer-0;aid_related-0;medi...


In [2]:
# Size of the raw frame as (rows, columns).
categories_df.shape

(26248, 2)

Besides the `id`, the dataframe is composed of a single `categories` column in which all the features are written as one string: each feature name is followed by 0 (meaning unrelated) or 1 (related).
We'll have to clean that text to isolate the feature names and turn them into columns, then read the values and assign them to the respective columns.

In [4]:
def only_numbers(cell):
    '''Extract the integer values from one encoded categories string.

    cell - string of ';'-separated "<name>-<value>" pairs,
           e.g. "related-1;request-0;offer-0"

    Returns a list of integers, one per pair, in the order they appear.
    Raises ValueError if a pair has no integer value.
    '''

    # Take the text after the LAST hyphen of each pair instead of stripping
    # letters: digits that belong to a category name (e.g. "level2-1") would
    # otherwise be glued onto the value and silently corrupt it.
    return [int(pair.rsplit('-', 1)[-1]) for pair in cell.split(';')]


In [5]:
def cleaning_categories(df,column):
    '''Expand an encoded categories column into one integer column per category.

    Every cell of df[column] is expected to be a string of ';'-separated
    "<name>-<value>" pairs (e.g. "related-1;request-0;offer-0"), with the same
    categories in the same order on every row. The names are read from the
    first row only; the values are read from every row.

    Also removes the categories that have a zero count, that is, the number
    which followed the name was always zero.

    column - column to clean (string)
    df - dataframe to clean

    Returns a new dataframe with a fresh 0..n-1 index and one flat
    (non-MultiIndex) column per remaining category. An empty input returns an
    empty dataframe.
    '''

    encoded = df[column]
    if encoded.empty:
        return pd.DataFrame()

    # Read the category names from the first row BY POSITION, so the function
    # also works when df does not carry a default 0..n-1 index. Splitting each
    # pair on its last hyphen does not depend on which values (0, 1, 2, ...)
    # the first row happens to contain.
    column_features = [
        pair.rsplit('-', 1)[0].strip() for pair in encoded.iloc[0].split(';')
    ]

    # Keep only the numeric value of every pair: one list of integers per row.
    fill_df = [
        [int(pair.rsplit('-', 1)[-1]) for pair in cell.split(';')]
        for cell in encoded
    ]

    # Pass the names as a flat list. Wrapping them in another list makes pandas
    # build a one-level MultiIndex, so new_df['related'] would no longer return
    # a plain Series.
    new_df = pd.DataFrame(fill_df, columns=column_features)

    # Removing the features with only zeros.
    new_df = new_df.loc[:, new_df.sum(axis=0) > 0]

    return new_df

# Expand the encoded 'categories' column into one column per category and preview the result.
new_df = cleaning_categories(categories_df,'categories')
new_df.head()

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,water,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Size of the expanded frame as (rows, columns).
new_df.shape

(26248, 35)

After some inspection it was detected that there are rows where all categories are zero, so they will be isolated into a new dataframe to inspect later what this means.

In [7]:
# Rows where no category is set, i.e. the row-wise total of all columns is zero.
zero_rows = new_df.loc[new_df.sum(axis=1).eq(0)]
zero_rows

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,water,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26232,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26236,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26241,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26243,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


After taking a look at the dataframe that was just created (see the summary statistics below), we can see that one feature, `related`, contains values greater than 1; specifically, all of those values are 2. Again, the affected rows will be isolated into a new dataframe for further inspection.

In [30]:
# option_context only changes the display setting inside the with-block, so the
# summary is printed here rather than left as the cell's last expression.
# The limit of 40 is above the 35 columns of new_df, so no column is elided.
with pd.option_context('display.max_columns', 40):
    print(new_df.describe(include='all'))

            related       request         offer   aid_related  medical_help  \
count  26248.000000  26248.000000  26248.000000  26248.000000  26248.000000   
mean       0.774002      0.170680      0.004534      0.414432      0.079511   
std        0.435472      0.376236      0.067181      0.492633      0.270540   
min        0.000000      0.000000      0.000000      0.000000      0.000000   
25%        1.000000      0.000000      0.000000      0.000000      0.000000   
50%        1.000000      0.000000      0.000000      0.000000      0.000000   
75%        1.000000      0.000000      0.000000      1.000000      0.000000   
max        2.000000      1.000000      1.000000      1.000000      1.000000   

      medical_products search_and_rescue      security      military  \
count     26248.000000      26248.000000  26248.000000  26248.000000   
mean          0.050061          0.027583      0.017944      0.032764   
std           0.218075          0.163778      0.132751      0.178023   


In [31]:
# Column 0 is `related`; keep only the rows where it holds the value 2.
rows_with_2 = new_df.loc[new_df.iloc[:, 0].eq(2)]
rows_with_2

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,water,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
117,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
219,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
305,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
460,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
576,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20351,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20522,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22355,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23411,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
# rows_with_2 was produced by boolean filtering of new_df, so take an explicit
# copy before assigning: this avoids pandas' SettingWithCopyWarning and makes it
# unambiguous that new_df itself is left untouched.
rows_with_2 = rows_with_2.copy()

# Zero out the first column (the one holding the 2s) so that column totals over
# these rows reveal whether any other feature is set.
rows_with_2.iloc[:, 0] = 0
rows_with_2

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,water,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
117,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
219,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
305,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
460,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
576,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20351,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20522,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22355,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23411,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
# Column totals over the isolated rows; the first column was set to 0 in the
# previous cell, so any non-zero total would come from another feature.
rows_with_2.sum(axis=0)

related                   0
request                   0
offer                     0
aid_related               0
medical_help              0
medical_products          0
search_and_rescue         0
security                  0
military                  0
water                     0
food                      0
shelter                   0
clothing                  0
money                     0
missing_people            0
refugees                  0
death                     0
other_aid                 0
infrastructure_related    0
transport                 0
buildings                 0
electricity               0
tools                     0
hospitals                 0
shops                     0
aid_centers               0
other_infrastructure      0
weather_related           0
floods                    0
storm                     0
fire                      0
earthquake                0
cold                      0
other_weather             0
direct_report             0
dtype: int64

After this exploration we can see that these rows have zeros for all the remaining features, so again further inspection is needed.

In [32]:
# Distinct values present in the first column (`related`).
new_df.iloc[:,0].unique()

array([1, 0, 2])