# Import Libraries & load data

In [1]:
import pandas as pd

In [2]:
# Load the raw message texts and their category labels from the two CSV files.
df_msg = pd.read_csv("disaster_messages.csv")
df_cat = pd.read_csv("disaster_categories.csv")

# Put the two tables side by side, aligned on the row index. The `id` column of
# the categories table is dropped first so it does not appear twice.
# NOTE(review): this assumes both files list the same ids in the same row order - confirm.
df = pd.concat([df_msg, df_cat.drop(columns="id")], axis="columns")

In [3]:
# Preview the first five rows of the combined frame.
df.head(n=5)

Unnamed: 0,id,message,original,genre,categories
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,related-1;request-0;offer-0;aid_related-0;medi...
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,related-1;request-0;offer-0;aid_related-1;medi...
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,related-1;request-0;offer-0;aid_related-0;medi...
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,related-1;request-1;offer-0;aid_related-1;medi...
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,related-1;request-0;offer-0;aid_related-0;medi...


# First look at the data

#### General info

In [4]:
# Column names, non-null counts and dtypes of the combined frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26248 entries, 0 to 26247
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          26248 non-null  int64 
 1   message     26248 non-null  object
 2   original    10184 non-null  object
 3   genre       26248 non-null  object
 4   categories  26248 non-null  object
dtypes: int64(1), object(4)
memory usage: 1.0+ MB


#### Missing values

In [5]:
# Count the missing values in each column.
df.isna().sum()

id                0
message           0
original      16064
genre             0
categories        0
dtype: int64

#### General content of the genres

In [6]:
# Number of non-null `categories` entries for each genre.
(
    df
    .groupby("genre")["categories"]
    .count()
)

genre
direct    10782
news      13068
social     2398
Name: categories, dtype: int64

In [7]:
# For every genre, print its name followed by the first rows of the message and
# raw category columns, then a line of '#' characters as a visual divider.
separator = "#" * 50
for genre in df["genre"].unique():
    genre_rows = df.loc[df["genre"] == genre, ["message", "categories"]]
    print(genre)
    print(genre_rows.head(), "\n", separator, "\n")

direct
                                             message  \
0  Weather update - a cold front from Cuba that c...   
1            Is the Hurricane over or is it not over   
2                    Looking for someone but no name   
3  UN reports Leogane 80-90 destroyed. Only Hospi...   
4  says: west side of Haiti, rest of the country ...   

                                          categories  
0  related-1;request-0;offer-0;aid_related-0;medi...  
1  related-1;request-0;offer-0;aid_related-1;medi...  
2  related-1;request-0;offer-0;aid_related-0;medi...  
3  related-1;request-1;offer-0;aid_related-1;medi...  
4  related-1;request-0;offer-0;aid_related-0;medi...   
 ################################################## 

social
                                                message  \
9902  My thoughts and prayers go out to all the live...   
9903  I m sorry for the poor people in Haiti tonight...   
9904  RT selenagomez UNICEF has just announced an em...   
9905  lilithia yes 5.2 magni

In [8]:
# Each `categories` cell is a ";"-separated list of "<name>-<value>" pairs
# (e.g. "related-1;request-0;..."); give every pair its own column.
df_categories = df["categories"].str.split(";", expand=True)

# The column names are the part before "-" in the first row's pairs.
col_names = df_categories.iloc[0].str.split("-").str[0].tolist()
df_categories.columns = col_names

# Keep only the part after "-" in every cell and convert it to an integer.
df_categories = df_categories.apply(
    lambda pairs: pairs.str.split("-").str[1]
).astype("int")


In [9]:
# Display the expanded label frame (pandas truncates the rendering of large frames).
df_categories

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26243,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26244,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26245,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26246,1,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Remove the raw `categories` string column; its content is held in
# `df_categories` as one integer column per label.
# `df` is rebound here, so a second execution of this cell would find the column
# already gone; errors="ignore" makes the cell safe to re-run instead of raising
# KeyError.
df = df.drop(columns="categories", errors="ignore")

In [11]:
# Append the integer label columns to the message columns, matching rows by index.
df_expanded = pd.concat([df, df_categories], axis="columns")

In [12]:
# (rows, columns) of the combined frame before de-duplication.
df_expanded.shape

(26248, 40)

In [13]:
# Remove rows that are exact copies of an earlier row across all columns;
# the first occurrence of each is kept.
df_expanded_clean = df_expanded.drop_duplicates(keep="first")

In [14]:
# (rows, columns) after removing fully duplicated rows.
df_expanded_clean.shape

(26216, 40)

In [16]:
# Distribution of the `related` label. In the captured output it also takes the
# value 2, not just 0 and 1.
df_expanded_clean["related"].value_counts()

related
1    19906
0     6122
2      188
Name: count, dtype: int64

In [21]:
# Keep only the rows whose `related` label is not 2.
# The original cell computed the filtered frame, displayed its shape and then
# discarded it, so the rows with related == 2 were never actually removed.
# Binding the result to a new name keeps `df_expanded_clean` untouched and makes
# this cell safe to re-run.
df_related_binary = df_expanded_clean[df_expanded_clean["related"] != 2].copy()
df_related_binary.shape

(26028, 40)

In [22]:
# Re-check the shape of `df_expanded_clean`; the captured output matches the
# earlier shape, i.e. the preceding cell did not modify this frame.
df_expanded_clean.shape

(26216, 40)