In [None]:
# Data wrangling
import pandas as pd     # Data processing, CSV file I/O (e.g. pd.read_csv)
import datetime as date # To manipulate Dates
import re               # To manipulate regular expressions
from datetime import datetime as dt

# Maths
import numpy as np      # linear algebra
from scipy import stats # Scientific and technical computing  | Statistics module

# Plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the "../input/" directory.
# Running this will list the files in the input directory

%xmode Plain
print("Set up completed")

# Any results you write to the current directory are saved as output.

# Exploring our data
We will look for relationships between the cause of death, the time, and the region of each incident.

In [None]:
# Load the Missing Migrants dataset. 'Web ID' becomes the index and the column
# at position 2 of the file is parsed as a date.
data_raw = pd.read_csv(
    '../input/missing-migrants-project/MissingMigrants-Global-2019-03-29T18-36-07.csv',
    index_col=['Web ID'],
    parse_dates=[2],
    infer_datetime_format=True,
).rename(columns={'Region of Incident': 'Region'})  # shorter column name

# Shorter name for the index as well.
data_raw.index.names = ['id']

data_raw.head()

In [None]:
# (number of rows, number of columns) of the loaded table
data_raw.shape

### How much missing data do we have?
From a quick exploration of the data we notice that some columns have many missing values, which can lead to incorrect conclusions.
<br><br>
Let's build a table with the number and proportion of missing values in each variable, so we can take it into account later in our analysis.

In [None]:
# Tabulate, for every column of data_raw, how many observations are missing
# and what percentage of all rows that represents.

# Temporary boolean frame: True where the original value is missing.
null_mask = data_raw.isnull()

missing_counts = null_mask.sum().sort_values(ascending=False)
# The mask itself contains no NaN, so .count() is the number of rows per column.
missing_share = (100 * null_mask.sum() / null_mask.count()).sort_values(ascending=False)

Missing_Data = pd.concat(
    [missing_counts, round(missing_share, 2)],
    axis=1,
    keys=['Observations Missing', 'Missing (%)'],
)

# The mask was only needed to build the table.
del null_mask

# Show only the columns that actually have missing values.
Missing_Data[Missing_Data['Observations Missing'] > 0]

## Regrouping causes of death into more general groups

In [None]:
# Lower-case every cause of death so that spellings differing only in
# capitalisation are counted as the same value.
CoD_serie = data_raw['Cause of Death'].str.lower()

# Frequency of each distinct cause as a two-column table with a fresh numeric
# index. The axis and the counts are named explicitly instead of renaming
# whatever labels reset_index() happens to produce: those default labels
# changed in pandas 2.0, which made the old rename yield wrong headers.
# Only every 10th row among the first 101 is displayed.
(
    CoD_serie.value_counts()
             .rename_axis('Cause of Death')
             .reset_index(name='Count')
             .iloc[:101:10]
)

The classifications of death in this dataset are too many to examine, and some of them are repeated or treated as different by Python because of small differences such as capital letters or typos. To solve this, let's group the causes of death into more general groups.

### Classifications

In order to reduce the different types of death to at most 10 groups, I had to merge them into more general groups.
<br> You can review these changes in the CSV file attached to the Kernel, or by taking a look at the variable `classifications`.
<br>
<br>Here is a glimpse of the 66 substitutions, sliced every 8 unique values.

In [None]:
# The substitution rules are stored in a CSV file with these columns:
#   [replacement]  the value to substitute into the row
#   [pattern]      the pattern to look for in each row
classifications = pd.read_csv('../input/missinmigrants-causesofdeatclassification/MissinMigrants-COD_Classifications.csv')

# Lookup table: pattern -> replacement (built row by row, in file order).
d = dict(zip(classifications['pattern'], classifications['replacement']))

# Let's take a look at every 8th row of 'classifications'.
classifications.iloc[::8]

In [None]:
def like_function(x, mapping=None):
    """Return the general group for a single cause-of-death value.

    The value is checked against every key of ``mapping`` in order; the
    replacement of the first key that occurs as a substring of ``x`` is
    returned. This is similar to mapping a dictionary with ``str.contains``.

    Parameters
    ----------
    x : str or any
        One cause-of-death value (an element of a Series, not the Series
        itself). Non-string values such as NaN/None are accepted.
    mapping : dict, optional
        ``{pattern: replacement}`` lookup table. Defaults to the notebook-level
        dictionary ``d`` so existing ``Series.apply(like_function)`` calls keep
        working unchanged.

    Returns
    -------
    The replacement of the first matching pattern, or ``"Unknown"`` when no
    pattern matches or ``x`` is not a string.
    """
    # Missing values reach this function as float NaN (or None); `key in x`
    # would raise TypeError on them, so classify them as "Unknown" up front.
    if not isinstance(x, str):
        return "Unknown"
    if mapping is None:
        mapping = d
    for key, replacement in mapping.items():
        if key in x:
            return replacement
    return "Unknown"

In [None]:
# Apply like_function to substitute each cause of death with its general group.
cod_reduced = CoD_serie.apply(like_function)

# Store the reduced classification in the original table.
data_raw['Cause of Death'] = cod_reduced

# Count of observations per classification. This previously referenced an
# undefined name (`temp`) and raised NameError on a fresh kernel.
cod_reduced.value_counts().to_frame()

We only need to substitute these changes into the original data_raw table.
<br> To test the result, let's aggregate the count of deaths per **Cause of death** with the new classification.

We significantly reduced the number of causes of death, making the data easier to process and understand.

In [None]:
# Distinct values currently stored in the 'Cause of Death' column
data_raw['Cause of Death'].unique()

# Analytics

**Seaborn** is useful for examining relationships between different variables and has very good visual tools; these are some of the reasons why I chose it as my main tool for this data set.

In [None]:
# Number of reported incidents per region; rows without a 'Number Dead' value
# are left out. sns.catplot is the current name of the old sns.factorplot
# (deprecated and later removed from seaborn) and is what the rest of this
# notebook already uses.
g = sns.catplot(data=data_raw.dropna(subset=['Number Dead']),
                x='Region',
                aspect=1.5,
                kind="count")
g.set_xticklabels(rotation=90)
plt.show()

The US-Mexico Border is where the **most incidents** take place. This counts only the number of individual reports, not the actual number of people affected in this region.
<br><br>
We can take a closer look at the data to know the **number of deaths** by region, and thus the **mean** deaths per incident.

In [None]:
# Create a summary of the quantitative statistics grouped by region.
# numeric_only=True restricts the sum to numeric columns: on pandas >= 2.0 a
# plain sum would otherwise concatenate text columns or fail on date columns.
Incidents_By_Region = (
    data_raw.drop('Source Quality', axis=1)
            .groupby(['Region'])
            .sum(numeric_only=True)
)

# Create the 'Count Incidents' column: the number of incidents (rows) per
# region. The assignment aligns on the region labels of the index.
Incidents_By_Region['Count Incidents'] = data_raw['Region']\
                                        .value_counts(dropna=False)

# Create 'Average per incident': how many people were found dead on average
# per incident in each region.
Incidents_By_Region['Average per incident'] = Incidents_By_Region['Number Dead']/Incidents_By_Region['Count Incidents']

# Move the two new columns forward so they sit right after the first column.
# New order: [first column, 'Count Incidents', 'Average per incident', the rest]
cols = Incidents_By_Region.columns.tolist()
cols = cols[:1] + cols[-2:] + cols[1:-2]

Incidents_By_Region = Incidents_By_Region[cols]
Incidents_By_Region = Incidents_By_Region.reset_index()

Incidents_By_Region.sort_values('Count Incidents', ascending=False)

By aggregating the number of deaths by region we notice that other regions have more casualties because of their higher average number of deaths per incident.
Let's find out if there is any correlation between the number of incidents and the number of deaths.

In [None]:
sns.set(rc={'figure.figsize': (30, 10)})

# Linear regression of deaths per region on incidents per region.
slope, intercept, r_value, p_value, std_err = stats.linregress(
    Incidents_By_Region['Count Incidents'],
    Incidents_By_Region['Number Dead'],
)

# Text box for the scatter plot: fitted line, correlation coefficient,
# p-value and standard error.
lr_text = (
    "y = %(s).2fx + %(i).2f\n"
    "r = %(r).2f \n"
    "p = %(p)f\n"
    "se = %(se).3f"
) % {
    's': slope,
    'i': intercept,
    'r': r_value,
    'p': p_value,
    'se': std_err,
}

# Left panel: one point per region plus the fitted regression line.
plt.subplot(1, 2, 1)
sns.scatterplot(
    x='Count Incidents',
    y='Number Dead',
    data=Incidents_By_Region,
    marker='+',
    hue='Region',
)
sns.regplot(
    x='Count Incidents',
    y='Number Dead',
    data=Incidents_By_Region,
)
# Text box placed at data coordinates (0, 1500).
plt.text(0, 1500, lr_text)

# Right panel: strip plot with one point per incident, grouped by region.
plt.subplot(1, 2, 2)
sns.stripplot(
    x='Region',
    y='Number Dead',
    data=data_raw.dropna(subset=['Number Dead']),
    jitter=True,
    hue="Region",
)
plt.xticks(rotation=75)
plt.show()

Statistically speaking, the number of incidents and the number of deaths share a **strong correlation (r = 0.8)**. <br><br>
On the other hand, the scatter plot reveals that the Mediterranean and the US-Mexico border do not share the same reality: the Mediterranean is one of the most dangerous migration routes, with 6114.0 reported deaths and 984 incidents, a mean of 6.2 deaths per incident. The US-Mexico Border has a total of 1941.0 deaths, which puts it in third place, but it also has the highest number of incidents, 1337, and thus a mean of 1.45. With this information we could argue that the US-MX border is the safest[1] migration route, but we would need additional information about the total number of migration attempts each year.
[1] If we consider the mean deaths per incident to be a measure of success.


This strip plot looks odd: most of our incidents are around 5 persons per incident, with the exception of Central Asia, which has only one case, with 52 persons missing, leaving its mean at 52.

Furthermore, according to this plot, some incidents involve over 100 missing persons: two over 200, and one over 700 people. This is outrageous, so let's investigate further.

In [None]:
# URL of the report(s) for the incident(s) with the highest 'Number Dead'.
data_raw.loc[data_raw['Number Dead'].eq(data_raw['Number Dead'].max()), 'URL']

![Migrant boat captain arrested as survivors of sinking reach Italy](https://i.ibb.co/b7v88dX/Capture.png)

> Police make two arrests as 27 survivors from sinking that killed 800 are brought to Catania on board Italian coastguard ship <

[https://www.theguardian.com/world/2015/apr/21/survivors-800-migrant-boat-disaster-reach-italy-catania](https://www.theguardian.com/world/2015/apr/21/survivors-800-migrant-boat-disaster-reach-italy-catania)

The loss of so many people is a tragedy, no doubt. Unfortunately, it did happen, so this observation is correct. We can do something similar for other points we want to verify.

In [None]:
# Widen the default figure size for this plot.
sns.set(rc={'figure.figsize':(40,10)})
# One point per incident: region on x, 'Number Dead' on y, coloured by cause of death.
g =sns.swarmplot(x='Region', y='Number Dead', hue='Cause of Death', data=data_raw)

# Optional: uncomment the next line to use a logarithmic y axis.
#g.set_yscale('log')
plt.show()

In [None]:
# Deaths per year, one bar per cause of death.
# The aggregated value is 'Number Dead'. The cell used to sum 'Reported Year'
# itself, which is also the pivot index, so the bars showed sums of year
# numbers rather than deaths.
data_raw.pivot_table(index='Reported Year',
                     columns='Cause of Death',
                     values='Number Dead',
                     aggfunc='sum').plot(figsize=(16.1, 10), kind='bar')
# Title corrected: the chart is broken down by cause of death, not sex and age.
plt.title('Deaths by year separated by cause of death')
plt.ylabel('Deaths by cause')
plt.xlabel('Year of event')
plt.show()

In contrast with the first bar plot, which showed the US-Mexico border as the region with the most incidents, we can now see that the number of deaths per incident is actually very dispersed.

This is a persistent difference between regions, and the probable cause is the way migrants travel. For example, at the US-Mexico border people most often have to walk and may get lost, in contrast to traveling by boat through the Mediterranean, where a single boat accident can cause the loss of many lives.

Moreover, a quick exploration of the ratios of deaths between men, women and children can reveal how often families travel together.

In [None]:
# Total 'Number Dead' per region, largest first.
(
    data_raw.groupby(['Region'])[['Number Dead']]
            .sum()
            .sort_values('Number Dead', ascending=False)
)

In [None]:
# Bar plot of 'Number Dead' per region, taken from the per-region summary
# table Incidents_By_Region.
g = sns.catplot(x='Region', y='Number Dead',
                data=Incidents_By_Region,
                height=6, kind="bar")

# Remove the left axis spine and tilt the region labels.
g.despine(left=True)
plt.xticks(rotation=75)
plt.show()

In [None]:
# Split the 'Location Coordinates' text on ', ' into two float columns.
# Tuple-unpacking the `.str` accessor (the previous approach) is not supported
# by current pandas, so the two parts are taken from split(..., expand=True).
# n=1 guarantees at most two parts per row.
coords = data_raw['Location Coordinates'].str.split(', ', n=1, expand=True)

# First part -> 'Latitud', second part -> 'Longitud'; rows keep data_raw's index.
GeoLoc = pd.DataFrame({
    'Latitud': coords[0].astype(float),
    'Longitud': coords[1].astype(float),
})
GeoLoc.info()

# Time series
To understand what might be causing so many losses during this exodus, it is useful to visualize the dates on which most lives were lost. <br>
To this end we can aggregate the total number of incidents per region by month.<br>
<br>
We can use a pivot table to accomplish two of the three Tidy Data principles:
* Columns represent separate variables
* Rows represent individual observations
* Observational units form tables

Tidy data is a standard representation of data that is better suited for data analysis.


In [None]:
# Deaths per year, one bar per region.
# The aggregated value is 'Number Dead'. The cell used to sum 'Reported Year'
# itself, which is also the pivot index, so the bars showed sums of year
# numbers rather than a death count.
data_raw.pivot_table(index='Reported Year',
                     columns='Region',
                     values='Number Dead',
                     aggfunc='sum').plot(figsize=(20, 10), kind='bar')
plt.xlabel('Year')
plt.ylabel('Death count')
plt.title('Deaths by year separated by region', fontsize=20)
plt.show()

In [None]:
# Numeric totals per (year, region). numeric_only=True keeps the sum to
# numeric columns: on pandas >= 2.0 a plain sum would otherwise concatenate
# text columns or fail on date columns.
DaM_by_DR = data_raw.groupby(['Reported Year', 'Region']).sum(numeric_only=True).reset_index()

# One line per region: 'Number Dead' over the years. DaM_by_DR is already a
# flat table, so it is passed as is (the extra reset_index() was redundant).
sns.lineplot(x="Reported Year", y="Number Dead", hue='Region', data=DaM_by_DR)
plt.xlabel('Year')
plt.ylabel('Death count')
plt.title('Lineplot separated by region')