In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import plotly 
import plotly.plotly as py 
import plotly.figure_factory as ff
from plotly.graph_objs import *
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import seaborn as sns
import scipy.stats as stats
import math
from sklearn import linear_model
from sklearn.linear_model import SGDRegressor

In [2]:
monthlyAsylumSeekers = pd.read_csv("../data/refugee_monthly.csv", sep=",", engine='python', encoding ='latin1')   
monthlyAsylumSeekers = monthlyAsylumSeekers[monthlyAsylumSeekers['Value'] != '*']
monthlyAsylumSeekers['Value'] = monthlyAsylumSeekers['Value'].apply(pd.to_numeric)
monthlyAsylumSeekers.head()

Unnamed: 0,Country / territory of asylum/residence,Origin,Year,Month,Value
0,Greece,Georgia,2008,March,140
1,Greece,Georgia,2008,April,199
2,Greece,Georgia,2008,May,210
3,Greece,Georgia,2008,June,208
4,Greece,Georgia,2008,July,224


In [3]:
monthlyAsylumSeekers.sort_values(by=['Year'])
monthlyAsylumSeekers.rename(columns={'Country / territory of asylum/residence':'CountryofAsylum'}, inplace=True)
monthlyAsylumSeekers.head()

Unnamed: 0,CountryofAsylum,Origin,Year,Month,Value
0,Greece,Georgia,2008,March,140
1,Greece,Georgia,2008,April,199
2,Greece,Georgia,2008,May,210
3,Greece,Georgia,2008,June,208
4,Greece,Georgia,2008,July,224


In [4]:
def cleanDataSets(df):
    # dropping empty rows 
    df2=df.dropna(subset=['1990','1991','1992','1993','1994','1995','1996','1997','1998','1999','2000','2001','2002',
                 '2003','2004','2005','2006','2007','2008','2009','2010','2011','2012','2013','2014','2015','2016'], 
                  how='all')
    # dropping empty columns 
    df3=df2.dropna(axis=1, how='all')
    df4=df3.drop('Indicator Code',axis=1)
    df5=df4.drop('Indicator Name',axis=1)
    return df5

In [5]:
GDPByCountry = pd.read_csv("../data/GDP/GDP.csv", sep='\t', engine='python', encoding ='latin1') 
GDP=cleanDataSets(GDPByCountry)
GDP.drop(GDP.columns[[2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,
                      31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58]], axis=1, inplace=True)
#GDP.rename_axis("Country Name", axis="columns", inplace=True)
#GDP=GDP.rename_axis('Country Name', axis=1,inplace=True)
#del(GDP.index["Country Name"])
#data.set_index('Locality', inplace=True)
GDP.head()

Unnamed: 0,Country Name,Country Code
0,Aruba,ABW
1,Afghanistan,AFG
2,Angola,AGO
3,Albania,ALB
4,Andorra,AND


In [6]:
GDP.rename(columns={'Country Name':'CountryofAsylum'}, inplace=True)
GDP.head()

Unnamed: 0,CountryofAsylum,Country Code
0,Aruba,ABW
1,Afghanistan,AFG
2,Angola,AGO
3,Albania,ALB
4,Andorra,AND


Join for aslyum country code

In [7]:
Combined=monthlyAsylumSeekers.merge(GDP,on='CountryofAsylum', how='inner')
Combined.rename(columns={'Country Code':'CountryCodeAsylum'}, inplace=True)
Combined.head()

Unnamed: 0,CountryofAsylum,Origin,Year,Month,Value,CountryCodeAsylum
0,Greece,Georgia,2008,March,140,GRC
1,Greece,Georgia,2008,April,199,GRC
2,Greece,Georgia,2008,May,210,GRC
3,Greece,Georgia,2008,June,208,GRC
4,Greece,Georgia,2008,July,224,GRC


Rename to make merge (for origin country code) easier

In [8]:
GDP.rename(columns={'CountryofAsylum':'Origin'}, inplace=True)
GDP.head()

Unnamed: 0,Origin,Country Code
0,Aruba,ABW
1,Afghanistan,AFG
2,Angola,AGO
3,Albania,ALB
4,Andorra,AND


Now merge for origin country code

In [9]:
Combined=Combined.merge(GDP,on='Origin', how='inner')
Combined.rename(columns={'Country Code':'CountryCodeOrigin'}, inplace=True)
Combined.head()

Unnamed: 0,CountryofAsylum,Origin,Year,Month,Value,CountryCodeAsylum,CountryCodeOrigin
0,Greece,Georgia,2008,March,140,GRC,GEO
1,Greece,Georgia,2008,April,199,GRC,GEO
2,Greece,Georgia,2008,May,210,GRC,GEO
3,Greece,Georgia,2008,June,208,GRC,GEO
4,Greece,Georgia,2008,July,224,GRC,GEO


##### Now that we have the country codes for origin and asylum we can easily merge with the distance data set

In [10]:
capitalDistance= pd.read_csv("../data/capdist.csv", sep=',', engine='python', encoding ='latin1') 
capitalDistance.drop(capitalDistance.columns[[0,2,5]], axis=1, inplace=True)
capitalDistance.head()

Unnamed: 0,ida,idb,kmdist
0,USA,CAN,731
1,USA,BHM,1623
2,USA,CUB,1813
3,USA,HAI,2286
4,USA,DOM,2358


rename ida to CountryCodeAsylum and idb to CountryCodeOrigin to make merging easier

In [11]:
capitalDistance.rename(columns={'ida':'CountryCodeAsylum','idb':'CountryCodeOrigin'}, inplace=True)
capitalDistance.head()

Unnamed: 0,CountryCodeAsylum,CountryCodeOrigin,kmdist
0,USA,CAN,731
1,USA,BHM,1623
2,USA,CUB,1813
3,USA,HAI,2286
4,USA,DOM,2358


In [12]:
final=Combined.merge(capitalDistance, on=['CountryCodeAsylum', 'CountryCodeOrigin'], how='inner')
final.head()

Unnamed: 0,CountryofAsylum,Origin,Year,Month,Value,CountryCodeAsylum,CountryCodeOrigin,kmdist
0,Greece,Ghana,2001,February,1,GRC,GHA,4365
1,Greece,Ghana,2001,November,1,GRC,GHA,4365
2,Greece,Ghana,2001,December,5,GRC,GHA,4365
3,Greece,Ghana,2002,January,1,GRC,GHA,4365
4,Greece,Ghana,2002,August,1,GRC,GHA,4365


In [13]:
from sklearn import preprocessing
countryTrainer=preprocessing.LabelEncoder()
originTrainer=preprocessing.LabelEncoder()
monthTrainer=preprocessing.LabelEncoder()

Transform categorical into numerical

In [14]:

monthlyAsylumSeekers['CountryofAsylum']=countryTrainer.fit_transform(monthlyAsylumSeekers['CountryofAsylum'])
monthlyAsylumSeekers['Month']=monthTrainer.fit_transform(monthlyAsylumSeekers['Month'])
monthlyAsylumSeekers['Origin']=originTrainer.fit_transform(monthlyAsylumSeekers['Origin'])
monthlyAsylumSeekers[['CountryofAsylum','Origin', 'Month']].head()

Unnamed: 0,CountryofAsylum,Origin,Month
0,15,68,7
1,15,68,0
2,15,68,8
3,15,68,6
4,15,68,5


In [15]:
from sklearn.utils import shuffle
final = shuffle(final)
final.head()

Unnamed: 0,CountryofAsylum,Origin,Year,Month,Value,CountryCodeAsylum,CountryCodeOrigin,kmdist
37195,Cyprus,Iraq,2013,January,2,CYP,IRQ,1037
5668,Poland,Belarus,2004,October,4,POL,BLR,454
30215,Poland,Peru,2001,June,1,POL,PER,11608
42712,Canada,Jordan,2003,October,9,CAN,JOR,9074
68427,Canada,Nicaragua,2004,March,5,CAN,NIC,3822


Test to see if it works

In [16]:
print monthTrainer.inverse_transform([0,1,2,3,4,5,6,7,8,9,10,11])
print monthTrainer.transform(['March','April','May'])
print countryTrainer.transform(['Greece'])
print originTrainer.transform(['Georgia'])

[u'April' u'August' u'December' u'February' u'January' u'July' u'June'
 u'March' u'May' u'November' u'October' u'September']
[7 0 8]
[15]
[68]


SGD **Regressor** first so y needs to be numerical value, Value.

In [17]:
X=monthlyAsylumSeekers[['CountryofAsylum','Origin','Month']]
y=monthlyAsylumSeekers['Value']
y.head()

0    140
1    199
2    210
3    208
4    224
Name: Value, dtype: int64

Test linear regression (not important, but wanted to see)

In [18]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)
X_train.shape, X_test.shape


This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.



((372364, 3), (124122, 3))

SGDClassifier to predict where a country of asylum based on value, month, and country of origin

In [22]:
clf = linear_model.SGDRegressor()
clf.fit(X_train, y_train)
clf.score(X_test,y_test)


max_iter and tol parameters have been added in <class 'sklearn.linear_model.stochastic_gradient.SGDRegressor'> in 0.19. If both are left unset, they default to max_iter=5 and tol=None. If tol is not None, max_iter defaults to max_iter=1000. From 0.21, default max_iter will be 1000, and default tol will be 1e-3.



-1.90152856220271e+19

In [20]:
clf.predict([[countryTrainer.transform(['Greece']),originTrainer.transform(['Afghanistan'])[0],monthTrainer.transform(['March'])[0]]])

array([ 33.04489268])