In [1]:
import numpy as np
import pandas as pd
pd.options.display.precision = 2

In [2]:
import re
import os
# Move the working directory two levels up, presumably so that the relative
# 'data/...' paths used by later cells resolve (verify against the repo layout).
# NOTE: the path is relative to the current directory, so running this cell a
# second time in the same kernel moves up two more levels.
os.chdir('../../')

In [3]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
# Default size for matplotlib figures, given as [width, height] in inches.
matplotlib.rcParams['figure.figsize'] = [15, 10]

In [4]:
#import seaborn as sns
import plotly
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# Enable plotly's offline rendering inside the notebook. connected=True loads
# plotly.js from the online CDN instead of embedding it in the notebook file.
init_notebook_mode(connected=True)

# Read predictor data

In [5]:
# Load the commodity price table. skiprows=1 discards the first line of the file,
# so the second line is the one parsed as the column header.
predictors_df = pd.read_csv('data/Primary_Commodity_Price_System_PCPS.csv', skiprows=1)

In [6]:
def _year_from_date_label(date_label):
    """Return the second space-separated field of ``date_label`` as an int."""
    return int(date_label.split(' ')[1])


# Add an integer 'year' column derived from each row's 'Date' string.
predictors_df['year'] = predictors_df['Date'].apply(_year_from_date_label)

In [7]:
# Average all rows that share a year into one row per year.
# numeric_only=True restricts the aggregation to numeric columns: the string
# 'Date' column is still part of predictors_df, and pandas >= 2.0 raises a
# TypeError on it instead of silently dropping it as older versions did.
predictors_yearly_df = predictors_df.groupby('year').mean(numeric_only=True)

# Add lagged copies of every column under '<name>_shifted'.
# shift() moves values down by one row, so the lag equals "previous year" only
# when the yearly index has no gaps.
predictors_yearly_df = predictors_yearly_df.join(
    predictors_yearly_df.shift(),
    rsuffix='_shifted',
)

# Keep the years 2003 through 2018 (label-based slice, both ends inclusive).
predictors_yearly_df = predictors_yearly_df.loc[2003:2018]

In [8]:
# Preview the first rows of the yearly table, including the '<name>_shifted' lag columns.
predictors_yearly_df.head()

Unnamed: 0_level_0,Index,Energy index,Non-Fuel index,Coal index,Natural gas index,APSP crude oil_USD_per_bbl,Propane,Industrial Materials index,Food and beverage index,Fertilizer,Index_shifted,Energy index_shifted,Non-Fuel index_shifted,Coal index_shifted,Natural gas index_shifted,APSP crude oil_USD_per_bbl_shifted,Propane_shifted,Industrial Materials index_shifted,Food and beverage index_shifted,Fertilizer_shifted
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2003,65.99,78.15,57.58,43.29,122.67,66.92,118.85,56.87,67.88,63.89,56.14,62.99,51.41,39.49,85.2,62.37,84.21,48.89,62.27,49.79
2004,79.37,98.81,65.94,82.92,134.16,82.5,152.77,72.09,73.98,74.88,65.99,78.15,57.58,43.29,122.67,66.92,118.85,56.87,67.88,63.89
2005,97.94,136.64,71.19,71.92,195.76,117.35,187.74,83.98,75.95,92.23,79.37,98.81,65.94,82.92,134.16,82.5,152.77,72.09,73.98,74.88
2006,114.36,154.33,86.73,76.88,189.1,141.65,208.49,121.26,81.92,89.3,97.94,136.64,71.19,71.92,195.76,117.35,187.74,83.98,75.95,92.23
2007,129.29,169.42,101.54,98.37,193.41,150.35,248.43,144.61,93.37,129.2,114.36,154.33,86.73,76.88,189.1,141.65,208.49,121.26,81.92,89.3


# Read response data (Sponsored Student Counts)

In [9]:
# Reshape the wide table (one row per country, one column per year) into long
# form: one row per (country, year) pair with the value in 'students_count'.
response_df = (
    pd.read_csv("data/sponsor_data_countrywise_backup.csv")
    .set_index('country')
    .rename_axis('year', axis='columns')
    .stack()
    .rename('students_count')
    .reset_index()
)

# The year labels originate from the CSV header; store them as integers.
response_df['year'] = response_df['year'].astype(int)

# Save the long-form table. This happens before any later filtering of
# response_df, so the file contains every row produced by the reshape.
response_df.to_csv('data/sponsored_students_counts.csv', index=False)

In [10]:
# Keep only the rows whose country label does not contain 'Total'.
total_pattern = re.compile('.*Total.*')
mask = response_df['country'].apply(
    lambda label: total_pattern.search(label) is None
)
response_df = response_df.loc[mask]

In [11]:
# Sum of students_count over all countries for each year, ordered by year.
years_df = response_df.groupby('year')['students_count'].sum().sort_index()

yearly_bar = go.Bar(
    x=years_df.index.values,
    y=years_df.values,
    name='Sponsored Student Count',
    opacity=0.5,
)
traces = [yearly_bar]

layout = {
    'title': 'Total Number of Sponsored Students By Year',
    'xaxis': {'title': 'Year'},
    'yaxis': {'title': 'Number of Sponsored Students'},
    # Horizontal legend positioned at x=0.3, y=1.1.
    'legend': {
        'x': 0.3,
        'y': 1.1,
        'traceorder': 'normal',
        'font': {
            'family': 'sans-serif',
            'size': 12,
            'color': '#000',
        },
        'orientation': "h",
    },
}

fig = {'data': traces, 'layout': layout}
iplot(fig)

In [12]:
# Total students_count per country over all years.
country_totals = response_df.groupby('country')['students_count'].sum()

# The 30 countries with the largest totals, largest first.
top_countries_df = country_totals.sort_values(ascending=False).head(30)

# Their names, in the same descending order.
top_countries = top_countries_df.index.values

In [13]:
# Bar chart of the per-country totals held in top_countries_df
# (one bar per country, in the order of that Series).
traces = [
    go.Bar(
        x=top_countries_df.index.values,
        y=top_countries_df.values,
        name='Sponsored Student Count',
        opacity=0.5,
    )
]

layout = dict(
    title='Total Number of Sponsored Students By Country',
    # The x axis carries country names (the index of top_countries_df), so it is
    # labelled 'Country'; it was previously mislabelled 'Year'.
    xaxis=dict(title='Country'),
    yaxis=dict(title='Number of Sponsored Students'),
    # Horizontal legend positioned at x=0.3, y=1.1.
    legend=dict(
        x=0.3,
        y=1.1,
        traceorder='normal',
        font=dict(
            family='sans-serif',
            size=12,
            color='#000',
        ),
        orientation="h",
    ),
)

fig = dict(data=traces, layout=layout)
iplot(fig)

In [14]:
# Attach the yearly predictor columns to every response row by matching the
# 'year' column against the index of predictors_yearly_df. DataFrame.join is a
# left join by default, so rows whose year has no predictor entry are kept with NaN.
training_set = response_df.join(predictors_yearly_df, on='year')
# Save the joined table; the frame's index is written as the first column
# because index=False is not passed.
training_set.to_csv('data/training_set.csv')

In [15]:
# Index the training table by (country, year).
# The frame is rebuilt from its inputs instead of being mutated with
# inplace=True, so this cell can be re-run safely: a second in-place set_index
# raises KeyError because 'country' and 'year' have already left the columns.
training_set = (
    response_df
    .join(predictors_yearly_df, on='year')
    .set_index(['country', 'year'])
)

# Visualise

## Correlation

In [16]:
# Pivot the long table into a year-by-country grid of students_count values.
df = (
    response_df
    .set_index(['country', 'year'])['students_count']
    .unstack('country')
)

# Keep years up to and including 2018 and only the top countries, in reversed
# rank order (smallest total first).
df = df.loc[:2018, top_countries[::-1]]

In [17]:
# Put predictors and per-country counts side by side (both indexed by year) and
# compute the pairwise correlation of every column with every other column.
combined_df = predictors_yearly_df.join(df)
full_correlation = combined_df.corr()

# Keep only the block of interest: countries as rows, predictors as columns.
correlation_df = full_correlation.loc[df.columns.values, predictors_yearly_df.columns]
correlation_df

Unnamed: 0,Index,Energy index,Non-Fuel index,Coal index,Natural gas index,APSP crude oil_USD_per_bbl,Propane,Industrial Materials index,Food and beverage index,Fertilizer,Index_shifted,Energy index_shifted,Non-Fuel index_shifted,Coal index_shifted,Natural gas index_shifted,APSP crude oil_USD_per_bbl_shifted,Propane_shifted,Industrial Materials index_shifted,Food and beverage index_shifted,Fertilizer_shifted
Japan,0.19,0.02,0.41,0.304,-0.28,0.13,-0.24,0.195,0.47,0.0802,0.34,0.2,0.51,0.32,-0.09,0.29,0.00712,0.38,0.57,0.09
Azerbaijan,0.13,0.09,0.16,-0.0711,0.03,0.15,-0.15,-0.00897,0.3,0.0015,0.31,0.3,0.31,0.09,0.19,0.37,-0.0354,0.2,0.39,0.11
Maldives,-0.04,-0.12,0.09,0.0561,-0.28,-0.09,-0.21,-0.02,0.11,0.0837,0.31,0.31,0.27,0.52,0.37,0.28,0.223,0.18,0.34,0.53
Kuwait,0.29,0.16,0.45,0.538,-0.14,0.2,0.32,0.48,0.38,0.0844,-0.03,-0.15,0.16,0.16,-0.37,-0.12,-0.119,0.14,0.12,-0.03
Philippines,0.46,0.35,0.56,0.454,0.02,0.39,0.34,0.612,0.48,0.313,0.41,0.38,0.43,0.57,0.25,0.36,0.321,0.39,0.4,0.61
Turkey,0.51,0.58,0.36,0.645,0.65,0.5,0.56,0.326,0.4,0.717,0.27,0.27,0.25,0.2,0.24,0.25,0.373,0.38,0.17,0.23
Pakistan,0.15,0.16,0.12,0.169,0.16,0.11,0.16,0.104,0.1,0.43,0.39,0.47,0.24,0.57,0.62,0.37,0.569,0.27,0.26,0.68
Iran,0.56,0.58,0.49,0.581,0.51,0.53,0.46,0.459,0.44,0.779,0.44,0.42,0.44,0.51,0.34,0.36,0.668,0.59,0.34,0.47
Libya,0.44,0.4,0.45,0.241,0.2,0.41,0.36,0.534,0.36,0.349,0.51,0.56,0.4,0.61,0.57,0.5,0.52,0.42,0.35,0.75
Kazakhstan,0.49,0.35,0.64,0.68,-0.05,0.43,0.25,0.574,0.61,0.249,0.33,0.19,0.5,0.49,-0.11,0.24,0.0698,0.41,0.48,0.31


## Heatmap

In [18]:
# Heatmap of correlation_df: predictor names along x, country names along y.
trace = go.Heatmap(
    x=correlation_df.columns.values,
    y=correlation_df.index.values,
    z=correlation_df.values,
)

# Fixed 900x900 figure; autosize is switched off so the explicit size is used.
layout = go.Layout(
    title="Correlation Between the Number of Sponsored Students and Commodity Indices",
    autosize=False,
    width=900,
    height=900,
)

fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

## Comparison Chart

In [19]:
# Commodity series plotted as lines in the comparison chart.
# Currently left out: 'Coal index', 'Natural gas index',
# 'APSP crude oil_USD_per_bbl', 'Propane', 'Industrial Materials index',
# 'Food and beverage index', 'Fertilizer'.
predictors = ['Index', 'Energy index', 'Non-Fuel index']

In [20]:
# Trace order matters: the predictor lines come first, followed by one bar trace
# per country. The dropdown's visibility masks below index into that order.
traces = []

# Adding predictor lines (drawn against the left-hand y axis)
for predictor in predictors:
    traces.append(
        go.Scatter(
            x=predictors_yearly_df.index.values,
            y=predictors_yearly_df[predictor].values,
            name=predictor,
            mode='lines',
        )
    )

# Adding response bar-chart by country (drawn against the right-hand axis 'y2').
# Only the first country's bars start visible; the dropdown switches between them.
# The per-country frame is named country_df so that it does not overwrite the
# notebook-level `df` pivot table that the correlation cell reads.
countries = top_countries
for position, country in enumerate(countries):
    country_df = response_df[response_df.country == country]
    traces.append(
        go.Bar(
            x=country_df.year.values,
            y=country_df.students_count,
            yaxis='y2',
            visible=(position == 0),
            name='Sponsored Student Count',
            opacity=0.5,
        )
    )

# Adding dropdown menu: one button per country. Each button keeps every
# predictor line visible and shows exactly one country's bar trace.
buttons = []
pr_count = len(predictors)
for idx, country in enumerate(countries):
    visible_mask = [True] * pr_count + [False] * len(countries)
    visible_mask[pr_count + idx] = True
    buttons.append(
        dict(
            label=country,
            method='update',
            args=[{'visible': visible_mask}],
        )
    )

layout = dict(
    title='Trends in Commodity Prices and Number of Sponsored Students by Country',
    xaxis=dict(title='Year'),

    # Both y axes use fixed ranges, so values outside them are not shown in full.
    yaxis=dict(
        title='Commodity Index Value',
        range=[0, 250],
    ),
    yaxis2=dict(
        title='Sponsored Student Count',
        overlaying='y',
        side='right',
        range=[0, 800],
    ),

    # Dropdown anchored by its top-left corner at x=0.0, y=1.1.
    updatemenus=[
        dict(
            active=0,
            pad={'r': 0, 't': 0},
            x=0.0,
            xanchor='left',
            y=1.1,
            yanchor='top',
            buttons=buttons,
        )
    ],

    # Horizontal legend positioned at x=0.3, y=1.1.
    legend=dict(
        x=0.3,
        y=1.1,
        traceorder='normal',
        font=dict(
            family='sans-serif',
            size=12,
            color='#000',
        ),
        orientation="h",
    ),
)

fig = dict(data=traces, layout=layout)

iplot(fig)