In [12]:
from bokeh.models import ColumnDataSource, HoverTool, LinearAxis, Range1d, FactorRange, CategoricalColorMapper, Legend, LabelSet, Toggle, CustomJS, BoxAnnotation
from bokeh.plotting import figure
from bokeh.io import show, output_file
import pandas as pd
import math
from decimal import Decimal
import numpy
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.cluster import KMeans
import sklearn.metrics as sm
from bokeh.palettes import brewer
from bokeh.palettes import Category20
from operator import itemgetter
from bokeh.layouts import layout



df = pd.read_csv('../data/global_food_prices.csv')
df_r = pd.read_csv('../data/refugee_data.csv')
df_e = pd.read_csv('../data/exchange_rates_to_us_dollar.csv')
df_c = pd.read_csv('../data/Conflict_bbc_compleet.csv')
print("done")

done


# regional info

In [3]:
countries = sorted(list(set(df_c['Country'])))

for country in countries:
    subset = df[df['country']==country]
    if subset.empty:
        continue
    years = sorted(list(set(subset['year'])))
    print('Country: {}'.format(country))
    # get country time data
    first_year = min(years)
    last_year = max(years)
    first_month = min(sorted(list(set(subset[subset['year']==first_year]['month']))))
    last_month = max(sorted(list(set(subset[subset['year']==last_year]['month']))))
    print('Recorded timeframe: 01-0{}-{} to 01-0{}-{}'.format(first_month, first_year, last_month, last_year))

    # product info
    products = sorted(list(set(subset['product'])))           
    product_count = len(products)
    print('Amount of products: {}'.format(product_count))
    max_price = 0
    min_price = 999999
    maxp_product = ''
    minp_product = ''
    max_years = 0
    maxy_product = ''
    for product in products:
        prices = [float(x) for x in list(set(subset[subset['product']==product]['price_in_dollars'])) if x != 'UNKNOWN']
        new_price = sum(prices) / len(prices)
        years = len(list(set(subset[subset['product']==product]['year'])))
        if new_price > max_price: 
            max_price = new_price
            maxp_product = product
        if new_price < min_price:
            minp_product = product
            min_price = new_price
        if years > max_years:
            max_years = years
            maxy_product = product
    if country == 'Afghanistan':
        max_price = max_price / 1000
    print('Longest recorded product: {}'.format(maxy_product))
    print('Most expensive product: {}'.format(maxp_product))
    print('Most expensive product avg price in USD: {}'.format(max_price))
    print('Cheapest product: {}'.format(minp_product))
    print('Cheapest product avg price in USD: {}'.format(min_price))
    print()



Country: Afghanistan
Recorded timeframe: 01-01-2000 to 01-012-2017
Amount of products: 8
Longest recorded product: Wheat
Most expensive product: Livestock (sheep, one-year-old alive female)
Most expensive product avg price in USD: 45.35301287167143
Cheapest product: Wheat
Cheapest product avg price in USD: 0.34511699385059785

Country: Iraq
Recorded timeframe: 01-01-2012 to 01-012-2016
Amount of products: 5
Longest recorded product: Bread (khoboz)
Most expensive product: Oil (vegetable)
Most expensive product avg price in USD: 1.8388194219579694
Cheapest product: Bread (khoboz)
Cheapest product avg price in USD: 0.1305758204255329

Country: Lebanon
Recorded timeframe: 01-01-2012 to 01-012-2016
Amount of products: 23
Longest recorded product: Beans (white)
Most expensive product: Milk (powder)
Most expensive product avg price in USD: 8.268529001579015
Cheapest product: Salt
Cheapest product avg price in USD: 0.36248049323587106

Country: Pakistan
Recorded timeframe: 01-01-2004 to 01-012

In [31]:
def correlation_valuta(country, product):
    subset_prices = df[(df['country']==country) & (df['product']==product)]
    if subset.empty:
        return False
    currency = list(subset_prices['currency'])[0]
    subset_valuta = df_e[df_e['currency']==currency]
    years_prices = sorted(list(set(subset_prices['year'])))
    years_valuta = sorted(list(set(subset_valuta['year'])))
    years = sorted(list(set(years_prices).intersection(years_valuta)))
    subset_prices = subset_prices[subset_prices['year'].isin(years)]
    subset_valuta = subset_valuta[subset_valuta['year'].isin(years)]
    valuta_values = []
    price_values = []
    time_values_y = []
    time_values_m = []
    for year in years:
        subset_prices_year = subset_prices[subset_prices['year']==year]
        subset_valuta_year = subset_valuta[subset_valuta['year']==year]
        for i in range(12):
            month = i + 1
            time_values_y.append(year)
            time_values_m.append(month)
            subset_prices_month = subset_prices_year[subset_prices_year['month']==month]
            subset_valuta_month = subset_valuta_year[subset_valuta_year['month']==month]
            if subset_prices_month.empty:
                price_values.append(None)
            else:
                price = list(subset_prices_month['avg_price'])[0]
                if price == 'UNKNOWN':
                    price_values.append(None)
                else:
                    price_values.append(round(float(price), 2))
                    
            if subset_valuta_month.empty:
                valuta_values.append(None)
            else:
                valuta = list(subset_valuta_month['value'])[0]
                if valuta == 'UNKNOWN':
                    valuta_values.append(None)
                else:
                    valuta_values.append(round(1/float(valuta), 5))
    valuta_values = interpolate(valuta_values)
    price_values = interpolate(price_values)
    data = zip(valuta_values, price_values, time_values_y, time_values_m)
    data = sorted(data, key=lambda data: data[0])
    return list(zip(*data))

# Product price vs valuta value graph

In [32]:
country = 'Afghanistan'
product = 'Bread'

currency = df[df['country']==country].currency.unique()[0]

data = correlation_valuta(country, product)
x, y, time_y, time_m = data

col_df = pd.DataFrame()
col_df['rate'] = x
col_df['price'] = y
col_df['year'] = time_y
col_df['month'] = time_m


colors = Category20[len(col_df.year.unique())]
colormap = {year: colors[i] for i, year in enumerate(col_df.year.unique())}
colors = [colormap[x] for x in col_df.year]

col_df['color'] = colors
src = ColumnDataSource(col_df)

p = figure(height=500, width=800)
p.circle('rate', 'price', source=src, size=5, color='color', legend='year')
legend_width = (min(x) - max(x)) * 0.2
p.y_range = Range1d(min(y), max(y))
p.title.text = "Relation {} price with {} currency value in USD".format(product, country) 
p.xaxis.axis_label="{} value in USD".format(currency)
p.yaxis.axis_label="Bread Price in {}".format(currency)
p.x_range = Range1d(max(x), min(x) + legend_width)
h = HoverTool(tooltips = [('Year', '@year'), ('Month', '@month')])
p.add_tools(h)

show(p)

print(correlation(x, y))

-0.6007688922911321


### Extension with Kclusters

In [6]:
k_df = col_df[['rate', 'price']]
col_df2 = col_df

model = KMeans(n_clusters=4)
model.fit(k_df)
col_df2['cluster_data'] = model.labels_
colors = Category20[len(col_df2.cluster_data.unique())]
colormap = {cluster: colors[i] for i, cluster in enumerate(col_df2.cluster_data.unique())}
colors = [colormap[x] for x in col_df2.cluster_data]

col_df['color'] = colors
src = ColumnDataSource(col_df)

p = figure(height=500, width=800)
p.circle('rate', 'price', source=src, size=5, color='color')
legend_width = (min(x) - max(x)) * 0.2
p.y_range = Range1d(min(y), max(y))
p.title.text = "Clustered using kClustering: Relation {} price with {} currency value in USD".format(product, country) 
p.xaxis.axis_label="{} value in USD".format(currency)
p.yaxis.axis_label="Bread Price in {}".format(currency)
p.x_range = Range1d(max(x), min(x) + legend_width)
h = HoverTool(tooltips = [('Year', '@year'), ('Month', '@month')])
p.add_tools(h)

show(p)


### Extension with regression

In [33]:
X = np.vstack(x)
X = np.column_stack((X, np.ones(X.shape[0])))
Y = y
a, b = np.linalg.lstsq(X, Y)[0]

xl = col_df['rate']

rb = y
xb = xl
yb = a * xb + b

xl = list(xl)

mse = sum((rb-yb) **2) / len(xb)
print("Mean Squared Error: {}.".format(mse))
rmse = np.sqrt(mse)
print("Root Mean Squared Error: {}".format(rmse))
print("{} {} price expected to be within {} {} of predicted line fit.".format(country, product, round(2*rmse, 2), currency))

linr = pd.DataFrame()
xl = [min(xl) / 2] + xl
linr['data'] = xl
xl = linr['data']

p = figure(height=500, width=800)
p.circle('rate', 'price', source=src, size=5)
legend_width = (min(x) - max(x)) * 0.2
p.y_range = Range1d(min(y), max(y)*1.1)
p.title.text = "Linear Regression: Relation {} price with {} currency value in USD".format(product, country) 
p.xaxis.axis_label="{} value in USD".format(currency)
p.yaxis.axis_label="Bread Price in {}".format(currency)
p.line(xl, a * xl + b, color='red')
p.x_range = Range1d(max(xl), min(xl) + legend_width)

show(p)

Mean Squared Error: 0.6021704724981778.
Root Mean Squared Error: 0.7759964384571477
Afghanistan Bread price expected to be within 1.55 AFN of predicted line fit.


# Valuta Value + Product price over time graph

In [34]:
s_data = zip(x, y, time_y, time_m)
s_data = sorted(s_data, key=itemgetter(2,3))
x2, y2, time_y2, time_m2 = list(zip(*s_data))
time_y2 = [str(x) for x in time_y2]
time_m2 = [str(x) for x in time_m2]
x_axis = list(zip(time_y2, time_m2))
l = len(list(set(time_y2)))
width = int(l * (1000/l))
p = figure(height = 300, width=width, x_range=FactorRange(*x_axis))
p.yaxis.axis_label = 'Price in {}'.format(currency)
p.y_range = Range1d(min(y2), max(y2))
p.extra_y_ranges = {'valuta': Range1d(start=min(x2), end=max(x2))}
p.add_layout(LinearAxis(y_range_name='valuta', axis_label='{} value in USD'.format(currency)), 'right')
p.xaxis.axis_label_text_font_size = "10pt"
p.xaxis.major_label_text_font_size = "5pt"
p.xaxis.axis_label = "Date"
p.title.text = "{} {} price in {}, and {} value in USD over time.".format(country, product, currency, currency)
p.line(x_axis, x2, y_range_name='valuta', color='orange', legend='{} value in USD'.format(currency))
p.line(x_axis, y2, legend='Price in {}'.format(currency))

year_start = int(x_axis[0][0])
label_year = 2015
label_month = 11
label_loc = ((label_year - year_start) * 12) + label_month + 1.4*(label_year - year_start) - 0.5

label_source = pd.DataFrame(dict(time=[label_loc], value=[44], text=['A new Taliban splinter group,\n headed by Mullah Rasool, announces its presence in southern Afghanistan.']))

labels = LabelSet(x='time', y='value', text='text', level='glyph', x_offset=5, y_offset=5, 
                  source=ColumnDataSource(label_source), render_mode='css', border_line_color='black', 
                  background_fill_color='white', background_fill_alpha=1)
p.add_layout(labels)


code = '''\
if toggle.active
    object.background_fill_alpha = 0
    object.text_alpha = 0
    object.border_line_alpha = 0
    console.log 'enabling annotations'
else
    object.background_fill_alpha = 1
    object.text_alpha = 1
    object.border_line_alpha = 1
    console.log 'disabling annotations'
'''

callback1 = CustomJS.from_coffeescript(code=code, args={})
toggle1 = Toggle(label="label", button_type="success", callback=callback1)
callback1.args = {'toggle': toggle1, 'object': labels}

show(layout([p], [toggle1]))


# TODO
# Add conflicts to plot

# Helper Fuctions

In [6]:
def interpolate(values):
    for i in range(len(values)):
        if values[i] == None:
            ii = i
            x = None
            while x == None:
                ii -= 1
                if ii < 0: x = values[i]
                x = values[ii]
            ii = i
            y = None
            while y == None:
                ii -= 1
                if ii >= len(values): y = values[i]
                y = values[ii]
            values[i] = (x + y) / 2
    return values

def average(x):
    return float(sum(x)) / len(x)

def correlation(x, y):
    n = len(x)
    avg_x = average(x)
    avg_y = average(y)
    dif = 0
    xdif2 = 0
    ydif2 = 0
    for i in range(n):
        xdif = x[i] - avg_x
        ydif = y[i] - avg_y
        dif += xdif * ydif
        xdif2 += xdif * xdif
        ydif2 += ydif * ydif
    return dif / math.sqrt(xdif2 * ydif2)