In [166]:
from bokeh.models import ColumnDataSource, HoverTool, LinearAxis, Range1d, FactorRange, CategoricalColorMapper, Legend, Label, Toggle, CustomJS, BoxAnnotation, Span, Title
from bokeh.plotting import figure
from bokeh.io import show, output_file
import pandas as pd
import math
from decimal import Decimal
import numpy
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.cluster import KMeans
import sklearn.metrics as sm
from bokeh.palettes import brewer
from bokeh.palettes import Category20
from operator import itemgetter
from bokeh.layouts import layout
import random



df = pd.read_csv('../data/global_food_prices.csv')
df_r = pd.read_csv('../data/refugee_data.csv')
df_e = pd.read_csv('../data/exchange_rates_to_us_dollar.csv')
df_c = pd.read_csv('../data/Conflict_bbc_same_name.csv')
print("done")


done


# regional info

In [154]:
countries = sorted(list(set(df_c['Country'])))

for country in countries:
    subset = df[df['country']==country]
    if subset.empty:
        continue
    years = sorted(list(set(subset['year'])))
    print('Country: {}'.format(country))
    # get country time data
    first_year = min(years)
    last_year = max(years)
    first_month = min(sorted(list(set(subset[subset['year']==first_year]['month']))))
    last_month = max(sorted(list(set(subset[subset['year']==last_year]['month']))))
    print('Recorded timeframe: 01-0{}-{} to 01-0{}-{}'.format(first_month, first_year, last_month, last_year))

    # product info
    products = sorted(list(set(subset['product'])))           
    product_count = len(products)
    print('Amount of products: {}'.format(product_count))
    max_price = 0
    min_price = 999999
    maxp_product = ''
    minp_product = ''
    max_years = 0
    maxy_product = ''
    for product in products:
        prices = [float(x) for x in list(set(subset[subset['product']==product]['price_in_dollars'])) if x != 'UNKNOWN']
        if len(prices) > 0:
            new_price = sum(prices) / len(prices)
        else:
            continue 
        years = len(list(set(subset[subset['product']==product]['year'])))
        if new_price > max_price: 
            max_price = new_price
            maxp_product = product
        if new_price < min_price:
            minp_product = product
            min_price = new_price
        if years > max_years:
            max_years = years
            maxy_product = product
    if country == 'Afghanistan':
        max_price = max_price / 1000
    print('Longest recorded product: {}'.format(maxy_product))
    print('Most expensive product: {}'.format(maxp_product))
    print('Most expensive product avg price in USD: {}'.format(max_price))
    print('Cheapest product: {}'.format(minp_product))
    print('Cheapest product avg price in USD: {}'.format(min_price))
    print()



Country: Afghanistan
Recorded timeframe: 01-01-2000 to 01-012-2017
Amount of products: 8
Longest recorded product: Wheat
Most expensive product: Livestock (sheep, one-year-old alive female)
Most expensive product avg price in USD: 45.35301287167144
Cheapest product: Wheat
Cheapest product avg price in USD: 0.3451169938505977

Country: Iran  (Islamic Republic of)
Recorded timeframe: 01-01-2012 to 01-012-2017
Amount of products: 5
Longest recorded product: Eggs
Most expensive product: Eggs
Most expensive product avg price in USD: 3.830799476483619
Cheapest product: Sugar
Cheapest product avg price in USD: 1.1091221688053439

Country: Iraq
Recorded timeframe: 01-01-2012 to 01-012-2016
Amount of products: 5
Longest recorded product: Bread (khoboz)
Most expensive product: Oil (vegetable)
Most expensive product avg price in USD: 1.8388194219579694
Cheapest product: Bread (khoboz)
Cheapest product avg price in USD: 0.1305758204255329

Country: Lebanon
Recorded timeframe: 01-01-2012 to 01-012-

In [126]:
def correlation_valuta(country, product):
    subset_prices = df[(df['country']==country) & (df['product']==product)]
    if subset.empty:
        return False
    currency = list(subset_prices['currency'])[0]
    subset_valuta = df_e[df_e['currency']==currency]
    years_prices = sorted(list(set(subset_prices['year'])))
    years_valuta = sorted(list(set(subset_valuta['year'])))
    years = sorted(list(set(years_prices).intersection(years_valuta)))
    subset_prices = subset_prices[subset_prices['year'].isin(years)]
    subset_valuta = subset_valuta[subset_valuta['year'].isin(years)]
    valuta_values = []
    price_values = []
    time_values_y = []
    time_values_m = []
    for year in years:
        subset_prices_year = subset_prices[subset_prices['year']==year]
        subset_valuta_year = subset_valuta[subset_valuta['year']==year]
        for i in range(12):
            month = i + 1
            time_values_y.append(year)
            time_values_m.append(month)
            subset_prices_month = subset_prices_year[subset_prices_year['month']==month]
            subset_valuta_month = subset_valuta_year[subset_valuta_year['month']==month]
            if subset_prices_month.empty:
                price_values.append(None)
            else:
                price = list(subset_prices_month['avg_price'])[0]
                if price == 'UNKNOWN':
                    price_values.append(None)
                else:
                    price_values.append(round(float(price), 2))
                    
            if subset_valuta_month.empty:
                valuta_values.append(None)
            else:
                valuta = list(subset_valuta_month['value'])[0]
                if valuta == 'UNKNOWN':
                    valuta_values.append(None)
                else:
                    valuta_values.append(round(1/float(valuta), 5))
    valuta_values = interpolate(valuta_values)
    price_values = interpolate(price_values)
    data = zip(valuta_values, price_values, time_values_y, time_values_m)
    data = sorted(data, key=lambda data: data[0])
    return list(zip(*data))

# Product price vs valuta value graph

In [167]:
country = 'Yemen'
product = 'Wheat'

output_file('../graphs/question3/product_price_valuta_value_{}_{}.html'.format(country, product))

currency = df[df['country']==country].currency.unique()[0]

data = correlation_valuta(country, product)
x, y, time_y, time_m = data

col_df = pd.DataFrame()
col_df['rate'] = x
col_df['price'] = y
col_df['year'] = time_y
col_df['month'] = time_m


colors = Category20[len(col_df.year.unique())]
colormap = {year: colors[i] for i, year in enumerate(col_df.year.unique())}
colors = [colormap[x] for x in col_df.year]

col_df['color'] = colors
src = ColumnDataSource(col_df)
tools = "pan,wheel_zoom,box_zoom,reset"

p = figure(height=500, width=800, tools=tools)
p.circle('rate', 'price', source=src, size=5, color='color', legend='year')
legend_width = (min(x) - max(x)) * 0.2
p.y_range = Range1d(min(y), max(y))
p.title.text = "Relation {} price with {} currency value in USD".format(product, country) 
p.xaxis.axis_label="{} value in USD".format(currency)
p.yaxis.axis_label="Bread Price in {}".format(currency)
p.x_range = Range1d(max(x), min(x) + legend_width)
h = HoverTool(tooltips = [('Year', '@year'), ('Month', '@month')])
cor = correlation(x, y)
p.add_layout(Title(text='correlation: {}'.format(round(cor,2)), align='center'), 'below') 
p.add_tools(h)
p.toolbar.logo = None
p.toolbar_location = 'above'


show(p)





### Extension with Kclusters

In [169]:
k_df = col_df[['rate', 'price']]
col_df2 = col_df
cluster_count=9
model = KMeans(n_clusters=cluster_count)
model.fit(k_df)
col_df2['cluster_data'] = model.labels_
colors = Category20[len(col_df2.cluster_data.unique())]
colormap = {cluster: colors[i] for i, cluster in enumerate(col_df2.cluster_data.unique())}
colors = [colormap[x] for x in col_df2.cluster_data]

output_file('../graphs/question3/clustering_product_price_valuta_value_{}_{}.html'.format(country, product))

col_df['color'] = colors
src = ColumnDataSource(col_df)

tools = "pan,wheel_zoom,box_zoom,reset"
p = figure(height=500, width=800, tools=tools)
p.circle('rate', 'price', source=src, size=5, color='color')
legend_width = (min(x) - max(x)) * 0.2
p.y_range = Range1d(min(y), max(y))
p.title.text = "Clustering: Relation {} price with {} currency value in USD".format(product, country) 
p.xaxis.axis_label="{} value in USD".format(currency)
p.yaxis.axis_label="Bread Price in {}".format(currency)
p.x_range = Range1d(max(x), min(x) + legend_width)
h = HoverTool(tooltips = [('Year', '@year'), ('Month', '@month')])
p.add_layout(Title(text='Amount of clusters: {}'.format(cluster_count), align='center'), 'below') 
p.add_tools(h)
p.toolbar.logo = None
p.toolbar_location = 'above'
show(p)




### Extension with regression

In [170]:
X = np.vstack(x)
X = np.column_stack((X, np.ones(X.shape[0])))
Y = y
a, b = np.linalg.lstsq(X, Y)[0]

xl = col_df['rate']
output_file('../graphs/question3/regression_product_price_valuta_value_{}_{}.html'.format(country, product))

rb = y
xb = xl
yb = a * xb + b

xl = list(xl)

mse = sum((rb-yb) **2) / len(xb)
desc = "Mean Squared Error: {}".format(round(mse, 2))
rmse = np.sqrt(mse)
desc += " - Root Mean Squared Error: {}".format(round(rmse, 2))
print(desc)
linr = pd.DataFrame()
xl = [min(xl) / 2] + xl
linr['data'] = xl
xl = linr['data']

tools = "pan,wheel_zoom,box_zoom,reset"
p = figure(height=500, width=800, tools=tools)
p.circle('rate', 'price', source=src, size=5)
legend_width = (min(x) - max(x)) * 0.2
p.y_range = Range1d(min(y), max(y)*1.1)
p.title.text = "Linear Regression: Relation {} price with {} currency value in USD".format(product, country) 
p.xaxis.axis_label="{} value in USD".format(currency)
p.yaxis.axis_label="Bread Price in {}".format(currency)
p.line(xl, a * xl + b, color='red')
p.line(xl, a * xl + b + rmse, color='red', line_alpha=0.4, line_dash = 'dashed')
p.line(xl, a * xl + b - rmse, color='red', line_alpha=0.4, line_dash = 'dashed')
p.add_layout(Title(text=desc, align='center'), 'below') 
p.x_range = Range1d(max(xl), min(xl) + legend_width)
p.toolbar.logo = None
p.toolbar_location = 'above'
show(p)

Mean Squared Error: 949.33 - Root Mean Squared Error: 30.81




# Valuta Value + Product price over time graph

In [173]:
output_file('../graphs/question3/over_time_product_price_valuta_value_{}_{}.html'.format(country, product))


s_data = zip(x, y, time_y, time_m)
s_data = sorted(s_data, key=itemgetter(2,3))
x2, y2, time_y2, time_m2 = list(zip(*s_data))
time_y2 = [str(x) for x in time_y2]
time_m2 = [str(x) for x in time_m2]
x_axis = list(zip(time_y2, time_m2))
l = len(list(set(time_y2)))
width = int(l * (1000/l))
p = figure(height = 300, width=width, x_range=FactorRange(*x_axis),tools='')
p.yaxis.axis_label = 'Price in {}'.format(currency)
p.y_range = Range1d(min(y2), max(y2))
p.extra_y_ranges = {'valuta': Range1d(start=min(x2), end=max(x2))}
p.add_layout(LinearAxis(y_range_name='valuta', axis_label='{} value in USD'.format(currency)), 'right')
p.xaxis.axis_label_text_font_size = "10pt"
p.xaxis.major_label_text_font_size = "5pt"
p.xaxis.axis_label = "Date"
p.title.text = "{} {} price in {}, and {} value in USD over time.".format(country, product, currency, currency)
p.line(x_axis, x2, y_range_name='valuta', color='orange', legend='{} value in USD'.format(currency))
p.line(x_axis, y2, legend='Price in {}'.format(currency))
p.toolbar.logo = None
p.toolbar_location = None
p.legend.location='top_right'

# year_start = int(x_axis[0][0])
# label_year = 2015
# label_month = 11
# label_loc = ((label_year - year_start) * 12) + label_month + 1.4*(label_year - year_start) - 0.5

code = '''\
if toggle.active
    object.background_fill_alpha = 1
    object.text_alpha = 1
    object.border_line_alpha = 1
    console.log 'enabling annotations'
else
    object.background_fill_alpha = 0
    object.text_alpha = 0
    object.border_line_alpha = 0
    console.log 'disabling annotations'
'''

labels = []
callbacks = []
toggle_names=[]
for month, year in event_dict[country]:
    yv = random.choice(y)
    year_start = int(x_axis[0][0])
    loc = ((year - year_start) * 12) + month + 1.4*(year - year_start) - 0.5
    events = list(df_c[(df_c['Month']==month) & (df_c['Year']==year) & (df_c['Country']==country)]['Event'])
    events = '\n'.join(events)
    label = Label(x=loc, y=yv, text=events, render_mode='css', border_line_color='black',
                 background_fill_color='white', background_fill_alpha=0, text_alpha=0, border_line_alpha=0, text_font_size='8pt')
    labels.append(label)
    callbacks.append(CustomJS.from_coffeescript(code=code, args = {}))
    toggle_names.append(str(month) + '-' + str(year))
    span = Span(location=loc, dimension='height', line_color='black', line_width=1, line_dash='dashed')
    p.add_layout(span)
    p.add_layout(label)

toggles=[]
for i in range(len(labels)):
    callback = callbacks[i]
    toggle = Toggle(label=toggle_names[i], button_type='success', callback=callback)
    callback.args = {'toggle':toggle, 'object': labels[i]}
    toggles.append(toggle)

# labels = LabelSet(x='time', y='value', text='text', level='glyph', x_offset=5, y_offset=5, 
#                   source=ColumnDataSource(label_source), render_mode='css', border_line_color='black', 
#                   background_fill_color='white', background_fill_alpha=1)
# p.add_layout(labels)





# callback1 = CustomJS.from_coffeescript(code=code, args={})
# toggle1 = Toggle(label="label", button_type="success", callback=callback1)
# callback1.args = {'toggle': toggle1, 'object': labels}
p.add_layout(Title(text="Click button to show conflict at corresponding date."), 'below') 
show(layout([p], toggles))


# TODO
# Add conflicts to plot



# Helper Fuctions

In [110]:
def interpolate(values):
    for i in range(len(values)):
        if values[i] == None:
            ii = i
            x = None
            while x == None:
                ii -= 1
                if ii < 0: x = values[i]
                x = values[ii]
            ii = i
            y = None
            while y == None:
                ii -= 1
                if ii >= len(values): y = values[i]
                y = values[ii]
            values[i] = (x + y) / 2
    return values

def average(x):
    return float(sum(x)) / len(x)

def correlation(x, y):
    n = len(x)
    avg_x = average(x)
    avg_y = average(y)
    dif = 0
    xdif2 = 0
    ydif2 = 0
    for i in range(n):
        xdif = x[i] - avg_x
        ydif = y[i] - avg_y
        dif += xdif * ydif
        xdif2 += xdif * xdif
        ydif2 += ydif * ydif
    return dif / math.sqrt(xdif2 * ydif2)

In [172]:
event_dict = {'Afghanistan': [(9, 2014), (3, 2015), (11, 2015), (9, 2016)],
             'Afgh2': [(10, 2004),(8, 2007), (9, 2008),(8, 2010),(3, 2015)],
             'Iran  (Islamic Republic of)':[(7, 2012), (6, 2013), (4, 2014), (7, 2015), (1, 2016)],
             'Iraq': [(6, 2014), (12, 2014), (4, 2016)],
             'Lebanon':[(9, 2012), (10, 2012), (9, 2013), (4, 2014), (1, 2015)],
             'Pakistan': [(7, 2007),(2,2008),(8,2008),(8,2010),(3,2014),(12,2014), (3,2016)],
             'Syrian Arab Republic': [(3, 2011), (6, 2012), (12, 2012), (9, 2013), (1,2014), (12,2016)],
             'Yemen': [(8,2009),(2,2010),(9,2010),(11,2011),(3,2015),(4,2016)]}