In [None]:
import json
import os

import matplotlib.pyplot as plt
import pandas as pd

from bokeh.io import push_notebook, output_notebook
from bokeh.layouts import column, row
from bokeh.models import CategoricalColorMapper
from bokeh.models import GMapOptions
from bokeh.palettes import Spectral10
from bokeh.palettes import brewer
from bokeh.plotting import figure, show
from bokeh.plotting import gmap
from bokeh.transform import factor_cmap
from bokeh.transform import linear_cmap
output_notebook()

In [None]:
# Read the training features and the matching labels from the working directory.
train_data, train_target = (
    pd.read_csv(csv_path, header=0)
    for csv_path in ('./tanzania-X-train.csv', './tanzania-y-train.csv')
)

In [None]:
# Overview of the training features: column names, non-null counts and dtypes.
train_data.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric
# variables in the training data.
train_data.describe()

In [None]:
# Summary of the non-numeric (object / category) variables: count, number of
# distinct values, most frequent value and its frequency.
train_data.describe(include=['object','category'])

In [None]:
# Share of each status_group label in the training target.
train_target['status_group'] = train_target['status_group'].astype('category')
# Divide by the actual number of rows rather than a hard-coded row count, so
# the fractions stay correct if the target file is sampled, filtered or replaced.
n_rows = len(train_target)
for status in ('functional', 'functional needs repair', 'non functional'):
    print((train_target['status_group'] == status).sum() / n_rows)
train_target.describe()

In [None]:
# TRANSFORM: readings that fall outside the bounding box used for Tanzania
# (latitude strictly between -12 and -1, longitude strictly between 28 and 41),
# as well as missing readings, are replaced with the centre of the valid points.
lat_in_range = (train_data.latitude > -12) & (train_data.latitude < -1)
lon_in_range = (train_data.longitude > 28) & (train_data.longitude < 41)
# The fill value is the mean of the in-range readings only: averaging the whole
# column would let the bad readings that are being replaced drag the "centre"
# away from the country. It is computed once instead of once per bad row.
train_data['latitude'] = train_data.latitude.where(
    lat_in_range, train_data.latitude[lat_in_range].mean())
train_data['longitude'] = train_data.longitude.where(
    lon_in_range, train_data.longitude[lon_in_range].mean())
# Setting up options for map graphics: centre the map on the cleaned points.
mapoptions = GMapOptions(lat=train_data.latitude.mean(), lng=train_data.longitude.mean(), zoom=5)

In [None]:
# NOTE(review): IPython-only docstring lookup left over from development; it
# produces no analysis result — confirm it can be deleted from the final notebook.
GMapOptions.apply_theme?

In [None]:
# One Google-Maps panel per pump status, shown side by side so the spatial
# patterns of the three classes can be compared.
# The Maps API key is read from the GOOGLE_MAPS_API_KEY environment variable so
# that the credential is not stored in the notebook; a missing variable raises
# KeyError before any plot is built.
google_api_key = os.environ['GOOGLE_MAPS_API_KEY']
# (status label in train_target, panel title, marker colour)
status_panels = [
    ('functional', 'Functional Water Points', 'green'),
    ('functional needs repair', 'Water Points Needing Repair', 'yellow'),
    ('non functional', 'Non-Functional Water Points', 'red'),
]
status_maps = []
for status, title, color in status_panels:
    panel = gmap(google_api_key, mapoptions, plot_width=300, plot_height=300, title=title)
    # Rows are selected with a mask built from train_target, so train_data and
    # train_target must share the same index.
    panel.circle(x='longitude', y='latitude',
                 source=train_data[train_target['status_group'] == status],
                 color=color)
    status_maps.append(panel)
# Keep the individual panel names available to later cells.
p1, p2, p3 = status_maps
show(row(p1, p2, p3))

In [None]:
def map_numeric_var(df, var):
    """Show three side-by-side Google-Maps panels, one per pump status, coloured by ``var``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``longitude`` and ``latitude`` columns and the column named
        by ``var``. Rows are selected with masks built from the notebook-level
        ``train_target``, so ``df`` must share that frame's index.
    var : str
        Name of the column of ``df`` that drives the colour scale.

    Notes
    -----
    The colour scale spans ``df[var].min()`` .. ``df[var].max()`` over the whole
    frame, so the three panels use the same scale. The function also relies on
    the notebook-level ``mapoptions`` and on the ``GOOGLE_MAPS_API_KEY``
    environment variable (``KeyError`` if it is not set).
    """
    print(var)
    cmap = linear_cmap(var, Spectral10, df[var].min(), df[var].max())
    # Read the credential from the environment instead of embedding it in the notebook.
    api_key = os.environ['GOOGLE_MAPS_API_KEY']
    panel_specs = (
        ('functional', 'Functional Water Points'),
        ('functional needs repair', 'Water Points Needing Repair'),
        ('non functional', 'Non-Functional Water Points'),
    )
    panels = []
    for status, title in panel_specs:
        panel = gmap(api_key, mapoptions, plot_width=300, plot_height=300, title=title)
        # Plot the frame that was passed in, not the global train_data, so the
        # points match the frame the colour scale was computed from.
        panel.circle(x='longitude', y='latitude',
                     source=df[train_target['status_group'] == status],
                     color=cmap)
        panels.append(panel)
    show(row(*panels))

In [None]:
# Status maps coloured by the gps_height column.
map_numeric_var(train_data, 'gps_height')

In [None]:
# Status maps coloured by the amount_tsh column.
map_numeric_var(train_data, 'amount_tsh')

In [None]:
"""
Variables

amount_tsh - Total static head (amount of water available to the waterpoint)
date_recorded - The date the row was entered
funder - Who funded the well
gps_height - Altitude of the well
installer - Organization that installed the well
longitude - GPS coordinate
latitude - GPS coordinate
wpt_name - Name of the waterpoint if there is one
num_private - (no description provided)
basin - Geographic water basin
subvillage - Geographic location
region - Geographic location
region_code - Geographic location (coded)
district_code - Geographic location (coded)
lga - Geographic location
ward - Geographic location
population - Population around the well
public_meeting - True/False
recorded_by - Group entering this row of data
scheme_management - Who operates the waterpoint
scheme_name - Who operates the waterpoint
permit - If the waterpoint is permitted
construction_year - Year the waterpoint was constructed
extraction_type - The kind of extraction the waterpoint uses
extraction_type_group - The kind of extraction the waterpoint uses
extraction_type_class - The kind of extraction the waterpoint uses
management - How the waterpoint is managed
management_group - How the waterpoint is managed
payment - What the water costs
payment_type - What the water costs
water_quality - The quality of the water
quality_group - The quality of the water
quantity - The quantity of water
quantity_group - The quantity of water
source - The source of the water
source_type - The source of the water
source_class - The source of the water
waterpoint_type - The kind of waterpoint
waterpoint_type_group - The kind of waterpoint
"""

In [None]:
def histogram(df, var):
    """Show a bokeh bar chart with the number of rows of ``df`` per ``status_group``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains a ``status_group`` column and the column ``var``.
    var : str
        Column whose entries are counted within each status; it also appears
        in the chart title.
    """
    statuses = list(df.status_group.unique())
    counts = []
    for status in statuses:
        rows_with_status = df[df['status_group'] == status]
        counts.append(len(rows_with_status[var]))
    chart = figure(x_range=statuses, title="{} count by Status".format(var),
                   toolbar_location=None, tools="", plot_height=250)
    chart.vbar(x=statuses, top=counts, width=0.9)
    # No vertical grid lines, and bars start from zero.
    chart.xgrid.grid_line_color = None
    chart.y_range.start = 0
    show(chart)
    
    
def explore_variable(df, var):
    """Print a banner and the ``describe()`` summary of ``df[var]``, then draw its histogram.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column ``var``.
    var : str
        Name of the column to summarise and plot.
    """
    column = df[var]
    banner = '#' * 80
    print(banner)
    print('Exploring variable: {}'.format(var))
    print(column.describe())
    column.hist()


In [None]:
# Numeric measurements that can be summarised with box plots.
# 'payment', 'payment_type', 'water_quality' and 'quantity' used to be listed
# here, but they are descriptive label columns (see the variable list above),
# not measurements, and the box-plot loop over this list cannot draw them.
continuous_vars = ['amount_tsh', 'gps_height', 'longitude', 'latitude', 'population',
                   'construction_year']
date_vars = ['date_recorded']

In [None]:
# Columns with at most 25 distinct values (a missing value counts as one) are
# stored as pandas categoricals; the recording date is parsed to datetime.
for var in train_data.columns:
    length = train_data[var].nunique(dropna=False)
    if length > 25:
        continue
    train_data[var] = train_data[var].astype('category')
train_data['date_recorded'] = pd.to_datetime(train_data['date_recorded'])

In [None]:
# Re-check the dtypes after the category / datetime conversion.
train_data.info()

In [None]:
from sklearn.linear_model import LinearRegression

# First attempt: fit ordinary least squares on the full feature frame.
# NOTE(review): at this point train_data holds category and datetime columns
# and train_target['status_group'] holds string labels, so this fit is
# expected to raise instead of producing a model — confirm. Later cells rely
# on the LinearRegression import above, so keep the import if this cell is removed.
lm = LinearRegression()
lm.fit(train_data, train_target)

In [None]:
# Columns whose number of distinct values is printed below and which are
# dropped from the frame used for modelling.
objvars = ['funder', 'installer', 'wpt_name', 'subvillage', 'lga', 'ward', 'scheme_name']

In [None]:
# Number of distinct values (missing counted once) for each column in objvars.
for var in objvars:
    distinct_values = train_data[var].unique()
    print(var, len(distinct_values))

In [None]:
# Copy of the features without the columns listed in objvars; train_data itself is left unchanged.
train_trimmed = train_data.drop(columns=objvars)

In [None]:
# LinearRegression needs a purely numeric feature matrix and a numeric target.
# train_trimmed still holds category and datetime columns, and train_target is
# a whole frame whose status_group column contains string labels, so both are
# encoded first:
#   * the datetime column is left out and the remaining label columns are
#     one-hot encoded;
#   * the target is the integer code of status_group (codes follow the sorted
#     label order).
# NOTE(review): assumes the numeric columns contain no missing values — confirm.
# NOTE(review): regressing on integer label codes is only a rough baseline; a
# classifier may suit the three-class target better — confirm the intent.
design_matrix = pd.get_dummies(train_trimmed.select_dtypes(exclude=['datetime']))
status_codes = pd.Categorical(train_target['status_group']).codes
lm = LinearRegression()
lm.fit(design_matrix, status_codes)

In [None]:
# Columns and dtypes that remain after dropping the objvars columns.
train_trimmed.info()

In [None]:
# Shows the rows whose amount_tsh equals the text 'hand pump'.
# NOTE(review): looks like a leftover debugging probe for stray text in
# amount_tsh — confirm whether it can be removed.
train_trimmed[train_trimmed.amount_tsh == 'hand pump']

In [None]:
# One vertical box plot per entry of continuous_vars, each in its own figure.
for continuous_var in continuous_vars:
    single_column = train_trimmed[[continuous_var]]
    single_column.boxplot(vert=True)
    plt.show()

In [None]:
# For every int64/float64 column of train_data, draw one 20-bin histogram per
# status_group value.
# NOTE(review): xrot is a rotation in degrees, so 3/4 leaves the tick labels
# almost unrotated — confirm the intended angle.
numeric_columns = train_data.select_dtypes(include=['int64', 'float64']).columns
for continuous_var in numeric_columns.values:
    print(continuous_var)
    single_column = train_data[[continuous_var]]
    single_column.hist(by=train_target.status_group, bins=20, xrot=3/4)
    plt.show()

In [None]:
# Print the 20 most frequent values of every column that still has object dtype.
object_columns = train_data.select_dtypes('object').columns.values
for var in object_columns:
    most_frequent = train_data[var].value_counts()[:20]
    print(var, most_frequent)

In [None]:
def top_n_categories(n, df, var):
    """Return ``df[var]`` with every value outside its ``n`` most frequent ones replaced by 'Other'.

    Parameters
    ----------
    n : int
        Number of most frequent values to keep as they are.
    df : pandas.DataFrame
        Frame that contains the column ``var``; it is not modified.
    var : str
        Name of the column to collapse.

    Returns
    -------
    pandas.Series
        Category-dtype series with the same index as ``df[var]``. Missing
        values are not counted by ``value_counts`` and therefore also become
        'Other'.
    """
    frequent = list(df[var].value_counts()[:n].index.values)

    def collapse_rare(value):
        # Keep the frequent values unchanged and pool everything else.
        if value in frequent:
            return value
        return 'Other'

    return df[var].apply(collapse_rare).astype('category')

In [None]:
# Preview of 'funder' reduced to its 20 most frequent values plus 'Other';
# the result is only displayed, not stored.
top_n_categories(20, train_data, 'funder')

In [None]:
# NOTE(review): IPython-only docstring lookup left over from development; it
# produces no analysis result — confirm it can be deleted from the final notebook.
pd.DataFrame.boxplot?

In [None]:
# Number of rows recorded for each basin value, most frequent first.
train_data.basin.value_counts()