In [35]:
# -*- coding: utf-8 -*-
"""
Created on Sun august 21 14:35:15 2016
@author: Sidon
"""
%matplotlib inline
import pandas as pd
import numpy as np
from collections import OrderedDict
from tabulate import tabulate, tabulate_formats
import seaborn
import matplotlib.pyplot as plt
import scipy.stats

# Render every float in DataFrame/Series output in fixed-point notation with
# six decimals (works around run-time errors seen with the default formatter).
pd.set_option('display.float_format', '{:f}'.format)

# Load only the four Gapminder columns used in this analysis and give them
# short names.
#
# The renaming is done by name rather than by assigning a positional list to
# `data1.columns`: read_csv ignores the order of `usecols` and returns the
# selected columns in file order, so a positional list would silently
# mislabel the columns if the file layout ever differed from the list.
data1 = pd.read_csv(
    '~/dev/coursera/gapminder.csv',
    skip_blank_lines=True,
    usecols=['country', 'incomeperperson', 'alcconsumption', 'lifeexpectancy'],
).rename(columns={
    'incomeperperson': 'income',
    'alcconsumption': 'alcohol',
    'lifeexpectancy': 'life',
})

# Human-readable descriptions of the three numeric measures, one constant per
# renamed column (income, alcohol, life). Each string states the reference
# year and the unit of the measure.
# NOTE(review): presumably used as plot titles/axis labels further down the
# notebook -- no use is visible in this section; confirm.
INCOME = "2010 Gross Domestic Product per capita in constant 2000 US$"
ALCOHOL = "2008 alcohol consumption per adult (liters, age 15+)"
LIFE = "2011 life expectancy at birth (years)"


# Convert the three measure columns to a numeric dtype. Values that cannot be
# parsed as numbers are coerced to NaN instead of raising.
#
# `errors` has to be given as the keyword value 'coerce'. The literal string
# 'errors=coerce' passed positionally is not one of the accepted options and
# makes current pandas raise ValueError("invalid error value specified").
for dt in ('alcohol', 'income', 'life'):
    data1[dt] = pd.to_numeric(data1[dt], errors='coerce')

# Keep complete cases only: a row is discarded as soon as any of its fields
# is NaN (row-wise / "any" is what dropna does by default).
data1 = data1.dropna()

# Independent deep copy that will hold the categorical (binned) versions of
# the variables, so the numeric values in data1 stay untouched.
data2 = data1.copy(deep=True)

### Variables:
|Explanatory|Response|Moderator|
|-----------:|-------:|--------:|
|Alcohol|Life|Income|

### Question
Does the income level affect the direction or strength of the relationship
between alcohol consumption and life expectancy?

### Creating Categorical Variables
Now it is time to create the categorical variables. To do this, I calculate
the mean of each variable (used as the cut point between the two levels) and
its min and max values (used as the outer bin edges).

In [39]:
# Sample mean of each explanatory/moderator variable; these values are the
# cut points between the two levels of the categorical variables.
means = {column: data1[column].mean() for column in ('alcohol', 'income')}

# Show the means as a one-row table, one column per variable.
print ('Means')
print (tabulate([list(means.values())],
                headers=list(means),
                tablefmt="fancy_grid"))

Means
╒═══════════╤══════════╕
│   alcohol │   income │
╞═══════════╪══════════╡
│   6.78409 │  7006.36 │
╘═══════════╧══════════╛


In [37]:
# Create categorical variable alcohol (two levels, split at the sample mean)

min_a = data1.alcohol.min()
max_a = data1.alcohol.max()

# Bins: [floor(min), mean] -> '<=6.8'   and   (mean, ceil(max)] -> '>6.8'.
#
# include_lowest=True closes the first bin on the left. Without it the bin is
# left-open, so a minimum that is already a whole number (floor(min) == min)
# falls outside every bin and is silently turned into NaN.
#
# NOTE(review): the labels are hard-coded for the mean shown in the table
# above (6.78409); they must be updated by hand if the data set changes.
#
# pd.cut already returns a column of dtype 'category', so no further
# astype('category') conversion is needed.
data2['alcohol'] = pd.cut(
    data1.alcohol,
    bins=[np.floor(min_a), means['alcohol'], np.ceil(max_a)],
    labels=['<=6.8', '>6.8'],
    include_lowest=True,
)

In [41]:
# Create categorical variable income (two levels, split at the sample mean)

min_i = data1.income.min()
max_i = data1.income.max()

# Bins: [floor(min), mean] -> '<=7006'   and   (mean, ceil(max)] -> '>7006'.
#
# include_lowest=True closes the first bin on the left. Without it the bin is
# left-open, so a minimum that is already a whole number (floor(min) == min)
# falls outside every bin and is silently turned into NaN.
#
# NOTE(review): the labels are hard-coded for the mean shown in the table
# above (7006.36); they must be updated by hand if the data set changes.
#
# pd.cut already returns a column of dtype 'category', so no further
# astype('category') conversion is needed.
data2['income'] = pd.cut(
    data1.income,
    bins=[np.floor(min_i), means['income'], np.ceil(max_i)],
    labels=['<=7006', '>7006'],
    include_lowest=True,
)

                    country  income alcohol      life
1                   Albania  <=7006    >6.8 76.918000
2                   Algeria  <=7006   <=6.8 73.131000
4                    Angola  <=7006   <=6.8 51.093000
6                 Argentina   >7006    >6.8 75.901000
7                   Armenia  <=7006    >6.8 74.241000
9                 Australia   >7006    >6.8 81.907000
10                  Austria   >7006    >6.8 80.854000
11               Azerbaijan  <=7006    >6.8 70.739000
12                  Bahamas   >7006    >6.8 75.620000
13                  Bahrain   >7006   <=6.8 75.057000
14               Bangladesh  <=7006   <=6.8 68.944000
15                 Barbados   >7006   <=6.8 76.835000
16                  Belarus  <=7006    >6.8 70.349000
17                  Belgium   >7006    >6.8 80.009000
18                   Belize  <=7006   <=6.8 76.072000
19                    Benin  <=7006   <=6.8 56.081000
21                   Bhutan  <=7006   <=6.8 67.185000
22                  Bolivia 