In [1]:
import numpy as np
from collections import defaultdict
import pandas as pd
import unicodecsv

In [2]:
with open('master.csv', 'rb') as f:
    reader = unicodecsv.DictReader(f)
    master = list(reader)

In [3]:
master[0]

OrderedDict([('\ufeffcountry', 'Albania'),
             ('year', '1987'),
             ('sex', 'male'),
             ('age', '15-24 years'),
             ('suicides_no', '21'),
             ('population', '312900'),
             ('suicides/100k pop', '6.71'),
             ('country-year', 'Albania1987'),
             ('HDI for year', ''),
             (' gdp_for_year ($) ', '2,156,624,900'),
             ('gdp_per_capita ($)', '796'),
             ('generation', 'Generation X')])

In [4]:
# takes a string which is either an empty string or represents an integer,
# and returns an int or None

def parse_maybe_int(i):
    if i == '':
        return None
    else:
        return int(i)
    
def parse_maybe_float(i):
    if i == '':
        return None
    else:
        return float(i)
    
def parse_maybe_none(i):
    if i == '':
        return None
    
def parse_int_with_comma(i):
    if i == '':
        return None
    else:
        return int(i.replace(',',''))
    
    
# int(datum['gdp_for_year'].replace(',',''))

In [5]:
for data in master:
    data['country'] = data['\ufeffcountry']
    del[data['\ufeffcountry']]
    data['suicides_per_100k_pop'] = data['suicides/100k pop']
    del[data['suicides/100k pop']]
    data['country_year'] = data['country-year']
    del[data['country-year']]
    data['HDI_for_year'] = data['HDI for year']
    del[data['HDI for year']]
    data['gdp_for_year'] = data[' gdp_for_year ($) ']
    del[data[' gdp_for_year ($) ']]
    data['gdp_per_capita'] = data['gdp_per_capita ($)']
    del[data['gdp_per_capita ($)']]

In [6]:
master[0]

OrderedDict([('year', '1987'),
             ('sex', 'male'),
             ('age', '15-24 years'),
             ('suicides_no', '21'),
             ('population', '312900'),
             ('generation', 'Generation X'),
             ('country', 'Albania'),
             ('suicides_per_100k_pop', '6.71'),
             ('country_year', 'Albania1987'),
             ('HDI_for_year', ''),
             ('gdp_for_year', '2,156,624,900'),
             ('gdp_per_capita', '796')])

In [7]:
master[0]

OrderedDict([('year', '1987'),
             ('sex', 'male'),
             ('age', '15-24 years'),
             ('suicides_no', '21'),
             ('population', '312900'),
             ('generation', 'Generation X'),
             ('country', 'Albania'),
             ('suicides_per_100k_pop', '6.71'),
             ('country_year', 'Albania1987'),
             ('HDI_for_year', ''),
             ('gdp_for_year', '2,156,624,900'),
             ('gdp_per_capita', '796')])

In [8]:
for data in master:
    data['year'] = parse_maybe_int(data['year'])
    data['suicides_no'] = parse_maybe_int(data['suicides_no'])
    data['population'] = parse_maybe_int(data['population'])
    data['suicides_per_100k_pop'] = parse_maybe_float(data['suicides_per_100k_pop'])
    data['HDI_for_year'] = parse_maybe_none(data['HDI_for_year'])
    data['gdp_for_year'] = parse_int_with_comma(data['gdp_for_year'])
    data['gdp_per_capita'] = parse_maybe_int(data['gdp_per_capita'])
    

In [9]:
master[0]

OrderedDict([('year', 1987),
             ('sex', 'male'),
             ('age', '15-24 years'),
             ('suicides_no', 21),
             ('population', 312900),
             ('generation', 'Generation X'),
             ('country', 'Albania'),
             ('suicides_per_100k_pop', 6.71),
             ('country_year', 'Albania1987'),
             ('HDI_for_year', None),
             ('gdp_for_year', 2156624900),
             ('gdp_per_capita', 796)])

In [10]:
master[1]

OrderedDict([('year', 1987),
             ('sex', 'male'),
             ('age', '35-54 years'),
             ('suicides_no', 16),
             ('population', 308000),
             ('generation', 'Silent'),
             ('country', 'Albania'),
             ('suicides_per_100k_pop', 5.19),
             ('country_year', 'Albania1987'),
             ('HDI_for_year', None),
             ('gdp_for_year', 2156624900),
             ('gdp_per_capita', 796)])

In [11]:
# master_df = pd.DataFrame(master)
# master_df.head()

In [12]:
len(master)

27820

In [13]:
def get_unique_countries(data):
    unique_countries = set()
    for data_point in data:
        unique_countries.add(data_point['country'])
    return unique_countries

In [14]:
unique_countries = get_unique_countries(master)
len(unique_countries)

101

In [15]:
data_by_country = defaultdict(list)
for data in master:
    country = data['country']
    data_by_country[country].append(data)
    
# data_by_country['Sri Lanka']

In [16]:
total_pop_by_country = {}

for country, data in data_by_country.items():
    total_population = 0
    for datum in data:
        total_population += datum['population']
    total_pop_by_country[country] = total_population
    
# total_pop_by_country

In [17]:
# def total_by_country(i):
#     total_value_by_country = {}
#     for key,value in i.items():
#         total_value = 0
#         for datum in value:
#             total_value += datum[]

In [18]:
total_gdp_by_country = {}

for country, data in data_by_country.items():
    total_gdp = 0
    for datum in data:
        total_gdp += datum['gdp_for_year']
    total_gdp_by_country[country] = total_gdp
    
# total_gdp_by_country

In [19]:
total_gdp_per_capita_country = {}

for country, data in data_by_country.items():
    total_gdp_capita = 0
    for datum in data:
        total_gdp_capita += datum['gdp_per_capita']
    total_gdp_per_capita_country[country] = total_gdp_capita
    
# total_gdp_per_capita_country

In [20]:
total_suicide_by_country = {}

for country, data in data_by_country.items():
    total_suicide = 0
    for datum in data:
        total_suicide += datum['suicides_no']
    total_suicide_by_country[country] = total_suicide
    
# total_suicide_by_country

In [21]:
total_suicides_per_100k_country = {}

for country, data in data_by_country.items():
    total_suicides_per_100k = 0
    for datum in data:
        total_suicides_per_100k += datum['suicides_per_100k_pop']
    total_suicides_per_100k_country[country] = total_suicides_per_100k
    
# total_suicides_per_100k_country

In [22]:
def get_max(data):
    country_max = None
    max_value = 0
    for key, value in data.items():
        if value > max_value:
            max_value = value
            country_max = key
    return country_max, max_value

get_max(total_suicide_by_country)

('Russian Federation', 1209742)

In [23]:
def get_min(data):
    country_min = get_max(data)[0]
    min_value = get_max(data)[1]
    for key, value in data.items():
        if value < min_value:
            min_value = value
            country_min = key
    return country_min, min_value

In [24]:
country_with_max_pop, max_pop = get_max(total_pop_by_country)
country_with_min_pop, min_pop = get_min(total_pop_by_country)
    
print(country_with_max_pop, max_pop)
print(country_with_min_pop, min_pop)

United States 8054027201
Dominica 66400


In [25]:
country_with_max_gdp, max_gdp = get_max(total_gdp_by_country)
country_with_min_gdp, min_gdp = get_min(total_gdp_by_country)
        
print(country_with_max_gdp, max_gdp)
print(country_with_min_gdp, min_gdp)

United States 3909984936000000
Dominica 1183022220


In [26]:
country_with_max_gdp_capita, max_gdp_capita = get_max(total_gdp_per_capita_country)
country_with_min_gdp_capita, min_gdp_capita = get_min(total_gdp_per_capita_country)
        
print(country_with_max_gdp_capita, max_gdp_capita)
print(country_with_min_gdp_capita, min_gdp_capita)

Luxembourg 25593000
Dominica 17820


In [27]:
country_with_max_suicide_no, max_suicide_no = get_max(total_suicide_by_country)
country_with_min_suicide_no, min_suicide_no = get_min(total_suicide_by_country)

print(country_with_max_suicide_no, max_suicide_no)
print(country_with_min_suicide_no, min_suicide_no)

Russian Federation 1209742
Dominica 0


In [28]:
country_max_suicides_per_100k, max_suicide_per_100k = get_max(total_suicides_per_100k_country)
country_min_suicides_per_100k, min_suicide_per_100k = get_min(total_suicides_per_100k_country)

print(country_max_suicides_per_100k, max_suicide_per_100k)
print(country_min_suicides_per_100k, min_suicide_per_100k)

Russian Federation 11305.130000000006
Dominica 0.0


In [29]:
total_suicide_no = list(total_suicide_by_country.values())
print('Mean:', np.mean(total_suicide_no))
print('Std:', np.std(total_suicide_no))
print('Max:', np.max(total_suicide_no))
print('Min:', np.min(total_suicide_no))

Mean: 66816.0396039604
Std: 181256.8961922441
Max: 1209742
Min: 0


In [30]:
total_suicide_per_100k = list(total_suicides_per_100k_country.values())
print('Mean:', np.mean(total_suicide_per_100k))
print('Std:', np.std(total_suicide_per_100k))
print('Max:', np.max(total_suicide_per_100k))
print('Min:', np.min(total_suicide_per_100k))

Mean: 3530.1369306930687
Std: 2921.4992694473954
Max: 11305.130000000006
Min: 0.0


In [31]:
total_gdp = list(total_gdp_by_country.values())
print('Mean:', np.mean(total_gdp))
print('Std:', np.std(total_gdp))
print('Max:', np.max(total_gdp))
print('Min:', np.min(total_gdp))

Mean: 122733292656393.2
Std: 434356422069337.0
Max: 3909984936000000
Min: 1183022220


In [32]:
total_gdp_capita = list(total_gdp_per_capita_country.values())
print('Mean:', np.mean(total_gdp_capita))
print('Std:', np.std(total_gdp_capita))
print('Max:', np.max(total_gdp_capita))
print('Min:', np.min(total_gdp_capita))

Mean: 4645792.475247525
Std: 5288472.289673265
Max: 25593000
Min: 17820


In [33]:
total_pop = list(total_pop_by_country.values())
print('Mean:', np.mean(total_pop))
print('Std:', np.std(total_pop))
print('Max:', np.max(total_pop))
print('Min:', np.min(total_pop))

Mean: 508140182.5346535
Std: 1117379983.8164601
Max: 8054027201
Min: 66400
