In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
loc = 'world-happiness'

In [3]:
l = os.listdir(loc)
print(l)

['2015.csv', '2016.csv', '2017.csv', '2018.csv', '2019.csv']


In [4]:
report2015 = pd.read_csv(os.path.join(loc,'2015.csv'))[['Country','Region','Happiness Score']]
report2016 = pd.read_csv(os.path.join(loc,'2016.csv'))[['Country','Region','Happiness Score']]
report2017 = pd.read_csv(os.path.join(loc,'2017.csv'))[['Country','Happiness.Score']]

# Renaming 2017 index 'Happiness.Score' as 'Happiness Score'
report2017 = report2017.rename(columns={'Happiness.Score':'Happiness Score'})

In [5]:
# Upper Casing Each Column name and Region Name, so that difference due to Cases do not appear
report2015['Country'] = report2015['Country'].map(lambda name:name.upper())
report2016['Country'] = report2016['Country'].map(lambda name:name.upper())
report2017['Country'] = report2017['Country'].map(lambda name:name.upper())

report2015['Region'] = report2015['Region'].map(lambda name:name.upper())
report2016['Region'] = report2016['Region'].map(lambda name:name.upper())

In [6]:
report2015.head(2)

Unnamed: 0,Country,Region,Happiness Score
0,SWITZERLAND,WESTERN EUROPE,7.587
1,ICELAND,WESTERN EUROPE,7.561


In [7]:
report2016.head(2)

Unnamed: 0,Country,Region,Happiness Score
0,DENMARK,WESTERN EUROPE,7.526
1,SWITZERLAND,WESTERN EUROPE,7.509


In [8]:
report2017.head(2)

Unnamed: 0,Country,Happiness Score
0,NORWAY,7.537
1,DENMARK,7.522


In [9]:
len(report2015),len(report2016),len(report2017)

(158, 157, 155)

In [10]:
# We see that there is a difference in between country lengths, let's see which countries are missing in each years
def getDifferences(countries2015,countries2016,countries2017):
    print("Countries which are in 2015 data but not in 2016 data: ")
    for i in countries2015:
        if not i in countries2016:
            print(i)
    print("Countries which are in 2016 data but not in 2015 data: ")
    for i in countries2016:
        if not i in countries2015:
            print(i)
    print("Countries which are in 2015 data but not in 2017 data: ")
    for i in countries2015:
        if not i in countries2017:
            print(i)
    print("Countries which are in 2017 data but not in 2015 data: ")
    for i in countries2017:
        if not i in countries2015:
            print(i)
    print("Countries which are in 2016 data but not in 2017 data: ")
    for i in countries2016:
        if not i in countries2017:
            print(i)
    print("Countries which are in 2017 data but not in 2016 data: ")
    for i in countries2017:
        if not i in countries2016:
            print(i)
getDifferences(report2015['Country'].values,report2016['Country'].values,report2017['Country'].values)

Countries which are in 2015 data but not in 2016 data: 
OMAN
MOZAMBIQUE
LESOTHO
SWAZILAND
DJIBOUTI
CENTRAL AFRICAN REPUBLIC
Countries which are in 2016 data but not in 2015 data: 
PUERTO RICO
BELIZE
SOMALIA
NAMIBIA
SOUTH SUDAN
Countries which are in 2015 data but not in 2017 data: 
OMAN
TAIWAN
SURINAME
HONG KONG
SOMALILAND REGION
LAOS
SWAZILAND
DJIBOUTI
COMOROS
Countries which are in 2017 data but not in 2015 data: 
TAIWAN PROVINCE OF CHINA
BELIZE
HONG KONG S.A.R., CHINA
SOMALIA
NAMIBIA
SOUTH SUDAN
Countries which are in 2016 data but not in 2017 data: 
PUERTO RICO
TAIWAN
SURINAME
HONG KONG
SOMALILAND REGION
LAOS
COMOROS
Countries which are in 2017 data but not in 2016 data: 
TAIWAN PROVINCE OF CHINA
HONG KONG S.A.R., CHINA
MOZAMBIQUE
LESOTHO
CENTRAL AFRICAN REPUBLIC


In [11]:
# From closely looking, it is visible that few countries are missing from each year, but specifically,
# "Hong Kong" and "Taiwan" have different names in 2017, so renaming them will output lesser differences
report2017 = report2017.replace('TAIWAN PROVINCE OF CHINA','TAIWAN')
report2017 = report2017.replace('HONG KONG S.A.R., CHINA','HONG KONG')

In [12]:
print("Printing the countries which are in 2017 dataset but not in either 2015 dataset or in 2016 dataset (If Any): ")
for i in report2017['Country'].values:
    if not (i in report2015['Country'].values or i in report2016['Country'].values):
        print(i)

Printing the countries which are in 2017 dataset but not in either 2015 dataset or in 2016 dataset (If Any): 


In [13]:
# Now Since there are no such countries, we can add a 'Region' label in 2017 dataset
regions = []
countries = report2017['Country'].values
countries2015 = report2015['Country'].values
countries2016 = report2016['Country'].values
for country in countries:
    if country in countries2015: # If that country exists in 2015 list
        regions.append(report2015.loc[report2015.Country == country]['Region'].values[0])
    elif country in countries2016: # If that country exists in 2016 list
        regions.append(report2016.loc[report2016.Country == country]['Region'].values[0])
    else: print("Country found neither in 2015 list nor in 2016 list")

# Adding the 'Region' column to 2017 dataframe
report2017['Region'] = regions

In [14]:
report2017.head()

Unnamed: 0,Country,Happiness Score,Region
0,NORWAY,7.537,WESTERN EUROPE
1,DENMARK,7.522,WESTERN EUROPE
2,ICELAND,7.504,WESTERN EUROPE
3,SWITZERLAND,7.494,WESTERN EUROPE
4,FINLAND,7.469,WESTERN EUROPE


## Starting to answer the following questions
* *Countries with a happiness score of less than 5.0*
* *Which is the unhappiest country in Sub-Saharan Africa?*
* *Compare the Unhappiest and happiest countries in each region*
* *Countries that became unhappy between 2015 to 2017*
* *Find the country whose happiness decreased by the most amount*

### 1. Countries with Happiness Score of less than 5.0

In [15]:
# First gather down the scores of all the 3 years, then print out the unique ones
countrylis = []
l1 = list(report2015.loc[report2015['Happiness Score'] < 5.0]['Country'].values)
l2 = list(report2016.loc[report2016['Happiness Score'] < 5.0]['Country'].values)
l3 = list(report2017.loc[report2017['Happiness Score'] < 5.0]['Country'].values)
countrylis = l1+l2+l3

In [16]:
print("The Names of the Countries that has happiness score < 5.0 in either 2015,2016 or 2017 are: \n",set(countrylis))

The Names of the Countries that has happiness score < 5.0 in either 2015,2016 or 2017 are: 
 {'HONDURAS', 'CONGO (KINSHASA)', 'UKRAINE', 'HUNGARY', 'TANZANIA', 'INDIA', 'CHAD', 'SUDAN', 'RWANDA', 'MALI', 'CENTRAL AFRICAN REPUBLIC', 'MALAWI', 'IVORY COAST', 'BANGLADESH', 'SWAZILAND', 'KENYA', 'TUNISIA', 'SOUTH AFRICA', 'ANGOLA', 'BENIN', 'GABON', 'GEORGIA', 'LAOS', 'LEBANON', 'SIERRA LEONE', 'TOGO', 'DOMINICAN REPUBLIC', 'MOZAMBIQUE', 'MAURITANIA', 'BOTSWANA', 'ALBANIA', 'TAJIKISTAN', 'DJIBOUTI', 'CONGO (BRAZZAVILLE)', 'SRI LANKA', 'EGYPT', 'YEMEN', 'UGANDA', 'SENEGAL', 'BULGARIA', 'BURKINA FASO', 'MONGOLIA', 'MYANMAR', 'NAMIBIA', 'ARMENIA', 'NIGER', 'NIGERIA', 'ETHIOPIA', 'IRAQ', 'ZAMBIA', 'NEPAL', 'COMOROS', 'BURUNDI', 'CAMEROON', 'HAITI', 'ZIMBABWE', 'GREECE', 'IRAN', 'LESOTHO', 'SOUTH SUDAN', 'MADAGASCAR', 'LIBERIA', 'GHANA', 'CAMBODIA', 'SYRIA', 'BOSNIA AND HERZEGOVINA', 'AFGHANISTAN', 'PALESTINIAN TERRITORIES', 'GUINEA'}


### 2. Which is the  unhappiest country in Sub-Saharan Africa?
Since the countries are arranged in reverse order of their happiness score, we can simply find out the last country in the list of the 3 years, and find out the country with minimum happiness amongst these 3.

In [17]:
def UnhappiestInRegion(regionname,report2015,report2016,report2017):
    subsaharan2015 = report2015.loc[report2015['Region']==regionname]
    subsaharan2016 = report2016.loc[report2016['Region']==regionname]
    subsaharan2017 = report2017.loc[report2017['Region']==regionname]
    
    min2015 = subsaharan2015.values[-1]
    min2016 = subsaharan2016.values[-1]
    min2017 = subsaharan2017.values[-1]
    
    minscore = min(min2015[-1],min2016[-1],min2017[1])
    mincont = 'None'
    if min2015[-1] == minscore: mincont = min2015[0]
    elif min2016[-1] == minscore: mincont = min2016[0]
    elif min2017[1] == minscore: mincont = min2017[0]
    
    return minscore,mincont

In [18]:
minscore,mincont = UnhappiestInRegion('SUB-SAHARAN AFRICA',report2015,report2016,report2017)
print("Over the all three years, the Most Unhappiest Country is :", mincont , "with happiness score of ",minscore)

Over the all three years, the Most Unhappiest Country is : CENTRAL AFRICAN REPUBLIC with happiness score of  2.69300007820129


### 3. Compare the Unhappiest and happiest countries in each region

In [19]:
# Since we have already made a function to find out unhappiest country in particular region,
#  now making function to find out happiest in particular region
def HappiestInRegion(regionname,report2015,report2016,report2017):
    subsaharan2015 = report2015.loc[report2015['Region']==regionname]
    subsaharan2016 = report2016.loc[report2016['Region']==regionname]
    subsaharan2017 = report2017.loc[report2017['Region']==regionname]
    
    max2015 = subsaharan2015.values[0]
    max2016 = subsaharan2016.values[0]
    max2017 = subsaharan2017.values[0]
    
    maxscore = max(max2015[-1],max2016[-1],max2017[1])
    maxcont = 'None'
    if max2015[-1] == maxscore: maxcont = max2015[0]
    elif max2016[-1] == maxscore: maxcont = max2016[0]
    elif max2017[1] == maxscore: maxcont = max2017[0]
    
    return maxscore,maxcont

In [20]:
# Since all the 3 year dataframes contain same regions, we need not take all unique regions from all the 3 datasets
regions = report2015.Region.unique()
score_count = []
for region in regions:
    unhappiest = UnhappiestInRegion(region,report2015,report2016,report2017)
    happiest   = HappiestInRegion(region,report2015,report2016,report2017)
    score_count.append([region,happiest,unhappiest])

In [21]:
for i in score_count:
    print("Region:",i[0],'\nHappiest Country is',i[1][1], 'with score of',round(i[1][0],3),'. Unhappiest Country is',i[2][1],'with score of',round(i[2][0],2))

Region: WESTERN EUROPE 
Happiest Country is SWITZERLAND with score of 7.587 . Unhappiest Country is GREECE with score of 4.86
Region: NORTH AMERICA 
Happiest Country is CANADA with score of 7.427 . Unhappiest Country is UNITED STATES with score of 6.99
Region: AUSTRALIA AND NEW ZEALAND 
Happiest Country is NEW ZEALAND with score of 7.334 . Unhappiest Country is AUSTRALIA with score of 7.28
Region: MIDDLE EAST AND NORTHERN AFRICA 
Happiest Country is ISRAEL with score of 7.278 . Unhappiest Country is SYRIA with score of 3.01
Region: LATIN AMERICA AND CARIBBEAN 
Happiest Country is COSTA RICA with score of 7.226 . Unhappiest Country is HAITI with score of 3.6
Region: SOUTHEASTERN ASIA 
Happiest Country is SINGAPORE with score of 6.798 . Unhappiest Country is CAMBODIA with score of 3.82
Region: CENTRAL AND EASTERN EUROPE 
Happiest Country is CZECH REPUBLIC with score of 6.609 . Unhappiest Country is UKRAINE with score of 4.1
Region: EASTERN ASIA 
Happiest Country is TAIWAN with score of 6

### 4. Countries that became unhappy between 2015 to 2017
Since there are some countries that are there in 2015 but not in 2017 and vice versa, we will find out the intersection of the countries from both datasets (i.e, Countries which is in both dataset) and then do the comparison of happiness

In [22]:
# Function to return the intersection of two lists
def intersection(lst1, lst2): return list(set(lst1) & set(lst2))

In [23]:
countries = intersection(report2015['Country'],report2017['Country'].values)

#sorting the country names in ascending order
countries.sort()

In [29]:
score2015 = report2015.loc[report2015['Country'].map(lambda x: x in countries)][['Country','Happiness Score']]
score2017 = report2017.loc[report2017['Country'].map(lambda x: x in countries)][['Country','Happiness Score']]

# Sort both by Country Names in Ascending Order, so it is easy to subtract later
score2015 = score2015.sort_values(by=['Country'])
score2017 = score2017.sort_values(by=['Country'])

In [30]:
score2015.head()

Unnamed: 0,Country,Happiness Score
152,AFGHANISTAN,3.575
94,ALBANIA,4.959
67,ALGERIA,5.605
136,ANGOLA,4.033
29,ARGENTINA,6.574


In [31]:
score2017.head()

Unnamed: 0,Country,Happiness Score
140,AFGHANISTAN,3.794
108,ALBANIA,4.644
52,ALGERIA,5.872
139,ANGOLA,3.795
23,ARGENTINA,6.599


In [37]:
scorediff = list(score2015['Happiness Score'].values - score2017['Happiness Score'].values)
for i in range(0,len(scorediff)):
    if scorediff[i]<0:
        print('The Happiness of the country',countries[i],'decreased from 2015 to 2017')

The Happiness of the country AFGHANISTAN decreased from 2015 to 2017
The Happiness of the country ALGERIA decreased from 2015 to 2017
The Happiness of the country ARGENTINA decreased from 2015 to 2017
The Happiness of the country ARMENIA decreased from 2015 to 2017
The Happiness of the country AZERBAIJAN decreased from 2015 to 2017
The Happiness of the country BAHRAIN decreased from 2015 to 2017
The Happiness of the country BENIN decreased from 2015 to 2017
The Happiness of the country BOSNIA AND HERZEGOVINA decreased from 2015 to 2017
The Happiness of the country BULGARIA decreased from 2015 to 2017
The Happiness of the country BURKINA FASO decreased from 2015 to 2017
The Happiness of the country CAMBODIA decreased from 2015 to 2017
The Happiness of the country CAMEROON decreased from 2015 to 2017
The Happiness of the country CHAD decreased from 2015 to 2017
The Happiness of the country CHINA decreased from 2015 to 2017
The Happiness of the country CONGO (BRAZZAVILLE) decreased from 2

### 5. Find the country whose happiness decreased by the most amount
Since we already have the difference in scores of Happiness from initial year (2015) to final year (2017), the minimum of that list will be the country with the most decrease in happiness

In [38]:
indexMostDecrease = scorediff.index(min(scorediff))

In [40]:
print("The Country whose happiness is decreased by most amount is :", countries[indexMostDecrease])

The Country whose happiness is decreased by most amount is : LATVIA
