In [17]:
!pip install fuzzywuzzy
!pip install ydata-profiling
!pip install python-Levenshtein
!pip install ipywidgets



#### Importing libreries:

In [18]:
import numpy as np
import pandas as pd
from collections import OrderedDict
from ydata_profiling import ProfileReport
import time
import datetime
from fuzzywuzzy import process
import re
import seaborn as sns
import matplotlib.pyplot as plt

In [19]:
pd.set_option('display.max_columns', None)

# Hypotesis 1: 

## People are happier in countries with higher productivity levels.

Data sources available to analyse H1:
- GDP per Capita vs Working hours
- Productivity vs Working Hours  

#### Loading the DataFrames:

### GDP per Capita vs Working hours

**Source:** Our World in Data

[Annual working hours vs. GDP per capita dataset](https://ourworldindata.org/grapher/annual-working-hours-vs-gdp-per-capita-pwt)

_Working hours are the annual average per worker._

In [20]:
working_hours = pd.read_csv("1.Annual-Working-Hours-vs-GDP-per-capita-pwt.csv")

In [21]:
working_hours.columns

Index(['Entity', 'Code', 'Year', 'Annual working hours per worker',
       'GDP per capita (output, multiple price benchmarks)',
       'Population (historical estimates)', 'Continent'],
      dtype='object')

In [22]:
working_hours.dtypes

Entity                                                 object
Code                                                   object
Year                                                    int64
Annual working hours per worker                       float64
GDP per capita (output, multiple price benchmarks)    float64
Population (historical estimates)                     float64
Continent                                              object
dtype: object

In [23]:
working_hours.shape

(56777, 7)

In [24]:
working_hours

Unnamed: 0,Entity,Code,Year,Annual working hours per worker,"GDP per capita (output, multiple price benchmarks)",Population (historical estimates),Continent
0,Abkhazia,OWID_ABK,2015,,,,Asia
1,Afghanistan,AFG,0,,,2000000.0,
2,Afghanistan,AFG,100,,,2250000.0,
3,Afghanistan,AFG,200,,,2500000.0,
4,Afghanistan,AFG,300,,,2500000.0,
...,...,...,...,...,...,...,...
56772,Zimbabwe,ZWE,2019,,2787.659,15271377.0,
56773,Zimbabwe,ZWE,2020,,,15526888.0,
56774,Zimbabwe,ZWE,2021,,,15797220.0,
56775,Zimbabwe,ZWE,2022,,,16069061.0,


In [25]:
# Pivot table: Calculating the average of working hours by continent, country and year
working_hours.pivot_table(
    values = "Annual working hours per worker",
    index = ["Continent", "Entity"],
    aggfunc = "mean"
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Annual working hours per worker
Continent,Entity,Unnamed: 2_level_1
Asia,Bangladesh,2418.7507
Asia,Cambodia,2458.3090
Asia,China,2170.4592
Asia,Hong Kong,2185.5137
Asia,India,2118.8215
...,...,...
South America,Chile,1988.0000
South America,Colombia,1997.0325
South America,Ecuador,1620.1982
South America,Peru,2133.8416


In [26]:
# Sorting by multiple columns in descending order (main GDP, then Annual Working Hours)
working_hours.sort_values(by=["GDP per capita (output, multiple price benchmarks)", "Annual working hours per worker"], ascending=False)

Unnamed: 0,Entity,Code,Year,Annual working hours per worker,"GDP per capita (output, multiple price benchmarks)",Population (historical estimates),Continent
41324,Qatar,QAT,2012,,169200.27,1907121.0,
41323,Qatar,QAT,2011,,167820.64,1810958.0,
41325,Qatar,QAT,2013,,164968.48,2032650.0,
41320,Qatar,QAT,2008,,161521.75,1426942.0,
41326,Qatar,QAT,2014,,147262.56,2218377.0,
...,...,...,...,...,...,...,...
56706,Zimbabwe,ZWE,1953,,,3067169.0,
56773,Zimbabwe,ZWE,2020,,,15526888.0,
56774,Zimbabwe,ZWE,2021,,,15797220.0,
56775,Zimbabwe,ZWE,2022,,,16069061.0,


In [27]:
#TODO Rank 1: Countries with the highest GDP per capita

In [None]:
#TODO Rank 2: Countries with less working hours

In [None]:
#TODO Index A: Ranking of countries in a combination between Rank 1 and 2.

In [28]:
profile = ProfileReport(working_hours, title="Profiling Report")

In [29]:
profile

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



### Productivity vs Working Hours

**Source:** Our World in Data

[Annual working hours vs. labor productivity](https://ourworldindata.org/grapher/productivity-vs-annual-hours-worked)

_Labor productivity is measured as GDP per hour of work._

In [31]:
productivity = pd.read_csv("2.Productivity-vs-Annual-Hours-Worked.csv")

In [32]:
productivity
# Entity is the countries names.
# Code stands for the country code (alpha-3).
# Year for the observation.
# Annual working hours per worker 

Unnamed: 0,Entity,Code,Year,Annual working hours per worker,Productivity: output per hour worked,Population (historical estimates),Continent
0,Abkhazia,OWID_ABK,2015,,,,Asia
1,Afghanistan,AFG,-10000,,,14737.0,
2,Afghanistan,AFG,-9000,,,20405.0,
3,Afghanistan,AFG,-8000,,,28253.0,
4,Afghanistan,AFG,-7000,,,39120.0,
...,...,...,...,...,...,...,...
58631,Zimbabwe,ZWE,2019,,,15271377.0,
58632,Zimbabwe,ZWE,2020,,,15526888.0,
58633,Zimbabwe,ZWE,2021,,,15797220.0,
58634,Zimbabwe,ZWE,2022,,,16069061.0,


In [33]:
productivity.shape
# Productivity has 58,636 entries and 7 columns

(58636, 7)

In [34]:
productivity.dtypes

Entity                                   object
Code                                     object
Year                                      int64
Annual working hours per worker         float64
Productivity: output per hour worked    float64
Population (historical estimates)       float64
Continent                                object
dtype: object

In [35]:
productivity.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58636 entries, 0 to 58635
Data columns (total 7 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Entity                                58636 non-null  object 
 1   Code                                  55413 non-null  object 
 2   Year                                  58636 non-null  int64  
 3   Annual working hours per worker       3457 non-null   float64
 4   Productivity: output per hour worked  3457 non-null   float64
 5   Population (historical estimates)     58589 non-null  float64
 6   Continent                             285 non-null    object 
dtypes: float64(3), int64(1), object(3)
memory usage: 3.1+ MB


In [36]:
productivity.columns

Index(['Entity', 'Code', 'Year', 'Annual working hours per worker',
       'Productivity: output per hour worked',
       'Population (historical estimates)', 'Continent'],
      dtype='object')

In [37]:
productivity.sort_values(by="Productivity: output per hour worked", ascending=False)
#TODO filtar por population 

Unnamed: 0,Entity,Code,Year,Annual working hours per worker,Productivity: output per hour worked,Population (historical estimates),Continent
39048,Norway,NOR,2012,1396.2837,129.025420,5018456.0,
39049,Norway,NOR,2013,1386.7217,125.781000,5080125.0,
25154,Ireland,IRL,2019,1771.9779,125.092540,4933491.0,
39047,Norway,NOR,2011,1400.1548,124.166626,4952968.0,
39044,Norway,NOR,2008,1406.7863,123.317750,4768099.0,
...,...,...,...,...,...,...,...
58631,Zimbabwe,ZWE,2019,,,15271377.0,
58632,Zimbabwe,ZWE,2020,,,15526888.0,
58633,Zimbabwe,ZWE,2021,,,15797220.0,
58634,Zimbabwe,ZWE,2022,,,16069061.0,


In [38]:
#TODO Rank 3: Sort countries with the highest productivity

In [None]:
#TODO Rank 4: Sort countries with less working hours

In [None]:
#TODO Index B: Ranking of countries in a combination between Rank 3 and 4

In [39]:
profile = ProfileReport(productivity, title="Profiling Report")

In [40]:
profile

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

