In [4]:
import matplotlib.pyplot as plt # for ploting the graph
import numpy as np   # for mathematics
import pandas as pd   # This is used for panda
import sklearn.linear_model # this is used for linear model

In [5]:
#thousands: parses numbers written with commas as thousands separators (e.g. "50,000" is read as 50000).
#delimiter: the character that separates columns; set it to '\t' when the file uses tabs instead of commas as separators.
#encoding: the text encoding used to decode the file, for files with special characters that the default encoding does not interpret correctly.
#na_values: extra placeholders for missing data (e.g. "n/a") that should be interpreted as NaN.

# For example, a tab-separated file like this:
#Country\tGDP_per_capita
#United States\t50,000
#Canada\t45,000
#Germany\tn/a
#France\t42,000

# is parsed into this DataFrame:
#         Country  GDP_per_capita
#0  United States          50000.0
#1         Canada          45000.0
#2        Germany              NaN
#3         France          42000.0


In [29]:
# Read the two raw CSV files into DataFrames.
# thousands=',' makes pandas read values such as "1,000" or "1,000,000"
# as the numbers 1000 and 1000000.
oecd_bli = pd.read_csv(
    "all datasets/oecd_bli_2015.csv",
    thousands=',',
)
# The GDP file is read as comma-delimited text decoded as latin1;
# cells containing "n/a" are treated as missing values (NaN).
gdp_per_capita = pd.read_csv(
    "all datasets/gdp_per_capita.csv",
    thousands=',',
    delimiter=',',
    encoding='latin1',
    na_values="n/a",
)


In [30]:
# Quick look at the raw frames: dimensions first, then the first three rows of each.
print(oecd_bli.shape, gdp_per_capita.shape, sep='\n')
print(oecd_bli.head(3), gdp_per_capita.head(3), sep='\n')

# Strip leading/trailing whitespace from the 'Country' values in both frames,
# so that the same country name compares equal in both when matching on 'Country'.
oecd_bli['Country'] = oecd_bli['Country'].str.strip()
gdp_per_capita['Country'] = gdp_per_capita['Country'].str.strip()

# Show the first rows of each frame again after cleaning.
print(gdp_per_capita.head(), oecd_bli.head(), sep='\n')


(3292, 17)
(190, 7)
  LOCATION    Country INDICATOR                           Indicator MEASURE  \
0      AUS  Australia   HO_BASE  Dwellings without basic facilities       L   
1      AUT    Austria   HO_BASE  Dwellings without basic facilities       L   
2      BEL    Belgium   HO_BASE  Dwellings without basic facilities       L   

  Measure INEQUALITY Inequality Unit Code        Unit  PowerCode Code  \
0   Value        TOT      Total        PC  Percentage               0   
1   Value        TOT      Total        PC  Percentage               0   
2   Value        TOT      Total        PC  Percentage               0   

  PowerCode  Reference Period Code  Reference Period  Value Flag Codes  \
0     units                    NaN               NaN    1.1          E   
1     units                    NaN               NaN    1.0        NaN   
2     units                    NaN               NaN    2.0        NaN   

             Flags  
0  Estimated value  
1              NaN  
2         

In [35]:
# To keep things short and simple, life expectancy is treated as the
# only factor that determines happiness.
# Keep only the rows where Inequality is 'Total' and Indicator is 'Life expectancy'.
oecd_bli = oecd_bli.loc[
    oecd_bli['Inequality'].eq('Total')
    & oecd_bli['Indicator'].eq('Life expectancy')
]

print(oecd_bli.shape)    # far fewer rows remain after the filter
print(oecd_bli.head(3))  # this is a row filter only, so the columns are the same as before



(37, 17)
     LOCATION    Country INDICATOR        Indicator MEASURE Measure  \
2373      AUS  Australia    HS_LEB  Life expectancy       L   Value   
2374      AUT    Austria    HS_LEB  Life expectancy       L   Value   
2375      BEL    Belgium    HS_LEB  Life expectancy       L   Value   

     INEQUALITY Inequality Unit Code   Unit  PowerCode Code PowerCode  \
2373        TOT      Total        YR  Years               0     units   
2374        TOT      Total        YR  Years               0     units   
2375        TOT      Total        YR  Years               0     units   

      Reference Period Code  Reference Period  Value Flag Codes Flags  
2373                    NaN               NaN   82.1        NaN   NaN  
2374                    NaN               NaN   81.0        NaN   NaN  
2375                    NaN               NaN   80.5        NaN   NaN  


In [39]:
# Prepare the data
# Join the two frames on the 'Country' column, which exists in both.
# This is an inner join (the default for merge): a row is produced only where
# the same 'Country' value appears in both gdp_per_capita and oecd_bli.
combined_data = gdp_per_capita.merge(oecd_bli, how='inner', on='Country')
print(combined_data.shape)

# Extract the two columns of interest as independent single-column DataFrames.
bli_value = combined_data[['Value']].copy()
gdp_value = combined_data[['2015']].copy()

print(gdp_value.shape)
print(bli_value.shape)

(36, 23)
(36, 1)
(36, 1)
        2015
0  50961.865
1  43724.031
2  40106.632
3   8669.998
4  43331.961
   Value
0   82.1
1   81.0
2   80.5
3   73.7
4   81.5


In [40]:
# Give the single-column frames descriptive column names.
# NOTE(review): bli_value is taken from the 'Value' column of rows filtered to the
# 'Life expectancy' indicator, yet it is labelled 'Life satisfaction' here —
# confirm the intended label before changing it, since later cells may use this name.
bli_value.columns = ['Life satisfaction']
gdp_value.columns = ['GDP per capita']

# Put the two columns side by side, aligned on the row index.
country_stats = pd.concat([gdp_value, bli_value], axis='columns')

print(country_stats)

# Build 2-D column arrays (one row per country, one column each):
# y is the target, X is the single feature.
y = np.c_[country_stats['Life satisfaction']]
X = np.c_[country_stats['GDP per capita']]

    GDP per capita  Life satisfaction
0        50961.865               82.1
1        43724.031               81.0
2        40106.632               80.5
3         8669.998               73.7
4        43331.961               81.5
5        13340.905               78.9
6        17256.918               78.2
7        52114.165               80.1
8        17288.083               76.5
9        41973.988               80.7
10       37675.006               82.1
11       40996.511               81.0
12       18064.288               80.7
13       12239.894               75.2
14       50854.583               83.0
15       51350.744               81.0
16       35343.336               81.8
17       29866.581               82.3
18       32485.545               83.2
19       27195.197               81.3
20      101994.093               81.5
21        9009.280               74.6
22       43603.115               81.2
23       37044.891               81.5
24       74822.106               81.5
25       124