In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the world-happiness 2019.csv file into a DataFrame.
data = pd.read_csv(filepath_or_buffer='../input/world-happiness/2019.csv')

In [None]:
# Preview the first five rows of the dataset.
data.head(n=5)

In [None]:
# Preview the last five rows of the dataset.
data.tail(n=5)

In [None]:
# `columns` holds the column labels (feature names) of the DataFrame
data.columns

In [None]:
# `shape` gives the number of rows and columns as a tuple: (rows, columns)
data.shape

In [None]:
# info() prints a summary of the DataFrame: its class, the number of rows (samples),
# the number of columns (features), each column's dtype and the memory usage
data.info()


# EXPLORATORY DATA ANALYSIS

<br> value_counts(): Frequency counts
<br> Outliers: values that are considerably higher or lower than the rest of the data

* Let's say the value at 75% is Q3 and the value at 25% is Q1
* Outliers are smaller than Q1 - 1.5(Q3-Q1) or bigger than Q3 + 1.5(Q3-Q1), where (Q3-Q1) = IQR
<br> We will use the describe() method. The describe method includes:
* count: Number of entries
* mean: average of entries
* std: standard deviation
* min: minimum entry
* 25%: first quantile
* 50%: median or second quantile
* 75%: third quantile
* max: maximum entry


**What is quantile?**
* 1,4,5,6,8,9,11,12,13,14,15,16,17
* The median is the number that is in the middle of the sequence. In this case it would be 11.
* The lower quartile is the median of the values between 1 and 11, which is 6.
* The upper quartile is the median of the values between 11 and 17, which is 14.



In [None]:
# Let's look at the frequency of each value in the 'Country or region' column
print(data['Country or region'].value_counts(dropna = False)) # dropna = False: NaN values, if any, are counted too

In [None]:
# Summary statistics (count, mean, std, min, quantiles, max) of the data
data.describe() # null entries are ignored

# VISUAL EXPLORATORY DATA ANALYSIS

* Box plots: visualize basic statistics like outliers, min/max or quantiles

In [None]:
# Box plot of 'Generosity', drawn as one box per distinct value of 'Freedom to make life choices'.
# NOTE(review): the `by` column looks like a continuous score, so most groups probably contain a
# single row — confirm this grouping is intended (a categorical column would suit `by` better).
# How to read a box (line colors depend on the active matplotlib style):
# line at the top of the box is the 75% quantile (Q3)
# line inside the box is the median (50%)
# line at the bottom of the box is the 25% quantile (Q1)
# whisker ends above/below the box mark the highest/lowest non-outlier values
# (presumably max/min within 1.5 * IQR with matplotlib's defaults — verify); points beyond them are outliers
data.boxplot(column = 'Generosity', by = 'Freedom to make life choices')

# TIDY DATA

<br> We tidy data with melt().

In [None]:
# First, build a small frame from the world happiness data so that melt is easier to follow.
data_new = data.head(n=5)  # keep only the first 5 rows
data_new

In [None]:
# Melt the small frame from wide to long format:
#   id_vars    -> column(s) kept as identifiers (not melted)
#   value_vars -> columns that are melted into 'variable' / 'value' pairs
melted = data_new.melt(
    id_vars='Country or region',
    value_vars=['Score', 'Social support'],
)
melted

# PIVOTING DATA
<br> Reverse of melting.

In [None]:
# Undo the melt:
#   index   -> one row per 'Country or region'
#   columns -> one column per distinct entry of 'variable'
#   values  -> cells are filled from 'value'
pd.pivot(melted, index='Country or region', columns='variable', values='value')

# CONCATENATING DATA
<br> We can concatenate two dataframe

In [None]:
# Build two small frames (first and last five rows) and stack them vertically.
data1 = data.head(n=5)
data2 = data.tail(n=5)
# axis='index' (same as axis=0) appends the frames row-wise;
# ignore_index=True gives the result a fresh 0..n-1 index
conc_data_row = pd.concat(objs=[data1, data2], axis='index', ignore_index=True)
conc_data_row

# DATA TYPES

<br>There are 5 basic data types: object (string), boolean, integer, float and categorical.
<br>We can convert between data types, e.g. from str to categorical or from int to float. Why is categorical important:
*  make dataframe smaller in memory
*  can be utilized for analysis especially for sklearn


In [None]:
# Show the data type of each column
data.dtypes

In [None]:
# Let's convert object (str) to categorical
# and make sure the numeric column is stored as float
data['Country or region'] = data['Country or region'].astype('category')
# Cast to 'float', not 'int': astype('int') truncates the fractional part of every value
# (and raises if the column contains NaN), which would silently destroy the column's
# information. Generosity is presumably a fractional score — verify against the dataset.
data['Generosity'] = data['Generosity'].astype('float')

In [None]:
# Show the data type of each column again to check the result of the conversion
data.dtypes

# MISSING DATA and TESTING WITH ASSERT

If we encounter missing data, here is what we can do:
* Leave as is
* drop them with dropna()
* fill missing value with fillna()
* fill missing values with test statistics like mean
<br> Assert statement: a check that you can turn on or off once you are done testing the program

In [None]:
# Let's check whether the world happiness data has NaN values
data.info()

In [None]:
# Let's check Generosity; dropna = False makes NaN, if present, show up as its own entry
data['Generosity'].value_counts(dropna = False)

In [None]:
# Let's drop the rows whose Generosity is NaN.
# dropna() without inplace returns a NEW DataFrame, so `data` itself is left untouched and
# can still be used afterwards to demonstrate filling missing values.
# Why not `data1 = data` followed by `data1["Generosity"].dropna(inplace=True)`:
#   * plain assignment only creates a second name for the same DataFrame, not a copy;
#   * an in-place dropna on a column selected from the frame acts on that temporary
#     Series and does not remove any row from the frame.
data1 = data.dropna(subset=["Generosity"])
# So does it work ?

In [None]:
# Let's see how the assert statement behaves
# Assert statement: raises AssertionError when its condition is false, otherwise does nothing
assert 1==1 # nothing happens because the condition is true

In [None]:
#In order to run all code, we need to make this line comment
#assert 1==2 # return error because it is false

In [None]:
# Passes silently as long as the Generosity column holds no NaN values
assert not data['Generosity'].isnull().any()

In [None]:
# Fill missing Generosity values with the placeholder 'empty'.
# The filled Series is assigned back to the column: calling fillna(..., inplace=True) on a
# column selected from the frame is a chained in-place operation, which newer pandas
# versions warn about and which does not update `data` under Copy-on-Write.
# NOTE(review): a string placeholder turns a numeric column into object dtype as soon as
# anything is filled — a statistic such as the column mean may be a better fill value.
data["Generosity"] = data["Generosity"].fillna('empty')

In [None]:
# Passes silently: the Generosity column has no NaN values
assert data['Generosity'].notna().all()

In [None]:
# # With assert statement we can check a lot of thing. For example
#assert data.columns[1] == 'Country or region'
#assert data.Score.dtypes == np.float64