# Preprocessing data untuk mendapatkan model regresi terbaik

In [1]:
import pandas as pd
import numpy as np

In [2]:
data=pd.read_csv('Life Expectancy.csv')
data

Unnamed: 0,Life expectancy,Adult Mortality,infant deaths,Income composition of resources,Schooling
0,65.0,263.0,62,0.479,10.1
1,59.9,271.0,64,0.476,10.0
2,59.9,268.0,66,0.470,9.9
3,59.5,272.0,69,0.463,9.8
4,59.2,275.0,71,0.454,9.5
5,58.8,279.0,74,0.448,9.2
6,58.6,281.0,77,0.434,8.9
7,58.1,287.0,80,0.433,8.7
8,57.5,295.0,82,0.415,8.4
9,57.3,295.0,84,0.405,8.1


In [3]:
data.describe()

Unnamed: 0,Life expectancy,Adult Mortality,infant deaths,Income composition of resources,Schooling
count,2928.0,2928.0,2938.0,2771.0,2775.0
mean,69.224932,164.796448,30.303948,0.627551,12.026667
std,9.523867,124.292079,117.926501,0.210904,3.951371
min,36.3,1.0,0.0,0.0,0.0
25%,63.1,74.0,0.0,0.493,10.1
50%,72.1,144.0,3.0,0.677,12.3
75%,75.7,228.0,22.0,0.779,14.3
max,89.0,723.0,1800.0,0.948,121.0


In [4]:
np.sum(data.isnull())

Life expectancy                     10
Adult Mortality                     10
infant deaths                        0
Income composition of resources    167
Schooling                          163
dtype: int64

### A. Mengatasi Missing Value

In [5]:
data['Life expectancy']=data['Life expectancy'].fillna((data['Life expectancy'].mean()))
data['Adult Mortality']=data['Adult Mortality'].fillna((data['Adult Mortality'].mean()))
data['Income composition of resources']=data['Income composition of resources'].fillna((data['Income composition of resources'].mean()))
data['Schooling']=data['Schooling'].fillna((data['Schooling'].mean()))
np.sum(data.isnull())

Life expectancy                    0
Adult Mortality                    0
infant deaths                      0
Income composition of resources    0
Schooling                          0
dtype: int64

In [6]:
data_hapus_missing_value = data
print(data_hapus_missing_value)

      Life expectancy  Adult Mortality  infant deaths  \
0                65.0            263.0             62   
1                59.9            271.0             64   
2                59.9            268.0             66   
3                59.5            272.0             69   
4                59.2            275.0             71   
5                58.8            279.0             74   
6                58.6            281.0             77   
7                58.1            287.0             80   
8                57.5            295.0             82   
9                57.3            295.0             84   
10               57.3            291.0             85   
11               57.0            293.0             87   
12               56.7            295.0             87   
13               56.2              3.0             88   
14               55.3            316.0             88   
15               54.8            321.0             88   
16               77.8          

In [7]:
data_hapus_missing_value.to_excel('A. Data Life Expectancy tanpa missing value.xlsx')

### B. Mengatasi Missing Value dan Outlier

In [8]:
from scipy import stats
z = np.abs(stats.zscore(data_hapus_missing_value._get_numeric_data()))
print(z)
data_hapus_missing_value_dan_outlier = data_hapus_missing_value[(z < 3).all(axis=1)]
print(data_hapus_missing_value_dan_outlier.shape)

[[0.44444792 0.79158632 0.26882378 0.72540055 0.5018009 ]
 [0.98094995 0.85607167 0.28578638 0.74005007 0.52784593]
 [0.98094995 0.83188966 0.30274898 0.7693491  0.55389096]
 ...
 [2.56941673 0.73994077 0.04498439 0.97932554 0.52784593]
 [2.5168185  4.20124926 0.04498439 0.97932554 0.57993599]
 [2.44318096 4.03197521 0.05346569 0.94514333 0.57993599]]
(2731, 5)


In [9]:
data_hapus_missing_value_dan_outlier.to_excel('B. Data Life Expectancy tanpa missing value dan outlier.xlsx')

### C. Mengatasi Missing Value dan Melakukan Transformasi Data

In [10]:
from sklearn.preprocessing import normalize
norm = normalize(data_hapus_missing_value, norm = 'l2')
norm = pd.DataFrame(norm)
norm.tail()

Unnamed: 0,0,1,2,3,4
2933,0.06111,0.997355,0.037246,0.000561,0.012691
2934,0.062071,0.997324,0.036266,0.000583,0.013251
2935,0.498974,0.81306,0.278445,0.004756,0.111378
2936,0.065841,0.997066,0.036336,0.000621,0.014244
2937,0.068956,0.996862,0.035977,0.000651,0.014691


In [11]:
transformasi_sigmoidal = (1-2.718281828**(-norm))/(1+2.718281828**(-norm))
transformasi_sigmoidal = pd.DataFrame(transformasi_sigmoidal)
transformasi_sigmoidal.tail()

Unnamed: 0,0,1,2,3,4
2933,0.030546,0.461076,0.018621,0.000281,0.006345
2934,0.031026,0.461064,0.018131,0.000292,0.006625
2935,0.244436,0.385522,0.13833,0.002378,0.055632
2936,0.032909,0.460963,0.018166,0.00031,0.007122
2937,0.034464,0.460882,0.017987,0.000325,0.007345


In [12]:
transformasi_sigmoidal.to_excel('C. Data Life Expectancy tanpa missing value dan transformasi.xlsx')

### D. Mengatasi Missing Value, Outlier, dan Melakukan Transformasi Data

In [13]:
from sklearn.preprocessing import normalize
norm2 = normalize(data_hapus_missing_value_dan_outlier, norm = 'l2')
norm2 = pd.DataFrame(norm2)
norm2.tail()

Unnamed: 0,0,1,2,3,4
2726,0.117262,0.991063,0.059806,0.000965,0.021573
2727,0.098777,0.993428,0.054667,0.000822,0.018851
2728,0.534754,0.768852,0.332787,0.004751,0.110164
2729,0.831026,0.128132,0.512527,0.007468,0.173893
2730,0.498974,0.81306,0.278445,0.004756,0.111378


In [14]:
transformasi_sigmoidal2 = (1-2.718281828**(-norm2))/(1+2.718281828**(-norm2))
transformasi_sigmoidal2 = pd.DataFrame(transformasi_sigmoidal2)
transformasi_sigmoidal2.tail()

Unnamed: 0,0,1,2,3,4
2726,0.058564,0.458596,0.029894,0.000483,0.010786
2727,0.049349,0.459529,0.027327,0.000411,0.009425
2728,0.261182,0.366545,0.164874,0.002375,0.055026
2729,0.393144,0.063978,0.250797,0.003734,0.086728
2730,0.244436,0.385522,0.13833,0.002378,0.055632


In [15]:
transformasi_sigmoidal2.to_excel('D. Data Life Expectancy tanpa missing value, outlier dan transformasi.xlsx')