## Tugas Pertemuan 4
***5210411135*** - I Gede Widiantara

### Import all packages

In [332]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

### Read File CSV

In [333]:
# Load the assignment data set.
# The path is a raw string on purpose: in a normal literal the "\521" that
# follows the first folder is parsed as an octal escape (one single character),
# and "\M" / "\d" are invalid escapes, so the path would be corrupted.
# NOTE(review): this is an absolute local path; consider a configurable data directory.
rawData = pd.read_csv(r'D:\Machine Learning/5210411135\data/tugas_pert4.csv')

# Reorder the columns so that the features can be sliced by position later on.
# The numeric column names are selected with their surrounding spaces, which
# is how this selection expects them to appear in the loaded frame.
rawData = rawData[['jenis_sekolah', 'tingkat_sekolah', ' jumlah_sekolah ', ' jumlah_guru ', ' jumlah_murid ', ' jumlah_kelas ']]

rawData.head()

Unnamed: 0,jenis_sekolah,tingkat_sekolah,jumlah_sekolah,jumlah_guru,jumlah_murid,jumlah_kelas
0,Negeri,TK,9.0,58,521.0,35
1,Swasta,TK,1781.0,8646,92775.0,5179
2,Negeri,TK,9.0,58,673.0,35
3,Swasta,TK,1432.0,10064,81050.0,4524
4,,TK,9.0,103,836.0,35


### Cek Dimensi & Missing Value

In [334]:
# Print the frame's shape, then the number of missing values per column.
print(rawData.shape, rawData.isna().sum(), sep='\n')

(50, 6)
jenis_sekolah       1
tingkat_sekolah     0
 jumlah_sekolah     1
 jumlah_guru        0
 jumlah_murid       1
 jumlah_kelas       0
dtype: int64


### Fill Missing Values

#### Manual

In [335]:
# Manually fill the missing school type (a categorical value) in the row labelled 4.
rawData.at[4, 'jenis_sekolah'] = 'Negeri'
# Show the remaining missing values per column.
rawData.isna().sum()

jenis_sekolah       0
tingkat_sekolah     0
 jumlah_sekolah     1
 jumlah_guru        0
 jumlah_murid       1
 jumlah_kelas       0
dtype: int64

#### Simple Imputer

In [336]:
# Fill the missing values of the numeric count columns (every column from the
# third one onwards) with the column mean, using SimpleImputer.
# The array keeps its original name `nominal`, although the values are numeric.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
nominal = imp.fit_transform(rawData.iloc[:, 2:].to_numpy())

# Write the imputed values back into the frame and confirm nothing is missing.
rawData[[' jumlah_sekolah ',' jumlah_guru ', ' jumlah_murid ', ' jumlah_kelas ']] = nominal
rawData.isna().sum()

jenis_sekolah       0
tingkat_sekolah     0
 jumlah_sekolah     0
 jumlah_guru        0
 jumlah_murid       0
 jumlah_kelas       0
dtype: int64

### Encoding Data Kategorikal ke Numerik

In [337]:
# Build an all-numeric copy of the data by label-encoding every text column.
numData = rawData.copy()

# Keep one fitted encoder per column so that each integer code can be mapped
# back to its category later, e.g.
# label_encoders['jenis_sekolah'].inverse_transform(...).
# Refitting a single shared encoder would discard the mapping of every column
# except the last one.
label_encoders = {}
lbenc = LabelEncoder()
for col in numData.columns:
    column = numData[col]
    # Accept the legacy object dtype as well as pandas' dedicated string dtype.
    if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
        lbenc = LabelEncoder()  # fresh encoder for this column
        numData[col] = lbenc.fit_transform(column)
        label_encoders[col] = lbenc

numData.head()

Unnamed: 0,jenis_sekolah,tingkat_sekolah,jumlah_sekolah,jumlah_guru,jumlah_murid,jumlah_kelas
0,0,4,9.0,58.0,521.0,35.0
1,1,4,1781.0,8646.0,92775.0,5179.0
2,0,4,9.0,58.0,673.0,35.0
3,1,4,1432.0,10064.0,81050.0,4524.0
4,0,4,9.0,103.0,836.0,35.0


### Split Data Training & Data Testing

In [338]:
dataTT = numData.copy()

# Label = first column; features = all remaining columns.
lb_y = dataTT.iloc[:, 0].to_numpy()
ft_x = dataTT.iloc[:, 1:].to_numpy()

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    ft_x,
    lb_y,
    test_size=0.3,
    random_state=0,
)

# Report the dimensions of the full data and of every split.
dimensi = f'dimensi awal\t: {dataTT.shape}\ndimensi x_train\t: {x_train.shape}\ndimensi x_test\t: {x_test.shape}\ndimensi y_train\t: {y_train.shape}\ndimensi y_test\t: {y_test.shape}\n'
print(dimensi)


dimensi awal	: (50, 6)
dimensi x_train	: (35, 5)
dimensi x_test	: (15, 5)
dimensi y_train	: (35,)
dimensi y_test	: (15,)



### Normalisasi

In [345]:
# Work on a copy of the training features. Column 0 is kept unscaled;
# the remaining columns are the ones that get scaled.
data_x = x_train.copy()
ft_nominal = data_x[:, 1:]

# Min-max scale those columns (scaler fitted on the training features only).
mnmx = MinMaxScaler()
ft_nominal = mnmx.fit(ft_nominal).transform(ft_nominal)

# Put the unscaled first column back in front of the scaled columns.
minmax_x = np.hstack((data_x[:, :1], ft_nominal))
print(minmax_x)

# Build a DataFrame of the normalised training data: label first, then features.
label_y = y_train.reshape(-1, 1).copy()  # label as a 2-D column so it can be stacked
data = np.hstack((label_y, minmax_x))
cols = numData.columns.values
normalisasi = pd.DataFrame(data, columns=cols)
normalisasi.head()

[[4.00000000e+00 8.95164934e-01 3.56964108e-01 1.32119750e-01
  3.15679019e-01]
 [0.00000000e+00 9.93673746e-01 9.67090604e-01 9.85962739e-01
  1.00000000e+00]
 [2.00000000e+00 2.35427022e-01 4.21720202e-01 2.27670200e-01
  2.44601535e-01]
 [2.00000000e+00 2.44012653e-02 1.02190532e-01 6.33125163e-02
  6.30143039e-02]
 [3.00000000e+00 3.57433348e-01 4.10853245e-01 2.12885620e-01
  3.57044237e-01]
 [0.00000000e+00 3.75056484e-01 4.74306674e-01 3.19795312e-01
  4.68824212e-01]
 [1.00000000e+00 4.88025305e-02 2.37598985e-01 1.42087592e-01
  1.40332468e-01]
 [1.00000000e+00 4.88025305e-02 2.10380172e-01 1.34099759e-01
  1.30502016e-01]
 [0.00000000e+00 9.49841844e-01 9.27153680e-01 9.54284138e-01
  9.48970012e-01]
 [2.00000000e+00 2.44012653e-02 1.01676322e-01 6.28918607e-02
  6.30143039e-02]
 [3.00000000e+00 1.26073204e-01 4.44071167e-01 3.40892807e-01
  2.56144033e-01]
 [2.00000000e+00 2.31812020e-01 4.28439203e-01 2.39937010e-01
  2.45540399e-01]
 [4.00000000e+00 5.87437867e-03 2.193959

Unnamed: 0,jenis_sekolah,tingkat_sekolah,jumlah_sekolah,jumlah_guru,jumlah_murid,jumlah_kelas
0,1.0,4.0,0.895165,0.356964,0.13212,0.315679
1,0.0,0.0,0.993674,0.967091,0.985963,1.0
2,1.0,2.0,0.235427,0.42172,0.22767,0.244602
3,0.0,2.0,0.024401,0.102191,0.063313,0.063014
4,1.0,3.0,0.357433,0.410853,0.212886,0.357044


### Standardisasi

In [346]:
# Work on a copy of the training features. Column 0 is kept unscaled;
# the remaining columns are the ones that get standardised.
dt_x = x_train.copy()
ft_nom = dt_x[:, 1:]

# Standardise those columns (scaler fitted on the training features only).
stsc = StandardScaler()
ft_nom = stsc.fit(ft_nom).transform(ft_nom)

# Put the unscaled first column back in front of the standardised columns.
stn_x = np.hstack((dt_x[:, :1], ft_nom))
print(stn_x)

# Build a DataFrame of the standardised training data: label first, then features.
label_y = y_train.reshape(-1, 1).copy()  # label as a 2-D column so it can be stacked
dt = np.hstack((label_y, stn_x))
colm = numData.columns.values
standarisasi = pd.DataFrame(dt, columns=colm)
standarisasi.head()

[[ 4.00000000e+00  1.77802208e+00  2.60453719e-02 -4.32511614e-01
   1.22010561e-01]
 [ 0.00000000e+00  2.08618619e+00  2.58591339e+00  3.04823348e+00
   2.82737128e+00]
 [ 2.00000000e+00 -2.85829277e-01  2.97738297e-01 -4.29941445e-02
  -1.58983647e-01]
 [ 2.00000000e+00 -9.45978992e-01 -1.04289154e+00 -7.13008664e-01
  -8.76861626e-01]
 [ 3.00000000e+00  9.58418644e-02  2.52144514e-01 -1.03264421e-01
   2.85541767e-01]
 [ 0.00000000e+00  1.50972140e-01  5.18371938e-01  3.32559716e-01
   7.27447189e-01]
 [ 1.00000000e+00 -8.69644763e-01 -4.74767104e-01 -3.91877073e-01
  -5.71195820e-01]
 [ 1.00000000e+00 -8.69644763e-01 -5.88967306e-01 -4.24439982e-01
  -6.10059044e-01]
 [ 0.00000000e+00  1.94906729e+00  2.41835264e+00  2.91909365e+00
   2.62563184e+00]
 [ 2.00000000e+00 -9.45978992e-01 -1.04504897e+00 -7.14723493e-01
  -8.76861626e-01]
 [ 3.00000000e+00 -6.27919707e-01  3.91514786e-01  4.18564992e-01
  -1.13352109e-01]
 [ 2.00000000e+00 -2.97138051e-01  3.25928776e-01  7.01228296e-03

Unnamed: 0,jenis_sekolah,tingkat_sekolah,jumlah_sekolah,jumlah_guru,jumlah_murid,jumlah_kelas
0,1.0,4.0,1.778022,0.026045,-0.432512,0.122011
1,0.0,0.0,2.086186,2.585913,3.048233,2.827371
2,1.0,2.0,-0.285829,0.297738,-0.042994,-0.158984
3,0.0,2.0,-0.945979,-1.042892,-0.713009,-0.876862
4,1.0,3.0,0.095842,0.252145,-0.103264,0.285542
