In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("AQI and Lat Long of Countries.csv")

## ****DATA SPLIT****

#### Pembagian dataset dengan training set (70%) dan Testing set (30%)

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
x = df.drop('AQI Category', axis=1)
y = df['AQI Category']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=20)

In [5]:
print("Dimensi x_train = ", x_train.shape)
print("Dimensi x_test = ", x_test.shape)
print("Dimensi y_train = ", y_train.shape)
print("Dimensi y_test = ", y_test.shape)

Dimensi x_train =  (16675, 13)
Dimensi x_test =  (20, 13)
Dimensi y_train =  (16675,)
Dimensi y_test =  (20,)


## ****NORMALISASI****

#### Fungsi **MinMaxScaler** Digunakan Untuk Melakukan Normalisasi

In [6]:
from sklearn.preprocessing import MinMaxScaler

##### Disini Saya Melakukan Normalisasi Pada Kolom **AQI Value**

In [7]:
df_1 = df.copy()

scaler = MinMaxScaler()

scaled_data = scaler.fit_transform(df_1[['AQI Value']])
df_1['AQI Value'] = scaled_data

##### **Hasil Normalisasi**

In [8]:
print("Sebelum : ")
print(df['AQI Value'].describe())
print('\n', '\n')
print("Sesudah : ")
print(df_1['AQI Value'].describe())

Sebelum : 
count    16695.000000
mean        62.998682
std         43.091971
min          7.000000
25%         38.500000
50%         52.000000
75%         69.000000
max        500.000000
Name: AQI Value, dtype: float64

 

Sesudah : 
count    16695.000000
mean         0.113588
std          0.087408
min          0.000000
25%          0.063895
50%          0.091278
75%          0.125761
max          1.000000
Name: AQI Value, dtype: float64


## ****STANDARISASI****

#### Fungsi **StandardScaler** Digunakan Untuk Melakukan Standarisasi

In [9]:
from sklearn.preprocessing import StandardScaler

##### Disini Saya Melakukan Standarisasi Pada Kolom **AQI Value**

In [10]:
df_2 = df.copy()

scaler = StandardScaler()

scaled_data = scaler.fit_transform(df_2[['AQI Value']])
df_2['AQI Value'] = scaled_data

##### **Hasil Standarisasi**

In [11]:
print("Sebelum : ")
print(np.std(df['AQI Value']))
print('\n')
print("Sesudah : ")
print(np.std(df_2['AQI Value']))

Sebelum : 
43.090680046373095


Sesudah : 
1.0000000000000002


## ****MENANGANI NILAI NULL****

In [12]:
df_NotNull = df.copy()

#### Membuat Kolom **NO2 AQI Value** Menjadi Nilai Null ***(Int)***

In [13]:
df_NotNull.loc[12000:, 'PM2.5 AQI Value'] = np.nan
df.loc[12000:, 'PM2.5 AQI Value'] = np.nan

print("Nilai Null Pada Kolom   : ")
print(df_NotNull['PM2.5 AQI Value'].isna().sum())

Nilai Null Pada Kolom   : 
4695


#### Menangani Nilai Null Pada Kolom **PM2.5 AQI Value** Bertipe ***INT*** Dengan Strategy ***MEDIAN***

In [14]:
df_NotNull["PM2.5 AQI Value"].fillna(df_NotNull["PM2.5 AQI Value"].median(), inplace= True)

#### Menangani Nilai Null Pada Kolom **Country** Bertipe ***OBJECT*** Dengan Strategy ***MODUS***

In [15]:
df_NotNull["Country"].fillna(df_NotNull["Country"].mode()[0], inplace= True)

##### Memperlihatkan Nilai Null dan tidak Null

###### **BERNILAI NULL**

In [16]:
df.isna().sum()

Country                302
City                     0
AQI Value                0
AQI Category             0
CO AQI Value             0
CO AQI Category          0
Ozone AQI Value          0
Ozone AQI Category       0
NO2 AQI Value            0
NO2 AQI Category         0
PM2.5 AQI Value       4695
PM2.5 AQI Category       0
lat                      0
lng                      0
dtype: int64

###### **TIDAK BERNILAI NULL**

In [17]:
df_NotNull.isna().sum()

Country               0
City                  0
AQI Value             0
AQI Category          0
CO AQI Value          0
CO AQI Category       0
Ozone AQI Value       0
Ozone AQI Category    0
NO2 AQI Value         0
NO2 AQI Category      0
PM2.5 AQI Value       0
PM2.5 AQI Category    0
lat                   0
lng                   0
dtype: int64

## ****MENANGANI NILAI DUPLIKAT****

##### Kita cek apakah ada nilai duplikat

In [18]:
print("Nilai Duplikat: ")
df.duplicated().sum()

Nilai Duplikat: 


0

##### karena tidak ada nilai duplikat, kita buat ada nilai duplikat

In [27]:
df_NotNull.iloc[1000 ,:] = df_NotNull.iloc[12000,:]

print("Nilai duplikat : ")
df_NotNull.duplicated().sum()

Nilai duplikat : 


2

##### terus mengatasi nilai duplikat tersebut

In [28]:
df_NotNull.drop_duplicates(inplace=True)

print("Nilai duplikat : ")
df_NotNull.duplicated().sum()

Nilai duplikat : 


0

### ****MENGGANTI TIPE DATA SALAH SATU ATRIBUT BERTIPE ANGKA****

##### Melihat Atribut Bertipe Angka Disini Saya Akan Mengambil Atribut ***AQI Value*** 

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16695 entries, 0 to 16694
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Country             16393 non-null  object 
 1   City                16695 non-null  object 
 2   AQI Value           16695 non-null  int64  
 3   AQI Category        16695 non-null  object 
 4   CO AQI Value        16695 non-null  int64  
 5   CO AQI Category     16695 non-null  object 
 6   Ozone AQI Value     16695 non-null  int64  
 7   Ozone AQI Category  16695 non-null  object 
 8   NO2 AQI Value       16695 non-null  int64  
 9   NO2 AQI Category    16695 non-null  object 
 10  PM2.5 AQI Value     12000 non-null  float64
 11  PM2.5 AQI Category  16695 non-null  object 
 12  lat                 16695 non-null  float64
 13  lng                 16695 non-null  float64
dtypes: float64(3), int64(4), object(7)
memory usage: 1.8+ MB


**SEBELUM**

In [22]:
df_NotNull["AQI Value"].dtypes

dtype('int64')

**SESUDAH**

In [23]:
df_NotNull["AQI Value"] = df_NotNull["AQI Value"].astype(float)

df_NotNull["AQI Value"].dtypes

dtype('float64')

# ****MELAKUKAN ONE HOT ENCODING****

In [24]:
Hot = pd.get_dummies(df[["City"]])
df_Hot = df.join(Hot)

**Menampilkan Hasil One Hot Encoding**

In [25]:
df_Hot

Unnamed: 0,Country,City,AQI Value,AQI Category,CO AQI Value,CO AQI Category,Ozone AQI Value,Ozone AQI Category,NO2 AQI Value,NO2 AQI Category,...,City_Zvishavane,City_Zvolen,City_Zwedru,City_Zwettl,City_Zwevegem,City_Zwickau,City_Zwiesel,City_Zwijndrecht,City_Zwolle,City_Zyryanovsk
0,Russian Federation,Praskoveya,51,Moderate,1,Good,36,Good,0,Good,...,0,0,0,0,0,0,0,0,0,0
1,Brazil,Presidente Dutra,41,Good,1,Good,5,Good,1,Good,...,0,0,0,0,0,0,0,0,0,0
2,Brazil,Presidente Dutra,41,Good,1,Good,5,Good,1,Good,...,0,0,0,0,0,0,0,0,0,0
3,Italy,Priolo Gargallo,66,Moderate,1,Good,39,Good,2,Good,...,0,0,0,0,0,0,0,0,0,0
4,Poland,Przasnysz,34,Good,1,Good,34,Good,0,Good,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16690,United States of America,Highland Springs,54,Moderate,1,Good,34,Good,5,Good,...,0,0,0,0,0,0,0,0,0,0
16691,Slovakia,Martin,71,Moderate,1,Good,39,Good,1,Good,...,0,0,0,0,0,0,0,0,0,0
16692,Slovakia,Martin,71,Moderate,1,Good,39,Good,1,Good,...,0,0,0,0,0,0,0,0,0,0
16693,France,Sceaux,50,Good,1,Good,20,Good,5,Good,...,0,0,0,0,0,0,0,0,0,0
