# PROJECT ANALISIS DATA - AIR POLLUTION CITY IN CHINA

Pertanyaan:
1. Lima stasiun mana yang memiliki tingkat polutan SO2, NO2, CO, dan O3 tertinggi (terburuk)?
2. Bagaimana tren PM2.5 dan PM10 dari 01/03/2013 sampai 28/02/2017?
3. Bagaimana hubungan PM2.5 dan PM10 dengan parameter cuaca seperti temperatur, tekanan udara, titik embun, curah hujan, dan kecepatan angin?

--------------------------------------------------------------------------------------------------------------------------------------------------------
# Data Wrangling
----


## Gathering Data
---

In [227]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# One CSV per monitoring station. The order of this list is also the order
# in which the station rows are stacked in the combined frame.
station_csv_paths = [
    'Data/PRSA_Data_Aotizhongxin_20130301-20170228.csv',
    'Data/PRSA_Data_Changping_20130301-20170228.csv',
    'Data/PRSA_Data_Dingling_20130301-20170228.csv',
    'Data/PRSA_Data_Dongsi_20130301-20170228.csv',
    'Data/PRSA_Data_Guanyuan_20130301-20170228.csv',
    'Data/PRSA_Data_Gucheng_20130301-20170228.csv',
    'Data/PRSA_Data_Huairou_20130301-20170228.csv',
    'Data/PRSA_Data_Nongzhanguan_20130301-20170228.csv',
    'Data/PRSA_Data_Shunyi_20130301-20170228.csv',
    'Data/PRSA_Data_Tiantan_20130301-20170228.csv',
    'Data/PRSA_Data_Wanliu_20130301-20170228.csv',
    'Data/PRSA_Data_Wanshouxigong_20130301-20170228.csv',
]

# Read every file once, then bind each per-station frame to its own name.
station_frames = [pd.read_csv(csv_path) for csv_path in station_csv_paths]
(
    aotizhongxin_df,
    changping_df,
    dingling_df,
    dongsi_df,
    guanyuan_df,
    gucheng_df,
    huairou_df,
    nongzhanguan_df,
    shunyi_df,
    tiantan_df,
    wanliu_df,
    wanshouxigong_df,
) = station_frames

# Stack all stations into a single frame with a fresh 0..n-1 index.
all_data_df = pd.concat(station_frames, ignore_index=True)

# Build a single timestamp column from the year/month/day/hour parts.
# pd.to_datetime assembles datetimes directly from a frame whose columns are
# named after date units, which avoids joining strings row by row and then
# re-parsing them.
date_parts = ['year', 'month', 'day', 'hour']
all_data_df.insert(1, 'Date', pd.to_datetime(all_data_df[date_parts]))

# The separate parts are redundant once 'Date' exists.
all_data_df = all_data_df.drop(columns=date_parts)


## Assessing Data
- Kolom dengan data hilang: [PM2.5, PM10, SO2, NO2, CO, O3, TEMP, PRES, DEWP, RAIN, wd, WSPM]
- Tidak ada duplikat
- Tidak ada inaccurate value
---

In [219]:
# Structural assessment of the combined frame. In the recorded output only
# info(), the printed duplicate count and the final describe() table appear;
# the head() and isna().sum() results are evaluated but not displayed.
all_data_df.head()
all_data_df.info()
all_data_df.isna().sum()

# Count rows that are exact duplicates of an earlier row.
duplicate_count = all_data_df.duplicated().sum()
print(f"Jumlah Duplikat: {duplicate_count}")

# Summary statistics; as the last expression this is what the cell renders.
all_data_df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420768 entries, 0 to 420767
Data columns (total 15 columns):
 #   Column   Non-Null Count   Dtype         
---  ------   --------------   -----         
 0   No       420768 non-null  int64         
 1   Date     420768 non-null  datetime64[ns]
 2   PM2.5    412029 non-null  float64       
 3   PM10     414319 non-null  float64       
 4   SO2      411747 non-null  float64       
 5   NO2      408652 non-null  float64       
 6   CO       400067 non-null  float64       
 7   O3       407491 non-null  float64       
 8   TEMP     420370 non-null  float64       
 9   PRES     420375 non-null  float64       
 10  DEWP     420365 non-null  float64       
 11  RAIN     420378 non-null  float64       
 12  wd       418946 non-null  object        
 13  WSPM     420450 non-null  float64       
 14  station  420768 non-null  object        
dtypes: datetime64[ns](1), float64(11), int64(1), object(2)
memory usage: 48.2+ MB
Jumlah Duplikat: 0


Unnamed: 0,No,Date,PM2.5,PM10,SO2,NO2,CO,O3,TEMP,PRES,DEWP,RAIN,WSPM
count,420768.0,420768,412029.0,414319.0,411747.0,408652.0,400067.0,407491.0,420370.0,420375.0,420365.0,420378.0,420450.0
mean,17532.5,2015-03-01 11:30:00.000001024,79.793428,104.602618,15.830835,50.638586,1230.766454,57.372271,13.538976,1010.746982,2.490822,0.064476,1.729711
min,1.0,2013-03-01 00:00:00,2.0,2.0,0.2856,1.0265,100.0,0.2142,-19.9,982.4,-43.4,0.0,0.0
25%,8766.75,2014-03-01 05:45:00,20.0,36.0,3.0,23.0,500.0,11.0,3.1,1002.3,-8.9,0.0,0.9
50%,17532.5,2015-03-01 11:30:00,55.0,82.0,7.0,43.0,900.0,45.0,14.5,1010.4,3.1,0.0,1.4
75%,26298.25,2016-02-29 17:15:00,111.0,145.0,20.0,71.0,1500.0,82.0,23.3,1019.0,15.1,0.0,2.2
max,35064.0,2017-02-28 23:00:00,999.0,999.0,500.0,290.0,10000.0,1071.0,41.6,1042.8,29.1,72.5,13.2
std,10122.116943,,80.822391,91.772426,21.650603,35.127912,1160.182716,56.661607,11.436139,10.474055,13.793847,0.821004,1.246386


## Cleaning Data
List data yang hilang 
- PM2.5 = 8739
- PM10 = 6449
- SO2 = 9021
- NO2 = 12116
- CO = 20701
- O3 = 13277
- TEMP = 398
- PRES = 393
- DEWP = 403
- RAIN = 390
- wd = 1822
- WSPM = 318
---

In [231]:
# Columns that contain missing values (see the counts listed above).
missing_value = ["PM2.5", "PM10", "SO2", "NO2", "CO", "O3", "TEMP", "PRES", "DEWP", "RAIN", "wd", "WSPM"]

# Impute each column with its most frequent value.
# NOTE(review): mode imputation pulls continuous measurements toward a single
# value (the describe() output recorded in this notebook shows the PM2.5 mean
# moving from 79.79 to 78.20); confirm this is acceptable for the later
# trend analysis.
for val in missing_value:
    modes = all_data_df[val].mode()
    if modes.empty:
        # Column is entirely NaN: there is no mode to fill with, so leave it.
        continue
    all_data_df[val] = all_data_df[val].fillna(modes.iloc[0])

# Re-check the missing counts and summary statistics after imputation.
all_data_df.isna().sum()
all_data_df.describe()

Unnamed: 0,No,Date,PM2.5,PM10,SO2,NO2,CO,O3,TEMP,PRES,DEWP,RAIN,WSPM
count,420768.0,420768,420768.0,420768.0,420768.0,420768.0,420768.0,420768.0,420768.0,420768.0,420768.0,420768.0,420768.0
mean,17532.5,2015-03-01 11:30:00.000001024,78.198493,103.091361,15.53431,49.641169,1184.974482,55.625043,13.529007,1010.75469,2.505293,0.064416,1.729235
min,1.0,2013-03-01 00:00:00,2.0,2.0,0.2856,1.0265,100.0,0.2142,-19.9,982.4,-43.4,0.0,0.0
25%,8766.75,2014-03-01 05:45:00,19.0,34.0,2.0,21.0,400.0,8.0,3.1,1002.3,-8.9,0.0,0.9
50%,17532.5,2015-03-01 11:30:00,53.0,81.0,7.0,42.0,800.0,43.0,14.5,1010.4,3.1,0.0,1.4
75%,26298.25,2016-02-29 17:15:00,109.0,144.0,19.0,70.0,1500.0,80.0,23.2,1019.0,15.1,0.0,2.2
max,35064.0,2017-02-28 23:00:00,999.0,999.0,500.0,290.0,10000.0,1071.0,41.6,1042.8,29.1,72.5,13.2
std,10122.116943,,80.725003,91.868512,21.510746,35.099748,1149.054714,56.594407,11.43532,10.472197,13.795159,0.820626,1.246035


# Exploratory Data Analysis
---