# Pandas

Pandas is a fast, powerful, flexible and easy to use open source data analysis and manipulation tool, built on top of the Python programming language. More informations [https://pandas.pydata.org/](https://pandas.pydata.org/). Options to download & install pandas:

- From Anaconda (no installation required)- 
From Miniconda: `$ conda create -c conda-forge -n {your_env} python pandas`- 
From PyPI: `$ pip install pandas`ndas


In [5]:
import pandas as pd

# Series

In [3]:
mahasiswa = ["Andi", "Budi", "Caca"]

In [4]:
mahasiswa

['Andi', 'Budi', 'Caca']

In [7]:
# 1. create series from list
sMahasiswa = pd.Series(mahasiswa)

In [8]:
sMahasiswa

0    Andi
1    Budi
2    Caca
dtype: object

In [9]:
sMahasiswa[0]

'Andi'

In [10]:
sMahasiswa[2]

'Caca'

In [11]:
sMahasiswa[1:3]

1    Budi
2    Caca
dtype: object

In [12]:
sMahasiswa[0:3:2]

0    Andi
2    Caca
dtype: object

In [13]:
type(sMahasiswa)

pandas.core.series.Series

In [14]:
type(mahasiswa)

list

In [15]:
# 2. create series from tuple
mahasiswa = ("Andi", "Budi", "Caca")
sMahasiswa = pd.Series(mahasiswa)
sMahasiswa

0    Andi
1    Budi
2    Caca
dtype: object

In [20]:
# 3. create series from dictionary
mahasiswa = {0:"Andi", 1:"Budi", 2:"Caca"}
sMahasiswa = pd.Series(mahasiswa)
sMahasiswa

0    Andi
1    Budi
2    Caca
dtype: object

In [21]:
# 4. create series from scalar value
mahasiswa = "Andi"
sMahasiswa = pd.Series(mahasiswa)
sMahasiswa

0    Andi
dtype: object

In [25]:
# 5. create series using name series
sMahasiswa = pd.Series(
    data = ("Andi", "Budi", "Caca"),
    name = 'FirstName'
)
sMahasiswa

0    Andi
1    Budi
2    Caca
Name: FirstName, dtype: object

In [26]:
sMahasiswa.name

'FirstName'

### Series Method

In [34]:
# data view
sMahasiswa = pd.Series(["Andi", "Budi", "Andi"])
print(sMahasiswa.shape) # (3,) dimensi 1, 3 baris data
print(sMahasiswa.describe())

(3,)
count        3
unique       2
top       Andi
freq         2
dtype: object


In [40]:
sMahasiswa = pd.Series([12, 13, 12, 11, 10, 9])
print(sMahasiswa.describe())
print(sMahasiswa.describe()['count'])
print(sMahasiswa.describe()['min'])
print(sMahasiswa.describe()['max'])
print(sMahasiswa.describe()['std'])
print(sMahasiswa.describe()['50%'])

count     6.000000
mean     11.166667
std       1.471960
min       9.000000
25%      10.250000
50%      11.500000
75%      12.000000
max      13.000000
dtype: float64
6.0
9.0
13.0
1.4719601443879746
11.5


In [48]:
# series method 
print(sMahasiswa.max())
print(sMahasiswa.min())
print(sMahasiswa.mean())
print(sMahasiswa.median())
print(sMahasiswa.mode()[0])
print(sMahasiswa.std())

13
9
11.166666666666666
11.5
12
1.4719601443879746


In [50]:
list(sMahasiswa)

[12, 13, 12, 11, 10, 9]

In [51]:
sMahasiswa.tolist()

[12, 13, 12, 11, 10, 9]

# DataFrame

In [55]:
# 1. create dataframe from list
x = [1, 2, 3, 4, 5]
dfX = pd.DataFrame(x, columns=['Kolom X'], index=list('abcde'))
dfX

Unnamed: 0,Kolom X
a,1
b,2
c,3
d,4
e,5


In [56]:
# 2. create dataframe from tuple
x = (1, 2, 3, 4, 5)
dfX = pd.DataFrame(x, columns=['Kolom X'], index=list('abcde'))
dfX

Unnamed: 0,Kolom X
a,1
b,2
c,3
d,4
e,5


In [60]:
# 3. create dataframe from set
x = {1, 2, 3, 4, 5}
dfX = pd.DataFrame(x, columns=['Kolom X'], index=list('abcde'))
dfX

Unnamed: 0,Kolom X
a,1
b,2
c,3
d,4
e,5


In [63]:
# 4. create dataframe from dictionary
x = {"x": [1, 2, 3, 4, 5]}
dfX = pd.DataFrame(x)
dfX

Unnamed: 0,x
0,1
1,2
2,3
3,4
4,5


In [86]:
# 5. create dataframe from series
x = pd.Series([1, 2, 3, 4, 5, 2])
dfX = pd.DataFrame(x, columns=['X'])
dfX

Unnamed: 0,X
0,1
1,2
2,3
3,4
4,5
5,2


In [87]:
# data view
print(dfX.shape) # (5, 1) 2D, 5 data, 1 kolom
print(dfX.index)
print(dfX.columns)

(6, 1)
RangeIndex(start=0, stop=6, step=1)
Index(['X'], dtype='object')


In [88]:
type(dfX)

pandas.core.frame.DataFrame

In [89]:
type(dfX['X'])

pandas.core.series.Series

In [91]:
print(dfX['X'].min())
print(dfX['X'].max())
print(dfX['X'].mean())
print(dfX['X'].median())
print(dfX['X'].mode()[0])

1
5
2.8333333333333335
2.5
2


In [96]:
x = {
    "X": [1, 2, 3, 4, 5],
    "Y": [2, 4, 6, 8, 10]
}
dfX = pd.DataFrame(x)
dfX.describe()

Unnamed: 0,X,Y
count,5.0,5.0
mean,3.0,6.0
std,1.581139,3.162278
min,1.0,2.0
25%,2.0,4.0
50%,3.0,6.0
75%,4.0,8.0
max,5.0,10.0


## Create DataFrame from Files

In [101]:
# read excel, need openpyxl module
# ! python -m pip install openpyxl
dataXlsx = pd.read_excel('data1xlsx.xlsx')
dataXlsx

Unnamed: 0,No,Nama,Usia,Kota
0,1,Andi,21,Jakarta
1,2,Budi,22,Bandung
2,3,Caca,23,Sukabumi
3,4,Dewi,24,Surabaya
4,5,Euis,25,Jayapura


In [102]:
dataXlsx = pd.read_excel('data1xlsx.xlsx', 'Sheet2')
dataXlsx

Unnamed: 0,No,Nama,Usia,Kota
0,1,Fafa,21,Jakarta
1,2,Gigih,22,Bandung
2,3,Hani,23,Sukabumi
3,4,Ijat,24,Surabaya
4,5,Janu,25,Jayapura


In [103]:
dataCsv = pd.read_csv('data1csv.csv')
dataCsv

Unnamed: 0,No,Nama,Usia,Kota
0,1,Fafa,21,Jakarta
1,2,Gigih,22,Bandung
2,3,Hani,23,Sukabumi
3,4,Ijat,24,Surabaya
4,5,Janu,25,Jayapura


In [109]:
dataJson = pd.read_json('data1json.json')
dataJson

Unnamed: 0,nama,usia,kota
0,Andi,20,Jakarta
1,Budi,21,Bandung
2,Caca,22,Sukabumi
3,Deni,23,Bogor
4,Euis,24,Jayapura


In [110]:
dataJson.describe()

Unnamed: 0,usia
count,5.0
mean,22.0
std,1.581139
min,20.0
25%,21.0
50%,22.0
75%,23.0
max,24.0


In [113]:
dfj = dataJson
dfj

Unnamed: 0,nama,usia,kota
0,Andi,20,Jakarta
1,Budi,21,Bandung
2,Caca,22,Sukabumi
3,Deni,23,Bogor
4,Euis,24,Jayapura


## Filtering

In [120]:
# tampilkan data-data pada kolom nama
# dfj['nama']  # output: series
dfj[['nama']]  # output: dataframe

Unnamed: 0,nama
0,Andi
1,Budi
2,Caca
3,Deni
4,Euis


In [125]:
# tampilkan nama-nama yang usianya >= 22
dfj[['nama', 'usia']][dfj['usia'] >= 22]

Unnamed: 0,nama,usia
2,Caca,22
3,Deni,23
4,Euis,24


In [127]:
dfj[dfj['usia'] >= 22][['nama', 'usia']]

Unnamed: 0,nama,usia
2,Caca,22
3,Deni,23
4,Euis,24


In [135]:
# tampilkan nama-nama yang usianya di bawah rata2 data pada json tsb
dfj[['nama', 'usia']][dfj['usia'] < dfj['usia'].mean()]

Unnamed: 0,nama,usia
0,Andi,20
1,Budi,21


In [139]:
# tampilkan nama orang yang usianya > 20 & asal kota Bogor
dfj[dfj['usia'] > 20][dfj['kota'] == 'Bogor']

  dfj[dfj['usia'] > 20][dfj['kota'] == 'Bogor']


Unnamed: 0,nama,usia,kota
3,Deni,23,Bogor


## Handling Missing Data

In [185]:
df = pd.read_csv('data2csv.csv')
df

Unnamed: 0,No,Nama,Usia,Kota
0,1,Fafa,21.0,Jakarta
1,2,,22.0,Bandung
2,3,Hani,23.0,
3,4,Ijat,,Surabaya
4,5,Janu,25.0,Jayapura


In [155]:
df['Usia'].mean()

18.2

In [156]:
type(df['Nama'][1])

float

In [157]:
# check data yang null
df.isnull()

Unnamed: 0,No,Nama,Usia,Kota
0,False,False,False,False
1,False,True,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False


In [158]:
df.isnull().sum()

No      0
Nama    1
Usia    0
Kota    0
dtype: int64

In [182]:
# handle nama yang kosong:
# 1. cari nama yang benar!
# 2. teknis: isi dengan label kategory: "Anonymous"
# 3. hapus!
df = df.replace([np.nan], 'Zizi')
df

Unnamed: 0,No,Nama,Usia,Kota
0,1,Fafa,21,Jakarta
1,2,Zizi,22,Bandung
2,3,Hani,23,-
3,4,Ijat,0,Surabaya
4,5,Janu,25,Jayapura


In [179]:
import numpy as np
df['Nama'][1] == np.nan

False

In [180]:
np.nan == np.nan

False

In [186]:
## beberapa metode replace nan value!
# 1. replace np.nan need numpy
df1 = df.replace([np.nan], 'Zizi')
df1

Unnamed: 0,No,Nama,Usia,Kota
0,1,Fafa,21.0,Jakarta
1,2,Zizi,22.0,Bandung
2,3,Hani,23.0,Zizi
3,4,Ijat,Zizi,Surabaya
4,5,Janu,25.0,Jayapura


In [188]:
# 2. replace per kolom
df2 = df.replace({
    'Nama': np.nan,
    'Usia': np.nan,  
    'Kota': np.nan,
}, {
    'Nama': 'Zizi',
    'Usia': 20,  
    'Kota': 'Jakarta',
})
df2

Unnamed: 0,No,Nama,Usia,Kota
0,1,Fafa,21.0,Jakarta
1,2,Zizi,22.0,Bandung
2,3,Hani,23.0,Jakarta
3,4,Ijat,20.0,Surabaya
4,5,Janu,25.0,Jayapura


In [193]:
# 3. replace dengan fillna
# ffill = forward filling, mengisi data kosong dg data sebelumnya
# bfill = backward filling, mengisi data kosong dg data setelahnya

# df3 = df.fillna(method='ffill', axis='columns') 
# isi data kosong dg data di kolom sblmnya

df3 = df.fillna(method='bfill', axis='index') 
# isi data kosong dg data di baris setelahnya

df3

Unnamed: 0,No,Nama,Usia,Kota
0,1,Fafa,21.0,Jakarta
1,2,Hani,22.0,Bandung
2,3,Hani,23.0,Surabaya
3,4,Ijat,25.0,Surabaya
4,5,Janu,25.0,Jayapura


In [197]:
# 4. isi data kosong dg interpolasi
# isi data dengan nilai tengah antara nilai sebelum & setelah
df4 = df.interpolate()
df4

Unnamed: 0,No,Nama,Usia,Kota
0,1,Fafa,21.0,Jakarta
1,2,,22.0,Bandung
2,3,Hani,23.0,
3,4,Ijat,24.0,Surabaya
4,5,Janu,25.0,Jayapura


Tugas: 
1. Soal 2 - 👨‍🎓 Kerja Kerja Kerja (https://github.com/LintangWisesa/Ujian_AnalyticsVisualization_JCDS07)
2. Soal 2 - World Happiness (https://github.com/LintangWisesa/Ujian_AnalyticsVisualization_JCDS08)