### Create DataFrame

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Sample data: six people, with one missing weight (Rose) and one missing city (Joe).
people = {
    "name": ["Anna", "Dane", "David", "Kevin", "Joe", "Rose"],
    "weight": [45.2, 50.5, 62.1, 64.0, 59.0, np.nan],
    "height": [155, 160, 162, 170, 167, 159],
    "birth_year": [2000, 1997, 1996, 1987, 1998, 2003],
    "gender": ["female", "male", "male", "male", "male", "female"],
    "city": ["New York", "Chicago", "Austin", "New York", np.nan, "New York"],
}
df = pd.DataFrame(people)
df

Unnamed: 0,name,weight,height,birth_year,gender,city
0,Anna,45.2,155,2000,female,New York
1,Dane,50.5,160,1997,male,Chicago
2,David,62.1,162,1996,male,Austin
3,Kevin,64.0,170,1987,male,New York
4,Joe,59.0,167,1998,male,
5,Rose,,159,2003,female,New York


### DataFrame Basic Info

In [3]:
# Print the index range, per-column non-null counts and dtypes, and memory usage;
# the non-null counts reveal which columns contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   name        6 non-null      object 
 1   weight      5 non-null      float64
 2   height      6 non-null      int64  
 3   birth_year  6 non-null      int64  
 4   gender      6 non-null      object 
 5   city        5 non-null      object 
dtypes: float64(1), int64(2), object(3)
memory usage: 416.0+ bytes


### Basic Summary Statistics For Numerical Columns

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# `count` excludes missing values, so it can differ between columns.
df.describe()

Unnamed: 0,weight,height,birth_year
count,5.0,6.0,6.0
mean,56.16,162.166667,1996.833333
std,8.015173,5.492419,5.419102
min,45.2,155.0,1987.0
25%,50.5,159.25,1996.25
50%,59.0,161.0,1997.5
75%,62.1,165.75,1999.5
max,64.0,170.0,2003.0


### Reorder The Columns

In [5]:
# Display the columns in a different order; this returns a new frame and leaves `df` unchanged.
column_order = ["name", "birth_year", "height", "weight", "city", "gender"]
df.loc[:, column_order]

Unnamed: 0,name,birth_year,height,weight,city,gender
0,Anna,2000,155,45.2,New York,female
1,Dane,1997,160,50.5,Chicago,male
2,David,1996,162,62.1,Austin,male
3,Kevin,1987,170,64.0,New York,male
4,Joe,1998,167,59.0,,male
5,Rose,2003,159,,New York,female


### Select Men Born After 1996

In [6]:
# Row filter: both conditions must hold, combined element-wise with `&`.
is_male = df["gender"].eq("male")
born_after_1996 = df["birth_year"].gt(1996)  # strictly after 1996, so 1996 itself is excluded
df[is_male & born_after_1996]

Unnamed: 0,name,weight,height,birth_year,gender,city
1,Dane,50.5,160,1997,male,Chicago
4,Joe,59.0,167,1998,male,


### Select Women With Height > 157

In [7]:
# Row filter: both conditions must hold, combined element-wise with `&`.
is_female = df["gender"].eq("female")
taller_than_157 = df["height"].gt(157)  # strict comparison, 157 itself is excluded
df[is_female & taller_than_157]

Unnamed: 0,name,weight,height,birth_year,gender,city
5,Rose,,159,2003,female,New York


### Fill Missing Weight With The Mean Weight Of The Same Gender

In [8]:
import statistics as stat

In [9]:
# Female rows that have a recorded weight (rows with a missing weight are excluded).
female_with_weight = df["gender"].eq("female") & df["weight"].notna()
df2 = df.loc[female_with_weight]
df2

Unnamed: 0,name,weight,height,birth_year,gender,city
0,Anna,45.2,155,2000,female,New York


In [10]:
# Impute each missing weight with the mean weight of that row's own gender.
# groupby(...).transform("mean") returns a Series aligned to df's index that holds,
# for every row, the mean of its gender group (missing weights are skipped when
# averaging). fillna therefore picks the matching value per row instead of applying
# one gender's mean to every missing entry.
# If a gender group has no recorded weight at all, its mean is NaN and the gap stays unfilled.
gender_mean_weight = df.groupby("gender")["weight"].transform("mean")
df["weight"] = df["weight"].fillna(gender_mean_weight)
df

Unnamed: 0,name,weight,height,birth_year,gender,city
0,Anna,45.2,155,2000,female,New York
1,Dane,50.5,160,1997,male,Chicago
2,David,62.1,162,1996,male,Austin
3,Kevin,64.0,170,1987,male,New York
4,Joe,59.0,167,1998,male,
5,Rose,45.2,159,2003,female,New York


### Fill NaN Of City With Mode

In [11]:
# Impute missing cities with the most frequent city.
# mode() returns a Series (there can be ties); take its first entry.
most_common_city = df["city"].mode().iloc[0]
df["city"] = df["city"].fillna(most_common_city)
df

Unnamed: 0,name,weight,height,birth_year,gender,city
0,Anna,45.2,155,2000,female,New York
1,Dane,50.5,160,1997,male,Chicago
2,David,62.1,162,1996,male,Austin
3,Kevin,64.0,170,1987,male,New York
4,Joe,59.0,167,1998,male,New York
5,Rose,45.2,159,2003,female,New York


### Check Outliers

#### Z-Score

In [12]:
import scipy.stats as stats

In [13]:
# Flag rows whose weight lies at least 3 standard deviations from the mean.
# NOTE(review): zscore uses the population std (ddof=0) by default, for which |z| is
# bounded by sqrt(n - 1); with the 6 rows here that is about 2.24, so a threshold of 3
# cannot flag anything on this sample.
weight_z = stats.zscore(df["weight"])
df[np.abs(weight_z) >= 3]

Unnamed: 0,name,weight,height,birth_year,gender,city


#### IQR

In [14]:
# IQR rule: flag weights lying more than 1.5 * IQR below the first quartile
# or above the third quartile.
weight = df["weight"]
q1, q3 = weight.quantile([0.25, 0.75])

iqr = q3 - q1  # interquartile range
fence_low = q1 - 1.5 * iqr
fence_high = q3 + 1.5 * iqr

# Strict comparisons: values exactly on a fence are not treated as outliers.
below_fence = weight.lt(fence_low)
above_fence = weight.gt(fence_high)
df.loc[below_fence | above_fence]

Unnamed: 0,name,weight,height,birth_year,gender,city


No outliers were found in `weight`: neither the z-score check nor the IQR check flags any row.