In [1]:
import pandas as pd
T_df = pd.read_csv("./titanic_train.csv")

## [ Aggregation 함수 ]

#### count :  NaN 값은 제외하고 count 한다.

In [2]:
T_df.count()

PassengerId    891
Survived       891
Pclass         891
Name           891
Sex            891
Age            714
SibSp          891
Parch          891
Ticket         891
Fare           891
Cabin          204
Embarked       889
dtype: int64

#### 특정 컬럼들로 mean 함수 수행 : 평균 값

In [3]:
T_df[["Age", "Fare"]].mean(axis=1)

0      14.62500
1      54.64165
2      16.96250
3      44.05000
4      21.52500
         ...   
886    20.00000
887    24.50000
888    23.45000
889    28.00000
890    19.87500
Length: 891, dtype: float64

#### 특정 컬럼들로 sum 함수 수행 : 합

In [6]:
T_df[["Age", "Fare"]].sum(axis=0)

Age     21205.1700
Fare    28693.9493
dtype: float64

#### 특정 컬럼들로 count 수행

In [7]:
T_df[["Age", "Fare"]].count()

Age     714
Fare    891
dtype: int64

## [ GroupBy ]

**groupby()의 by 인자에 컬럼을 입력하면서 DataFrame에 groupby()를 호출하면 DataFrameGroupBy 객체 반환**

#### DataFrameGroupBy 객체 만들기

In [9]:
T_gb = T_df.groupby(by="Pclass")

print(type(T_gb))
print(T_gb)

<class 'pandas.core.groupby.generic.DataFrameGroupBy'>
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000240901D6160>


#### DataFrameGroupBy 객체에 Aggregation 함수를 호출 해 Group by 수행

In [10]:
T_gb = T_df.groupby("Pclass").count()
T_gb

Unnamed: 0_level_0,PassengerId,Survived,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,216,216,216,216,186,216,216,216,216,176,214
2,184,184,184,184,173,184,184,184,184,16,184
3,491,491,491,491,355,491,491,491,491,12,491


#### 위의 T_gb의 type, shape, index

In [12]:
print("T_gb의 type:", type(T_gb))
print("T_gb의 shape: ", T_gb.shape)
print("T_gb의 index 객체:", T_gb.index)

T_gb의 type: <class 'pandas.core.frame.DataFrame'>
T_gb의 shape:  (3, 11)
T_gb의 index 객체: Int64Index([1, 2, 3], dtype='int64', name='Pclass')


#### groupby 예시 1 - groupby 후,  특정 컬럼 데이터 추출 후, Aggregation count 수행

In [14]:
T_gb = T_df.groupby(by="Pclass")[["PassengerId", "Survived"]].count()
T_gb

Unnamed: 0_level_0,PassengerId,Survived
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1,216,216
2,184,184
3,491,491


#### groupby 예시 2 - groupby 후, 단일 컬럼 데이터 추출 후, Aggregation count 수행

In [16]:
T_df.groupby("Pclass")["Pclass"].count()
T_df["Pclass"].value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

#### agg() : 여러 개의 Aggregation 함수를 적용할 수 있도록 한다.

#### *agg() 예시 1*

In [17]:
T_df.groupby("Pclass")["Age"].agg([max, min])

Unnamed: 0_level_0,max,min
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1,80.0,0.92
2,70.0,0.67
3,74.0,0.42


#### *agg() 예시 2 - dictionary 활용*

In [18]:
Agg_format = {"Age":"max", "SibSp":"sum", "Fare":"mean"}
T_df.groupby("Pclass").agg(Agg_format)

Unnamed: 0_level_0,Age,SibSp,Fare
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,80.0,90,84.154687
2,70.0,74,20.662183
3,74.0,302,13.67555


## [ Missing data 처리 ]

#### isna() : 컬럼이 NaN인 지 True/False 값을 반환한다. (NaN이면 True)

In [19]:
T_df.isna().head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,False,False,False,False,False,False,False,False,False,False,True,False
1,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,True,False


#### isna() 반환 결과에 sum()을 호출 : 컬럼별로 NaN 건수 구할 수 있음

In [20]:
T_df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

#### fillna() : missing data 대체하는 메소드

#### *fillna 예시 1*

In [21]:
T_df["Cabin"] = T_df["Cabin"].fillna("C000")
T_df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,C000,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,C000,S


#### *fillna 예시 2*

In [22]:
# NaN에 평균값 넣기
T_df["Age"] = T_df["Age"].fillna(T_df["Age"].mean())

# NaN에 'S' 넣기
T_df["Embarked"] = T_df["Embarked"].fillna('S')

# print
T_df.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

## [ apply lambda ]

#### 파이썬에서 lambda 쓰지 않은 예제

In [23]:
def get_square(a):
    return a**2

print("3의 제곱은: ", get_square(3))

3의 제곱은:  9


#### 파이썬에서 lambda 쓰는 예제 1

In [24]:
Lambda_S = lambda x : x ** 2

print("3의 제곱은: ", Lambda_S(3))

3의 제곱은:  9


#### 파이썬에서 lambda 쓰는 예제 2

In [25]:
A = [1,2,3]
Sq = map(lambda x : x**2, A)
list(Sq)

[1, 4, 9]

#### 판다스에서 apply lambda 쓰는 예제 1 - Name_len

In [26]:
T_df["Name_len"] = T_df["Name"].apply(lambda x : len(x))
T_df[["Name", "Name_len"]].head(3)

Unnamed: 0,Name,Name_len
0,"Braund, Mr. Owen Harris",23
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",51
2,"Heikkinen, Miss. Laina",22


#### 판다스에서 apply lambda 쓰는 예제 2 - Child_Adult

In [27]:
T_df["Child_Adult"] = T_df["Age"].apply(lambda x : 'Child' if x <=15 else 'Adult' )
T_df[["Age", "Child_Adult"]]

Unnamed: 0,Age,Child_Adult
0,22.000000,Adult
1,38.000000,Adult
2,26.000000,Adult
3,35.000000,Adult
4,35.000000,Adult
...,...,...
886,27.000000,Adult
887,19.000000,Adult
888,29.699118,Adult
889,26.000000,Adult


#### 판다스에서 apply lambda 쓰는 예제 3 - Age_cat

In [28]:
T_df["Age_cat"] = T_df["Age"].apply(lambda x : 'Child' if x<=15 else ('Adult' if x <= 60 else'Elderly'))
T_df["Age_cat"].value_counts()

Adult      786
Child       83
Elderly     22
Name: Age_cat, dtype: int64

#### 판다스에서 apply lambda 쓰는 예제 4 - Age_cat

In [29]:
def get_category(age):
    cat = ''
    if age <= 5: cat = "Baby"
    elif age <= 12: cat = "Child"
    elif age <= 18: cat = "Teenager"
    elif age <= 25: cat = "Student"
    elif age <= 35: cat = "Young Adult"
    elif age <= 60: cat = "Adult"
    else : cat = "Elderly"
    
    return cat

T_df["Age_cat"] = T_df["Age"].apply(lambda x : get_category(x))
T_df[["Age", "Age_cat"]].head()

Unnamed: 0,Age,Age_cat
0,22.0,Student
1,38.0,Adult
2,26.0,Young Adult
3,35.0,Young Adult
4,35.0,Young Adult
