In [7]:
# DataFrame에서 중복을 처리하려면 어떻게 해야 하나요?
# 일반적으로 중복데이터인 경우에는 삭제하는게 보통!
# 특이한 경우 중복데이터를 그냥 사용하는 경우도 있는데 -> 특이한 케이스 (일반적 X)
import numpy as np
import pandas as pd

data = {
    'c1' : ['a', 'a', 'b', 'a', 'c'],
    'c2' : [1, 1, 1, 2, 3],
    'c3' : [1, 1, 2, 2, 2]
}

df = pd.DataFrame(data)
display(df)
print()

print(df.loc[2, ['c1', 'c2']])

# Boolean Mask를 생성할 수 있어요!
new_df = df.duplicated()
display(new_df)
print()

# 이 duplicated()함수를 Series에도 적용할 수 있어요!
my_col = df['c2'].duplicated()
display(my_col)
print()

# 중복데이터를 그냥 제거해주는 함수도 있어요!
new_df = df.drop_duplicates()
display(new_df)
print()

# subset을 사용하여 기준을 정해줄 수 있어요!
new_df = df.drop_duplicates(subset=['c2', 'c3'])
display(new_df)
print()

Unnamed: 0,c1,c2,c3
0,a,1,1
1,a,1,1
2,b,1,2
3,a,2,2
4,c,3,2



c1    b
c2    1
Name: 2, dtype: object


Unnamed: 0,0
0,False
1,True
2,False
3,False
4,False





Unnamed: 0,c2
0,False
1,True
2,True
3,False
4,False





Unnamed: 0,c1,c2,c3
0,a,1,1
2,b,1,2
3,a,2,2
4,c,3,2





Unnamed: 0,c1,c2,c3
0,a,1,1
2,b,1,2
3,a,2,2
4,c,3,2





In [12]:
# Binning 처리 (구간 분할 처리)
# MPG Data set을 예로 처리해보아요!
import numpy as np
import pandas as pd

df = pd.read_csv('/content/drive/MyDrive/KDT/data/auto-mpg.csv',
                 header=None)
df.columns = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'moder_year', 'origin', 'name']
display(df.head())
df.info()
print()

# 해당 Series(df['horepower'])에 숫자가 아닌 값이 있는지 확인해 보아요!
# 현재 데이터 타입이 object 로 되어 있어서 정규식을 이용해서 처리해야할듯 보여요!
# 사실 horsepower에 '?' 문자가 들어있어요!
# horsepower를 알 수 없어서 대신 '?'문자를 데이터에 넣어놓은건데
# 이건 어떻게 처리해야 하나요?
df['horsepower'].replace('?', np.nan, inplace=True)
df.dropna(subset=['horsepower'], axis=0, inplace=True)
df['horsepower'] = df['horsepower'].astype('float')
df.info()
print()

# 이제 이 자동차의 마력(출력)을 Binning 처리 해보아요!
# 저출력, 보통 출력, 고출력의 형태로 Binning 처리를 해보아요!
# 이런 Binning 처리할 때 일단 NumPy가 가지고 있는 histogram()을 이용하면 편해요.
# 특정 구간에 데이터가 몇개 있는지 알려줘요!
count, divider = np.histogram(df['horsepower'], bins=3)
print(count)
print(divider)

bin_name = ['저출력', '보통 출력', '고출력']
df['hp_cut'] = pd.cut(x=df['horsepower'], bins=divider,
                      labels=bin_name, include_lowest=True)
display(df)
print()

# 이렇게 category 데이터 형태로 기존 데이터를 변형시킬수 있어요!
# 여기서 하나 더 추가적으로 작업해야 하는 경우가 있어요! -> one-hot encoding
horsepower_dummies = pd.get_dummies(df['hp_cut']) # Series => DataFrame
display(horsepower_dummies)



Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,moder_year,origin,name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    float64
 5   acceleration  398 non-null    float64
 6   moder_year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   name          398 non-null    object 
dtypes: float64(4), int64(3), object(2)
memory usage: 28.1+ KB

<class 'pandas.core.frame.DataFrame'>
Index: 392 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           392 non-null    float64
 1   cylinders     392 non-null    int64  
 2   displacement  392 non-null    float64
 3   horsepower    392 non-null    float64
 4

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['horsepower'].replace('?', np.nan, inplace=True)


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,moder_year,origin,name,hp_cut
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu,보통 출력
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320,보통 출력
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite,보통 출력
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst,보통 출력
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino,보통 출력
...,...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790.0,15.6,82,1,ford mustang gl,저출력
394,44.0,4,97.0,52.0,2130.0,24.6,82,2,vw pickup,저출력
395,32.0,4,135.0,84.0,2295.0,11.6,82,1,dodge rampage,저출력
396,28.0,4,120.0,79.0,2625.0,18.6,82,1,ford ranger,저출력





Unnamed: 0,저출력,보통 출력,고출력
0,False,True,False
1,False,True,False
2,False,True,False
3,False,True,False
4,False,True,False
...,...,...,...
393,True,False,False
394,True,False,False
395,True,False,False
396,True,False,False


In [None]:
# 데이터 정규화(Featrue Scaling) : 각 컬럼의 데이터의 값이 차이가 많이 날 경우 큰 값에 더 가중치가 실리기 때문에 각 컬럼의 데이터를 정규화 해야해요!
# Min-Max Scaling : 최대값과 최소값을 이용하여 각 데이터를 0~1의 실수로 정규화 하는 방법! (X - X_min) / (X_max - X_min)
# Min-Max Scaling이ㅡ 경우 최대, 최소값을 이용하기 때문에 이상치에 매우 민감해요!
import numpy as np
import pandas as pd

df = pd.read_csv('/content/drive/MyDrive/KDT/data/auto-mpg.csv',
                 header=None)
df.columns = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'moder_year', 'origin', 'name']

df['horsepower'].replace('?', np.nan, inplace=True)
df.dropna(subset=['horsepower'], axis=0, inplace=True)
df['horsepower'] = df['horsepower'].astype('float')
display(df['horsepower'])
print()

# 해당 컬럼을 min-max scaling을 이용해서 정규화 시킬거에요!
max_min = df['horsepower'].max() - df['horsepower'].min()
df['horsepower_norm'] = (df['horsepower'] - df['horsepower'].min()) / max_min
display(df)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['horsepower'].replace('?', np.nan, inplace=True)


Unnamed: 0,horsepower
0,130.0
1,165.0
2,150.0
3,150.0
4,140.0
...,...
393,86.0
394,52.0
395,84.0
396,79.0





Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,moder_year,origin,name,horsepower_norm
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu,0.456522
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320,0.646739
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite,0.565217
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst,0.565217
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino,0.510870
...,...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790.0,15.6,82,1,ford mustang gl,0.217391
394,44.0,4,97.0,52.0,2130.0,24.6,82,2,vw pickup,0.032609
395,32.0,4,135.0,84.0,2295.0,11.6,82,1,dodge rampage,0.206522
396,28.0,4,120.0,79.0,2625.0,18.6,82,1,ford ranger,0.179348


In [None]:
# DataFrame을 결합할 수 있어요!
# merge()를 이용해서 결합해요! => Database join

data1 = {
    '학번' : [1, 2, 3, 4],
    '이름' : ['홍길동', '신사임당', '강감찬', '이순신'],
    '학년' : [1, 3, 2, 4]
}

data2 = {
    '학번' : [1, 2, 4, 5],
    '학과' : ['철학', '국어국문', '기계', '컴퓨터'],
    '학점' : [3.1, 2.5, 1.6, 4.5]
}

df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)
display(df1, df2)
print()

result1 = pd.merge(df1, df2, on='학번', how='inner')
result2 = pd.merge(df1, df2, on='학번', how='outer')
result3 = pd.merge(df1, df2, on='학번', how='left')
display(result1, result2, result3)

Unnamed: 0,학번,이름,학년
0,1,홍길동,1
1,2,신사임당,3
2,3,강감찬,2
3,4,이순신,4


Unnamed: 0,학번,학과,학점
0,1,철학,3.1
1,2,국어국문,2.5
2,4,기계,1.6
3,5,컴퓨터,4.5





Unnamed: 0,학번,이름,학년,학과,학점
0,1,홍길동,1,철학,3.1
1,2,신사임당,3,국어국문,2.5
2,4,이순신,4,기계,1.6


Unnamed: 0,학번,이름,학년,학과,학점
0,1,홍길동,1.0,철학,3.1
1,2,신사임당,3.0,국어국문,2.5
2,3,강감찬,2.0,,
3,4,이순신,4.0,기계,1.6
4,5,,,컴퓨터,4.5


Unnamed: 0,학번,이름,학년,학과,학점
0,1,홍길동,1,철학,3.1
1,2,신사임당,3,국어국문,2.5
2,3,강감찬,2,,
3,4,이순신,4,기계,1.6


In [None]:
data1 = {
    '학번' : [1, 2, 3, 4],
    '이름' : ['홍길동', '신사임당', '강감찬', '이순신'],
    '학년' : [1, 3, 2, 4]
}

data2 = {
    '학생학번' : [1, 2, 4, 5],
    '학과' : ['철학', '국어국문', '기계', '컴퓨터'],
    '학점' : [3.1, 2.5, 1.6, 4.5]
}

df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)

# 컬럼명이 다른 경우에는 어떻게 해야하나요?
result = pd.merge(df1, df2, left_on='학번', right_on='학생학번')
display(result)

Unnamed: 0,학번,이름,학년,학생학번,학과,학점
0,1,홍길동,1,1,철학,3.1
1,2,신사임당,3,2,국어국문,2.5
2,4,이순신,4,4,기계,1.6


In [None]:
data1 = {
    '학번' : [1, 2, 3, 4],
    '이름' : ['홍길동', '신사임당', '강감찬', '이순신'],
    '학년' : [1, 3, 2, 4]
}

data2 = {
    '학과' : ['철학', '국어국문', '기계', '컴퓨터'],
    '학점' : [3.1, 2.5, 1.6, 4.5]
}

df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2, index=[1, 2, 4, 5])
display(df1, df2)
print()

result = pd.merge(df1, df2, left_on='학번', right_index=True, how='inner')
display(result)

Unnamed: 0,학번,이름,학년
0,1,홍길동,1
1,2,신사임당,3
2,3,강감찬,2
3,4,이순신,4


Unnamed: 0,학과,학점
1,철학,3.1
2,국어국문,2.5
4,기계,1.6
5,컴퓨터,4.5





Unnamed: 0,학번,이름,학년,학과,학점
0,1,홍길동,1,철학,3.1
1,2,신사임당,3,국어국문,2.5
3,4,이순신,4,기계,1.6


In [None]:
# titanic 데이터를 이용해서 함수 mapping과
# grouping 하는 내용에 대해서 알아보아요!

import numpy as np
import pandas as pd
import seaborn as sns

# Raw data Loading
titanic = sns.load_dataset('titanic')
display(titanic.head())
print()

# 모든 행에 대해 age, fare 컬럼만 들고 올거에요!
df = titanic.loc[:, ['age', 'fare']]
display(df.head())
print()

# 사용자 정의 함수를 하나 만들어요!
def add_10(n):
    return n + 10

def add_two_obj(a, b):
    return a + b

# age 열의 모든 행에 대해서 함수를 적용해서 10씩 증가
# Series에 특정 함수를 적용하려면 어떻게 해야하나요?
s1 = df['age'].apply(add_10)
display(s1.head())
print()

s2 = df['age'].apply(add_two_obj, b=100)
display(s2.head())
print()

s3 = df['age'].apply(lambda x : x + 30)
display(s3.head())
print()

# Series 말고 DataFrame에 함수를 적용하려면 어떻게 해야하나요?
result = df.applymap(add_10)
display(result.head())

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True





Unnamed: 0,age,fare
0,22.0,7.25
1,38.0,71.2833
2,26.0,7.925
3,35.0,53.1
4,35.0,8.05





Unnamed: 0,age
0,32.0
1,48.0
2,36.0
3,45.0
4,45.0





Unnamed: 0,age
0,122.0
1,138.0
2,126.0
3,135.0
4,135.0





Unnamed: 0,age
0,52.0
1,68.0
2,56.0
3,65.0
4,65.0





  result = df.applymap(add_10)


Unnamed: 0,age,fare
0,32.0,17.25
1,48.0,81.2833
2,36.0,17.925
3,45.0,63.1
4,45.0,18.05


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

# Raw data Loading
titanic = sns.load_dataset('titanic')
display(titanic.head())
print()

# 모든 행에 대해 age, fare 컬럼만 들고 올거에요!
df = titanic.loc[:, ['age', 'fare']]
display(df.head())
print()

# 정규화(Normalization) - 최대, 최소를 이용한 정규화
# MinMaxScaling을 구현래 보아요!

# 사용자 정의 함수
def min_max(s):
    return (s - s.min()) / (s.max() - s.min())

result1 = df.apply(min_max, axis=0)
display(result1.head())

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True





Unnamed: 0,age,fare
0,22.0,7.25
1,38.0,71.2833
2,26.0,7.925
3,35.0,53.1
4,35.0,8.05





Unnamed: 0,age,fare
0,0.271174,0.014151
1,0.472229,0.139136
2,0.321438,0.015469
3,0.434531,0.103644
4,0.434531,0.015713


In [None]:
import pandas as pd
import seaborn as sns

titanic = sns.load_dataset('titanic')
df = titanic.loc[:,['age', 'fare']]

display(df.head())

# 사용자 함수(정규화)
def min_max(s):
    return (s - s.min()) / (s.max() - s.min())

# apply로 적용할 때 axis=0의 의미는 모든 열을 분리하여
# 함수를 적용시키라는 의미.
result1 = df.apply(min_max, axis=1)
display(result1.head())

# 사용자 함수
def add_two_obj(a,b):
    return a + b

df['add'] = df.apply(lambda x: add_two_obj(x['age'], x['fare']), axis=1)
display(df.head())

Unnamed: 0,age,fare
0,22.0,7.25
1,38.0,71.2833
2,26.0,7.925
3,35.0,53.1
4,35.0,8.05


Unnamed: 0,age,fare
0,1.0,0.0
1,0.0,1.0
2,1.0,0.0
3,0.0,1.0
4,1.0,0.0


Unnamed: 0,age,fare,add
0,22.0,7.25,29.25
1,38.0,71.2833,109.2833
2,26.0,7.925,33.925
3,35.0,53.1,88.1
4,35.0,8.05,43.05


In [None]:
# DataFrame의 Grouping
import numpy as np
import pandas as pd
import seaborn as sns

titanic = sns.load_dataset('titanic')

# 필요한 몇개의 컬럼만 추려서 사용할거에요!
df = titanic.loc[:, ['age', 'sex', 'class', 'fare', 'survived']]
display(df.head())
print()

grouped = df.groupby('class')
print(grouped) # 그룹 객체가 생성되요!
print()

for (key, group) in grouped:
    print(key)
    display(group)
print()

# 개별 그룹을 가져오는 것부터 알아보아요!
group3 = grouped.get_group('Third')
display(group3.head())

Unnamed: 0,age,sex,class,fare,survived
0,22.0,male,Third,7.25,0
1,38.0,female,First,71.2833,1
2,26.0,female,Third,7.925,1
3,35.0,female,First,53.1,1
4,35.0,male,Third,8.05,0



<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7b247e15bad0>

First


  grouped = df.groupby('class')


Unnamed: 0,age,sex,class,fare,survived
1,38.0,female,First,71.2833,1
3,35.0,female,First,53.1000,1
6,54.0,male,First,51.8625,0
11,58.0,female,First,26.5500,1
23,28.0,male,First,35.5000,1
...,...,...,...,...,...
871,47.0,female,First,52.5542,1
872,33.0,male,First,5.0000,0
879,56.0,female,First,83.1583,1
887,19.0,female,First,30.0000,1


Second


Unnamed: 0,age,sex,class,fare,survived
9,14.0,female,Second,30.0708,1
15,55.0,female,Second,16.0000,1
17,,male,Second,13.0000,1
20,35.0,male,Second,26.0000,0
21,34.0,male,Second,13.0000,1
...,...,...,...,...,...
866,27.0,female,Second,13.8583,1
874,28.0,female,Second,24.0000,1
880,25.0,female,Second,26.0000,1
883,28.0,male,Second,10.5000,0


Third


Unnamed: 0,age,sex,class,fare,survived
0,22.0,male,Third,7.2500,0
2,26.0,female,Third,7.9250,1
4,35.0,male,Third,8.0500,0
5,,male,Third,8.4583,0
7,2.0,male,Third,21.0750,0
...,...,...,...,...,...
882,22.0,female,Third,10.5167,0
884,25.0,male,Third,7.0500,0
885,39.0,female,Third,29.1250,0
888,,female,Third,23.4500,0





Unnamed: 0,age,sex,class,fare,survived
0,22.0,male,Third,7.25,0
2,26.0,female,Third,7.925,1
4,35.0,male,Third,8.05,0
5,,male,Third,8.4583,0
7,2.0,male,Third,21.075,0


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

titanic = sns.load_dataset('titanic')

# 필요한 몇개의 컬럼만 추려서 사용할거에요!
df = titanic.loc[:, ['age', 'sex', 'class', 'fare', 'survived']]
display(df.head())
print()

# group을 조금 복잡하게 설정할 거에요!
# group안에 다시 group을 만들거에요!
grouped = df.groupby(['class', 'sex']) # class 3개 * sex 2개 = group 6개

for key, group in grouped:
    print(key)
    display(group.head())
print()

group3 = grouped.get_group(('Third', 'female'))
display(group3.head())
print()

result = grouped.mean()
display(result)

Unnamed: 0,age,sex,class,fare,survived
0,22.0,male,Third,7.25,0
1,38.0,female,First,71.2833,1
2,26.0,female,Third,7.925,1
3,35.0,female,First,53.1,1
4,35.0,male,Third,8.05,0



('First', 'female')


  grouped = df.groupby(['class', 'sex']) # class 3개 * sex 2개 = group 6개


Unnamed: 0,age,sex,class,fare,survived
1,38.0,female,First,71.2833,1
3,35.0,female,First,53.1,1
11,58.0,female,First,26.55,1
31,,female,First,146.5208,1
52,49.0,female,First,76.7292,1


('First', 'male')


Unnamed: 0,age,sex,class,fare,survived
6,54.0,male,First,51.8625,0
23,28.0,male,First,35.5,1
27,19.0,male,First,263.0,0
30,40.0,male,First,27.7208,0
34,28.0,male,First,82.1708,0


('Second', 'female')


Unnamed: 0,age,sex,class,fare,survived
9,14.0,female,Second,30.0708,1
15,55.0,female,Second,16.0,1
41,27.0,female,Second,21.0,0
43,3.0,female,Second,41.5792,1
53,29.0,female,Second,26.0,1


('Second', 'male')


Unnamed: 0,age,sex,class,fare,survived
17,,male,Second,13.0,1
20,35.0,male,Second,26.0,0
21,34.0,male,Second,13.0,1
33,66.0,male,Second,10.5,0
70,32.0,male,Second,10.5,0


('Third', 'female')


Unnamed: 0,age,sex,class,fare,survived
2,26.0,female,Third,7.925,1
8,27.0,female,Third,11.1333,1
10,4.0,female,Third,16.7,1
14,14.0,female,Third,7.8542,0
18,31.0,female,Third,18.0,0


('Third', 'male')


Unnamed: 0,age,sex,class,fare,survived
0,22.0,male,Third,7.25,0
4,35.0,male,Third,8.05,0
5,,male,Third,8.4583,0
7,2.0,male,Third,21.075,0
12,20.0,male,Third,8.05,0





Unnamed: 0,age,sex,class,fare,survived
2,26.0,female,Third,7.925,1
8,27.0,female,Third,11.1333,1
10,4.0,female,Third,16.7,1
14,14.0,female,Third,7.8542,0
18,31.0,female,Third,18.0,0





Unnamed: 0_level_0,Unnamed: 1_level_0,age,fare,survived
class,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
First,female,34.611765,106.125798,0.968085
First,male,41.281386,67.226127,0.368852
Second,female,28.722973,21.970121,0.921053
Second,male,30.740707,19.741782,0.157407
Third,female,21.75,16.11881,0.5
Third,male,26.507589,12.661633,0.135447


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

titanic = sns.load_dataset('titanic')

# 필요한 몇개의 컬럼만 추려서 사용할거에요!
df = titanic.loc[:, ['age', 'sex', 'class', 'fare', 'survived']]
display(df.head())
print()

grouped = df.groupby('class')
display(grouped.survived.mean(), grouped.fare.mean())
display(grouped.fare.sum())
print(grouped.fare.sum().max())
print()

# age열의 평균값이 30보다 작은 그룹만을 따로 선택.
# 생성된 그룹에 대해 조건을 설정해 조건을 만족하는 그룹만 따로 선책하려면 어떻게 해야하나요?
# filter()

grouped_filter = grouped.filter(lambda x: len(x) >= 300 )
display(grouped_filter.head())

group_filter = grouped.filter(lambda x : x.age.mean() < 30)
display(group_filter.head())

Unnamed: 0,age,sex,class,fare,survived
0,22.0,male,Third,7.25,0
1,38.0,female,First,71.2833,1
2,26.0,female,Third,7.925,1
3,35.0,female,First,53.1,1
4,35.0,male,Third,8.05,0





  grouped = df.groupby('class')


Unnamed: 0_level_0,survived
class,Unnamed: 1_level_1
First,0.62963
Second,0.472826
Third,0.242363


Unnamed: 0_level_0,fare
class,Unnamed: 1_level_1
First,84.154687
Second,20.662183
Third,13.67555


Unnamed: 0_level_0,fare
class,Unnamed: 1_level_1
First,18177.4125
Second,3801.8417
Third,6714.6951


18177.4125



Unnamed: 0,age,sex,class,fare,survived
0,22.0,male,Third,7.25,0
2,26.0,female,Third,7.925,1
4,35.0,male,Third,8.05,0
5,,male,Third,8.4583,0
7,2.0,male,Third,21.075,0


Unnamed: 0,age,sex,class,fare,survived
0,22.0,male,Third,7.25,0
2,26.0,female,Third,7.925,1
4,35.0,male,Third,8.05,0
5,,male,Third,8.4583,0
7,2.0,male,Third,21.075,0
