# 라이브러리 불러오기

In [7]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl

# Report the installed version of every library used in this notebook,
# in import order.
for lib in (np, pd, sns, mpl):
    print(lib.__version__)


2.2.4
2.2.3
0.13.2
3.10.1


# 샘플 데이터 가져오기

In [4]:
# Load seaborn's "iris" sample dataset and preview its first row.
iris = sns.load_dataset(name="iris")
iris.head(n=1)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa


In [5]:
# Load seaborn's "tips" sample dataset and preview its first row.
tips = sns.load_dataset(name="tips")
tips.head(n=1)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2


# 결측치 확인
- 데이터가 비어 있나?

In [8]:
# Count the missing values in each column (isna is the canonical name of isnull).
iris.isna().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

In [9]:
# (rows, columns) of the iris table.
iris.shape

(150, 5)

In [10]:
# (rows, columns) of the tips table.
tips.shape

(244, 7)

In [12]:
# Selecting a single column from a DataFrame yields a Series.
a = tips['day']
type(a)

pandas.core.series.Series

In [13]:
# The whole table, by contrast, is a DataFrame.
type(tips)

pandas.core.frame.DataFrame

In [14]:
# Frequency of each weekday in the `day` column, most common first.
tips.loc[:, 'day'].value_counts()

day
Sat     87
Sun     76
Thur    62
Fri     19
Name: count, dtype: int64

# 상위 5개만 보기
- https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.nlargest.html
- 관련 함수
  + DataFrame.nsmallest
  + DataFrame.sort_values
  + DataFrame.head

In [15]:
# nlargest orders the numeric column in descending order and keeps
# only the top 5 rows.
iris.nlargest(n=5, columns="sepal_length")

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
131,7.9,3.8,6.4,2.0,virginica
117,7.7,3.8,6.7,2.2,virginica
118,7.7,2.6,6.9,2.3,virginica
122,7.7,2.8,6.7,2.0,virginica
135,7.7,3.0,6.1,2.3,virginica


# 필터링
- NumPy의 불리언 인덱싱과 문법이 동일

In [17]:
# Average of the `tip` column in the tips table.
mean_tip = tips.loc[:, 'tip'].mean()
mean_tip

np.float64(2.99827868852459)

In [21]:
# Keep only the rows whose tip is greater than the average; preview one row.
tips.loc[tips['tip'].gt(mean_tip)].head(n=1)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
2,21.01,3.5,Male,No,Sun,Dinner,3


In [20]:
# Once comfortable, inline the expression instead of creating an
# intermediate variable for the mean.
tips.loc[tips['tip'].gt(tips['tip'].mean())].head(n=1)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
2,21.01,3.5,Male,No,Sun,Dinner,3


## 특정 항목의 특정 값을 가진 데이터 출력

In [30]:
# Rows where `smoker` equals 'No'; preview one row.
tips.loc[tips['smoker'].eq('No')].head(n=1)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2


In [31]:
# Rows where `time` equals 'Dinner'; preview one row.
tips.loc[tips['time'].eq('Dinner')].head(n=1)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2


In [33]:
# After filtering, always reset the index with reset_index
# (drop=True discards the old index instead of keeping it as a column).
tips.loc[tips['time'].eq('Dinner')].reset_index(drop=True).head(n=1)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2


# loc vs iloc
- 코드 비교

In [38]:
# Boolean-mask form, for reference: tips.loc[tips['day'] == 'Sat']
# General form: tips.loc[rows, columns]
# Label-based slice: rows labelled 0 through 2 (the end label is included).
tips.loc[0:2, ['total_bill','tip','day']]

Unnamed: 0,total_bill,tip,day
0,16.99,1.01,Sun
1,10.34,1.66,Sun
2,21.01,3.5,Sun


In [39]:
# iloc: position-based slice. Rows at positions 0 and 1 (the end position
# is excluded); columns are chosen by integer position.
tips.iloc[0:2, [0,1,4]]

Unnamed: 0,total_bill,tip,day
0,16.99,1.01,Sun
1,10.34,1.66,Sun


- iloc는 정수 위치(position) 기반이므로 슬라이스의 끝 위치는 포함하지 않는다.
- loc는 레이블(label) 기반이므로 슬라이스의 끝 레이블까지 포함한다.

In [41]:
# Dinner rows only, restricted to four columns, in a single .loc call.
tips.loc[tips['time'].eq('Dinner'), ['total_bill', 'tip', 'day', 'time']]

Unnamed: 0,total_bill,tip,day,time
0,16.99,1.01,Sun,Dinner
1,10.34,1.66,Sun,Dinner
2,21.01,3.50,Sun,Dinner
3,23.68,3.31,Sun,Dinner
4,24.59,3.61,Sun,Dinner
...,...,...,...,...
239,29.03,5.92,Sat,Dinner
240,27.18,2.00,Sat,Dinner
241,22.67,2.00,Sat,Dinner
242,17.82,1.75,Sat,Dinner


In [44]:
# Rows whose total bill is at most 11, all columns; preview one row.
tips.loc[tips['total_bill'].le(11)].head(n=1)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,10.34,1.66,Male,No,Sun,Dinner,3


In [46]:
# Dinner rows, all columns; preview one row.
tips.loc[tips['time'].eq('Dinner')].head(n=1)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2


In [55]:
# Chained alternative, kept for reference:
#   tips.loc[tips['time'] == 'Dinner', :].loc[tips['total_bill'] <= 11, :]
# Preferred: combine both conditions into one mask with `&`.
tips.loc[tips['time'].eq('Dinner') & tips['total_bill'].le(11), :].head(n=1)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,10.34,1.66,Male,No,Sun,Dinner,3


In [67]:
# Total (rows, columns) of iris, as a baseline for the filters below.
iris.shape

(150, 5)

In [63]:
# Goal: species is virginica OR sepal_length >= 5,
# columns: sepal_length, petal_length, species.
# First, count the virginica rows on their own.
iris.loc[iris['species'].eq('virginica'), :].shape

(50, 5)

In [64]:
# Count the rows with sepal_length of at least 5.
iris.loc[iris['sepal_length'].ge(5), :].shape

(128, 5)

In [65]:
# Combine both conditions with OR and keep three columns; check the size.
iris.loc[
    iris['species'].eq('virginica') | iris['sepal_length'].ge(5),
    ['sepal_length', 'petal_length', 'species'],
].shape

(129, 3)

In [68]:
# Same OR filter and column selection; preview the first matching row.
iris.loc[
    iris['species'].eq('virginica') | iris['sepal_length'].ge(5),
    ['sepal_length', 'petal_length', 'species'],
].head(n=1)

Unnamed: 0,sepal_length,petal_length,species
0,5.1,1.4,setosa


# 파일 입출력
- csv
- excel

In [70]:
import seaborn as sns
import pandas as pd

# Reload iris and keep only the two columns that will be exported.
iris = sns.load_dataset(name="iris")
result = iris[['sepal_length', 'species']]
result

Unnamed: 0,sepal_length,species
0,5.1,setosa
1,4.9,setosa
2,4.7,setosa
3,4.6,setosa
4,5.0,setosa
...,...,...
145,6.7,virginica
146,6.3,virginica
147,6.5,virginica
148,6.2,virginica


## CSV

In [74]:
# Export `result` to CSV without writing the index column.
# DataFrame.to_csv does not create missing parent directories and raises
# OSError when the folder is absent, so make sure it exists first.
from pathlib import Path

csv_path = Path("dataset/iris_result.csv")
csv_path.parent.mkdir(parents=True, exist_ok=True)
result.to_csv(csv_path, index=False)

In [75]:
# Read the exported CSV back into a DataFrame and display it.
iris_df = pd.read_csv(filepath_or_buffer="./dataset/iris_result.csv")
iris_df

Unnamed: 0,sepal_length,species
0,5.1,setosa
1,4.9,setosa
2,4.7,setosa
3,4.6,setosa
4,5.0,setosa
...,...,...
145,6.7,virginica
146,6.3,virginica
147,6.5,virginica
148,6.2,virginica
