In [1]:
# 데이터 분석에 사용되는 표준라이브러리 로딩작업
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# import matplotlib as mpl

In [2]:
# 1-D labelled array: a Series is a set of values paired with an index.
# Constructor sketch: pd.Series(data, index=...) — unlike DataFrame, a Series takes no `columns` argument.
sr = pd.Series([17000, 18000, 1000, 5000], 
               index=["피자", "치킨", "콜라", "맥주"])
sr

피자    17000
치킨    18000
콜라     1000
맥주     5000
dtype: int64

In [3]:
type(sr)  # class of the object

pandas.core.series.Series

In [5]:
# Extract only the data values.
sr.values  # the dtype in the result is the element type of the stored values

array([17000, 18000,  1000,  5000], dtype=int64)

In [6]:
# Extract only the index.
sr.index # dtype 'object' here: the labels are Python string objects

Index(['피자', '치킨', '콜라', '맥주'], dtype='object')

In [7]:
# 2-D table: DataFrame
# made of row labels (index), column labels (columns) and the data (values)
values = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
index = ['one', 'two', 'three']
columns = ['A', 'B', 'C']

df = pd.DataFrame(values, index=index, columns=columns)
df

Unnamed: 0,A,B,C
one,1,2,3
two,4,5,6
three,7,8,9


In [8]:
df.index

Index(['one', 'two', 'three'], dtype='object')

In [9]:
df.columns

Index(['A', 'B', 'C'], dtype='object')

In [10]:
df.values

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]], dtype=int64)

In [18]:
# A DataFrame can be built from lists, Series, dictionaries, NumPy arrays, etc.
# Here it is built from a list of rows; no column names are given, so the
# columns are numbered 0, 1, 2.
data = [
    ['1000', 'Steve', 90.72],
    ['1001', 'James', 78.09],
    ['1002', 'Doyeon', 98.43],
    ['1003', 'Jane', 64.19],
    ['1004', 'Pilwoong', 81.30],
    ['1005', 'Tony', 99.14]
]

df = pd.DataFrame(data)
df

Unnamed: 0,0,1,2
0,1000,Steve,90.72
1,1001,James,78.09
2,1002,Doyeon,98.43
3,1003,Jane,64.19
4,1004,Pilwoong,81.3
5,1005,Tony,99.14


In [19]:
# Same list of rows, this time with explicit column names.
df = pd.DataFrame(data, columns=['학번', '이름', '점수'])
df

Unnamed: 0,학번,이름,점수
0,1000,Steve,90.72
1,1001,James,78.09
2,1002,Doyeon,98.43
3,1003,Jane,64.19
4,1004,Pilwoong,81.3
5,1005,Tony,99.14


In [21]:
# Build a DataFrame from a dictionary: each key becomes a column name and
# each list becomes that column's values.
data = {
    '학번':['1000', '1001', '1002', '1003', '1004', '1005'],
    '이름':['aaa', 'bbb', 'ccc', 'ddd', 'eee', 'fff'],
    '점수':[90.72, 78.09, 98.43, 64.19, 81.30, 99.14]
}

df = pd.DataFrame(data)
df

Unnamed: 0,학번,이름,점수
0,1000,aaa,90.72
1,1001,bbb,78.09
2,1002,ccc,98.43
3,1003,ddd,64.19
4,1004,eee,81.3
5,1005,fff,99.14


In [22]:
# Inspecting a DataFrame
# df.head(n) : first n rows (n defaults to 5 in pandas; R's head() defaults to 6)
# head(df, n)   <- the R-style spelling of the same call
# df.tail(n) : last n rows
# df['column name'] : select that column

In [23]:
df.head()

Unnamed: 0,학번,이름,점수
0,1000,aaa,90.72
1,1001,bbb,78.09
2,1002,ccc,98.43
3,1003,ddd,64.19
4,1004,eee,81.3


In [25]:
df.head(3)

Unnamed: 0,학번,이름,점수
0,1000,aaa,90.72
1,1001,bbb,78.09
2,1002,ccc,98.43


In [27]:
df.tail(3)

Unnamed: 0,학번,이름,점수
3,1003,ddd,64.19
4,1004,eee,81.3
5,1005,fff,99.14


In [28]:
df['학번']

0    1000
1    1001
2    1002
3    1003
4    1004
5    1005
Name: 학번, dtype: object

In [29]:
# Select only the 학번 and 이름 columns (a list of names returns a DataFrame).
df[['학번', '이름']]

Unnamed: 0,학번,이름
0,1000,aaa
1,1001,bbb
2,1002,ccc
3,1003,ddd
4,1004,eee
5,1005,fff


In [57]:
# Show only the row labelled 1.
# Indexing pattern: [row label, column label]
#df[1, :]   (NumPy-style indexing, left disabled in favour of .loc below)

df.loc[1]

학번     1001
이름      bbb
점수    78.09
Name: 1, dtype: object

In [54]:
# .loc[row label, column label]: label-based 2-D indexing
# usage: df.loc[row]  or  df.loc[row, column]
# .iloc: position-based (integer order) 2-D indexing
df.iloc[[1]]  

Unnamed: 0,학번,이름,점수
1,1001,bbb,78.09


In [60]:
df = pd.DataFrame(np.arange(10,22).reshape(3, 4),
                 index = ['a','b','c'],
                 columns = ['A', 'B', 'C', 'D'])
df  # reshape turns the 1-D range 10..21 into a 3x4 2-D array

Unnamed: 0,A,B,C,D
a,10,11,12,13
b,14,15,16,17
c,18,19,20,21


In [61]:
# With a single label, the .loc indexer selects a row.
df.loc['a']

A    10
B    11
C    12
D    13
Name: a, dtype: int32

In [66]:
df.loc[['b', 'c']]  # a DataFrame has both rows and columns; .loc with a list of labels picks rows by label

Unnamed: 0,A,B,C,D
b,14,15,16,17
c,18,19,20,21


In [74]:
df.loc['b' : 'c']

Unnamed: 0,A,B,C,D
b,14,15,16,17
c,18,19,20,21


In [73]:
# Plain [] with a label *slice* selects rows, so this matches df.loc['b':'c'].
# A label *list* such as df[['b', 'c']] is looked up among the columns instead
# and raises KeyError here; use df.loc[['b', 'c']] for that.
df['b' : 'c']

Unnamed: 0,A,B,C,D
b,14,15,16,17
c,18,19,20,21


In [None]:
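# Wrap more than one label in a list: []
# df.loc[['b', 'c']]   >> show the rows whose index labels are b and c
# Without the .loc indexer (df[['b', 'c']]) a KeyError is raised, because the labels are then looked up among the columns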

In [75]:
df.A > 15  # boolean Series: True or False for each row

a    False
b    False
c     True
Name: A, dtype: bool

In [76]:
df.loc[df.A > 15]  # a condition inside the brackets keeps only the rows where it is True

Unnamed: 0,A,B,C,D
c,18,19,20,21


In [77]:
df[df.A>15]

Unnamed: 0,A,B,C,D
c,18,19,20,21


In [78]:
def select_rows(df, threshold=15):
    """Return a boolean mask of the rows whose column ``A`` exceeds ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains a column labelled ``"A"``.
    threshold : scalar, default 15
        Lower bound (exclusive) that a row's ``A`` value must exceed.
        The default reproduces the original hard-coded comparison.

    Returns
    -------
    pandas.Series
        Boolean Series aligned with ``df.index``; pass it to ``df.loc[...]``
        or ``df[...]`` to filter the rows.
    """
    return df["A"] > threshold

In [79]:
select_rows(df)

a    False
b    False
c     True
Name: A, dtype: bool

In [80]:
df.loc[select_rows(df)]   # df[select_rows(df)] is the plain-bracket form; .loc filters rows with the boolean mask

Unnamed: 0,A,B,C,D
c,18,19,20,21


In [84]:
# 'A' is a column label, not a row label, so the row indexer .loc cannot find it.
# The KeyError is the point of this cell; it is caught and printed so that
# Restart & Run All is not interrupted here.
try:
    df.loc['A']
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: 'A'

In [85]:
df = pd.DataFrame(np.arange(10,22).reshape(3, 4),
                 columns = ['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
0,10,11,12,13
1,14,15,16,17
2,18,19,20,21


In [86]:
df.loc[1:2]

Unnamed: 0,A,B,C,D
1,14,15,16,17
2,18,19,20,21


In [92]:
df.loc[1, 'A']

14

In [96]:
df.loc[1, :]

A    14
B    15
C    16
D    17
Name: 1, dtype: int32

In [97]:
df.loc[1:,'A']

1    14
2    18
Name: A, dtype: int32

In [100]:
df.loc[[0, 1], ['B', 'D']]

Unnamed: 0,B,D
0,11,13
1,15,17


In [101]:
df.loc[df.A > 10, ['C', 'D']]

Unnamed: 0,C,D
1,16,17
2,20,21


In [105]:
# Small body-measurement table: weight (체중), height (신장), sex (성별).
# NOTE(review): the recorded output shows index labels 1-5, but this code
# produces the default 0-4 index — confirm whether an index argument was dropped.
df = pd.DataFrame({
    "체중":[80, 70, 65, 55, 52],
    "신장":[180, 177, 169, 190, 155],
    "성별":["남","여","남","여","남"]
})
df

Unnamed: 0,체중,신장,성별
1,80,180,남
2,70,177,여
3,65,169,남
4,55,190,여
5,52,155,남


In [106]:
df['신장']

1    180
2    177
3    169
4    190
5    155
Name: 신장, dtype: int64

In [107]:
df[['체중', '신장']]

Unnamed: 0,체중,신장
1,80,180
2,70,177
3,65,169
4,55,190
5,52,155


In [108]:
df[df.성별 == "남"]

Unnamed: 0,체중,신장,성별
1,80,180,남
3,65,169,남
5,52,155,남


In [109]:
df[df.성별 == "여"]

Unnamed: 0,체중,신장,성별
2,70,177,여
4,55,190,여


In [111]:
data = {
    "names":['홍길동', '이순신', '장보고', '김유신', '강감찬'],
    "year":[2014, 2015, 2016, 2017, 2018],
    "points":[1.5, 1.7, 3.6, 2.4, 2.9]
}

df = pd.DataFrame(data)
df

Unnamed: 0,names,year,points
0,홍길동,2014,1.5
1,이순신,2015,1.7
2,장보고,2016,3.6
3,김유신,2017,2.4
4,강감찬,2018,2.9


In [112]:
# Summary of the DataFrame: index range, columns, non-null counts, dtypes, memory usage.
df.info()  

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
names     5 non-null object
year      5 non-null int64
points    5 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 200.0+ bytes


In [113]:
# Basic descriptive statistics of the numeric columns.
# Rows of the result:
#   count           - number of values
#   mean            - average
#   std             - standard deviation
#   min / max       - smallest / largest value
#   25% / 50% / 75% - quartiles
# (These notes are comments on purpose: a trailing string literal would become
#  the cell's last expression and be displayed instead of the describe() table.)
df.describe()

Unnamed: 0,year,points
count,5.0,5.0
mean,2016.0,2.42
std,1.581139,0.864292
min,2014.0,1.5
25%,2015.0,1.7
50%,2016.0,2.4
75%,2017.0,2.9
max,2018.0,3.6


In [114]:
# count() tallies the values and skips NaN (missing / unknown entries).
# The Series is created as float up front: NaN is a float, and assigning it
# into an integer Series relies on an implicit upcast that pandas 2.1+ warns about.
s = pd.Series(range(10), dtype=float)
s.loc[3] = np.nan
s

0    0.0
1    1.0
2    2.0
3    NaN
4    4.0
5    5.0
6    6.0
7    7.0
8    8.0
9    9.0
dtype: float64

In [117]:
s.count()

9

In [118]:
# Naming the index: look at the current index first.
df.index

RangeIndex(start=0, stop=5, step=1)

In [120]:
df.index.name = "Nid"

In [123]:
# Give the column index a name.
df.columns.name = "Info"
df

Info,names,year,points
Nid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,홍길동,2014,1.5
1,이순신,2015,1.7
2,장보고,2016,3.6
3,김유신,2017,2.4
4,강감찬,2018,2.9


In [126]:
# columns : list of column names to use
# index : list of row labels
# NaN : Not a Number 
# 'penalty' is not a key of `data`, so that column shows up empty (NaN) in the output.

df2 = pd.DataFrame(data, 
                   columns=["year", "names", "points", "penalty"],
                  index = ['one', 'two', 'three', 'four', 'five'])
df2

Unnamed: 0,year,names,points,penalty
one,2014,홍길동,1.5,
two,2015,이순신,1.7,
three,2016,장보고,3.6,
four,2017,김유신,2.4,
five,2018,강감찬,2.9,


In [127]:
df["year"]

Nid
0    2014
1    2015
2    2016
3    2017
4    2018
Name: year, dtype: int64

In [129]:
df2.year

one      2014
two      2015
three    2016
four     2017
five     2018
Name: year, dtype: int64

In [131]:
df2[["year", "points"]]

Unnamed: 0,year,points
one,2014,1.5
two,2015,1.7
three,2016,3.6
four,2017,2.4
five,2018,2.9


In [132]:
df2['penalty'] = 0.7
df2

Unnamed: 0,year,names,points,penalty
one,2014,홍길동,1.5,0.7
two,2015,이순신,1.7,0.7
three,2016,장보고,3.6,0.7
four,2017,김유신,2.4,0.7
five,2018,강감찬,2.9,0.7


In [133]:
df2['penalty'] = [0.5, 0.7, 0.9, 1.0, 0.6]
df2

Unnamed: 0,year,names,points,penalty
one,2014,홍길동,1.5,0.5
two,2015,이순신,1.7,0.7
three,2016,장보고,3.6,0.9
four,2017,김유신,2.4,1.0
five,2018,강감찬,2.9,0.6


In [134]:
df2['ages'] = np.arange(10, 15)
df2

Unnamed: 0,year,names,points,penalty,ages
one,2014,홍길동,1.5,0.5,10
two,2015,이순신,1.7,0.7,11
three,2016,장보고,3.6,0.9,12
four,2017,김유신,2.4,1.0,13
five,2018,강감찬,2.9,0.6,14


In [136]:
del df2['ages']
df2

Unnamed: 0,year,names,points,penalty
one,2014,홍길동,1.5,0.5
two,2015,이순신,1.7,0.7
three,2016,장보고,3.6,0.9
four,2017,김유신,2.4,1.0
five,2018,강감찬,2.9,0.6


In [137]:
# Positional slice [start:end): rows start .. end-1, the end position is excluded.
df2[0:3]

Unnamed: 0,year,names,points,penalty
one,2014,홍길동,1.5,0.5
two,2015,이순신,1.7,0.7
three,2016,장보고,3.6,0.9


In [140]:
df2.loc['two']

year       2015
names       이순신
points      1.7
penalty     0.7
Name: two, dtype: object

In [141]:
df2.loc['two':'four']

Unnamed: 0,year,names,points,penalty
two,2015,이순신,1.7,0.7
three,2016,장보고,3.6,0.9
four,2017,김유신,2.4,1.0


In [142]:
# .loc[row selection, column selection]
# a selection can be start:end, or a bare ':' meaning everything
df2.loc[:,['year', 'names']]

Unnamed: 0,year,names
one,2014,홍길동
two,2015,이순신
three,2016,장보고
four,2017,김유신
five,2018,강감찬


In [147]:
# .iloc: select by integer position
df2.iloc[3]

year       2017
names       김유신
points      2.4
penalty       1
Name: four, dtype: object

In [148]:
df2.loc['four']  # lookup by label -> .loc

year       2017
names       김유신
points      2.4
penalty       1
Name: four, dtype: object

In [151]:
# .iloc[row positions, column positions]
df2.iloc[3:5, 0:2]

Unnamed: 0,year,names
four,2017,김유신
five,2018,강감찬


In [152]:
df2.iloc[:, 1:4]

Unnamed: 0,names,points,penalty
one,홍길동,1.5,0.5
two,이순신,1.7,0.7
three,장보고,3.6,0.9
four,김유신,2.4,1.0
five,강감찬,2.9,0.6


In [153]:
df2.iloc[1, 1]

'이순신'

In [155]:
df2

Unnamed: 0,year,names,points,penalty
one,2014,홍길동,1.5,0.5
two,2015,이순신,1.7,0.7
three,2016,장보고,3.6,0.9
four,2017,김유신,2.4,1.0
five,2018,강감찬,2.9,0.6


In [156]:
df2["year"]

one      2014
two      2015
three    2016
four     2017
five     2018
Name: year, dtype: int64

In [158]:
df2["year"] >2016

one      False
two      False
three    False
four      True
five      True
Name: year, dtype: bool

In [159]:
df2[df2["year"] >2016]

Unnamed: 0,year,names,points,penalty
four,2017,김유신,2.4,1.0
five,2018,강감찬,2.9,0.6


In [160]:
df2.loc[df2["year"] >2016, :]

Unnamed: 0,year,names,points,penalty
four,2017,김유신,2.4,1.0
five,2018,강감찬,2.9,0.6


In [161]:
# 6 rows x 4 columns (6x4) of samples from the standard normal distribution.
# A locally seeded generator makes the table identical on every
# Restart & Run All; the unseeded np.random.randn call gave new numbers each run.
rng = np.random.default_rng(0)
df = pd.DataFrame(rng.standard_normal((6, 4)))
df

Unnamed: 0,0,1,2,3
0,0.543993,-1.312224,0.37865,-0.171586
1,-0.526851,2.041508,0.160551,-2.197706
2,-0.476016,-0.994183,-0.689055,0.754954
3,1.452504,0.183125,1.004557,0.44221
4,0.594598,-0.08345,0.430869,-0.215071
5,0.22722,0.972554,-0.7167,-0.525215


In [162]:
# Assign column labels and row labels to the existing DataFrame.
df.columns = ['A', 'B', 'C', 'D']
# pd.date_range(start date, options)
df.index = pd.date_range("20210419", periods=6)  # periods = how many dates to generate (6 consecutive days in the output)
df

Unnamed: 0,A,B,C,D
2021-04-19,0.543993,-1.312224,0.37865,-0.171586
2021-04-20,-0.526851,2.041508,0.160551,-2.197706
2021-04-21,-0.476016,-0.994183,-0.689055,0.754954
2021-04-22,1.452504,0.183125,1.004557,0.44221
2021-04-23,0.594598,-0.08345,0.430869,-0.215071
2021-04-24,0.22722,0.972554,-0.7167,-0.525215


In [166]:
# Drop column D.
# axis = 0 : rows (the default when axis is omitted)
# axis = 1 : columns
# NumPy numbers its axes the same way (axis 0 = rows, axis 1 = columns).
df.drop('D', axis=1)  # returns a new frame; df itself is unchanged unless the result is assigned back

Unnamed: 0,A,B,C
2021-04-19,0.543993,-1.312224,0.37865
2021-04-20,-0.526851,2.041508,0.160551
2021-04-21,-0.476016,-0.994183,-0.689055
2021-04-22,1.452504,0.183125,1.004557
2021-04-23,0.594598,-0.08345,0.430869
2021-04-24,0.22722,0.972554,-0.7167


In [167]:
df

Unnamed: 0,A,B,C,D
2021-04-19,0.543993,-1.312224,0.37865,-0.171586
2021-04-20,-0.526851,2.041508,0.160551,-2.197706
2021-04-21,-0.476016,-0.994183,-0.689055,0.754954
2021-04-22,1.452504,0.183125,1.004557,0.44221
2021-04-23,0.594598,-0.08345,0.430869,-0.215071
2021-04-24,0.22722,0.972554,-0.7167,-0.525215


In [168]:
# Drop columns B and C (again without modifying df).
df.drop(['B', 'C'], axis=1)

Unnamed: 0,A,D
2021-04-19,0.543993,-0.171586
2021-04-20,-0.526851,-2.197706
2021-04-21,-0.476016,0.754954
2021-04-22,1.452504,0.44221
2021-04-23,0.594598,-0.215071
2021-04-24,0.22722,-0.525215


In [169]:
df = pd.DataFrame({
    "weight":[80, 70.4, 65.5, 45.9, 51.2],
    "height":[170, 180, 155, 143, 154]
})
df

Unnamed: 0,weight,height
0,80.0,170
1,70.4,180
2,65.5,155
3,45.9,143
4,51.2,154


In [171]:
# Sum down each column: one total per column.
df.sum(axis=0)

weight    313.0
height    802.0
dtype: float64

In [172]:
# Sum across each row: one total per row.
df.sum(axis=1)

0    250.0
1    250.4
2    220.5
3    188.9
4    205.2
dtype: float64

In [173]:
# Mean of every column (the height mean is one of the results).
df.mean(axis=0)

weight     62.6
height    160.4
dtype: float64

In [176]:
df["height"].mean()

160.4

In [177]:
# Mean weight
df["weight"].mean()

62.6

In [179]:
# Variance (the recorded 212.3 is the sample variance, i.e. divided by n-1)
df['height'].var()

212.3

In [180]:
# Standard deviation: the square root of the variance,
# i.e. the same value as  import math; math.sqrt(df['height'].var())
df['height'].std()

14.570518178843194

In [182]:
# value_counts: counts how many times each distinct value occurs.
# 100 integers drawn from 0..5 with a seeded generator, so the counts shown
# by the following cells are the same on every Restart & Run All.
s = pd.Series(np.random.default_rng(0).integers(6, size=100))
s.tail()

95    1
96    1
97    1
98    5
99    5
dtype: int32

In [185]:
s.value_counts()   # used very often!

0    22
1    21
3    18
2    15
4    14
5    10
dtype: int64

In [186]:
# Sorting: sort_index and sort_values
# sort_index  : order by the index labels
# sort_values : order by the data values

# descending order : ascending=False
# by = column(s) to sort on

s.value_counts().sort_index()

0    22
1    21
2    15
3    18
4    14
5    10
dtype: int64

In [196]:
s.value_counts().sort_index(ascending=False)

5    10
4    14
3    18
2    15
1    21
0    22
dtype: int64

In [195]:
s.value_counts().sort_values(ascending=False)

0    22
1    21
3    18
2    15
4    14
5    10
dtype: int64

In [197]:
df

Unnamed: 0,weight,height
0,80.0,170
1,70.4,180
2,65.5,155
3,45.9,143
4,51.2,154


In [201]:
# Sort by weight, largest first.
df.sort_values(by="weight", ascending=False)

Unnamed: 0,weight,height
0,80.0,170
1,70.4,180
2,65.5,155
4,51.2,154
3,45.9,143


In [204]:
df.sort_values(by=["weight", "height"]) # rows with equal weight are ordered by height

Unnamed: 0,weight,height
3,45.9,143
4,51.2,154
2,65.5,155
1,70.4,180
0,80.0,170


In [203]:
df["weight"].sort_values()

3    45.9
4    51.2
2    65.5
1    70.4
0    80.0
Name: weight, dtype: float64

In [211]:
# Row / column totals
# sum(axis): axis=0 sums down the rows (one total per column),
#            axis=1 sums across the columns (one total per row)
# Seeded generator: the 4x8 table of digits 0..9 is the same on every run.
df2 = pd.DataFrame(np.random.default_rng(0).integers(10, size=(4, 8)))
df2

Unnamed: 0,0,1,2,3,4,5,6,7
0,4,6,2,2,4,1,7,8
1,4,7,2,5,4,2,8,3
2,2,3,7,1,9,1,8,9
3,3,3,4,5,3,9,7,6


In [212]:
# Row-wise totals
df2.sum(axis=1)

0    34
1    35
2    40
3    40
dtype: int64

In [213]:
df2['RowSum'] = df2.sum(axis=1)
df2

Unnamed: 0,0,1,2,3,4,5,6,7,RowSum
0,4,6,2,2,4,1,7,8,34
1,4,7,2,5,4,2,8,3,35
2,2,3,7,1,9,1,8,9,40
3,3,3,4,5,3,9,7,6,40


In [214]:
df2.sum()  # axis=0 is the default and can be omitted

0          13
1          19
2          15
3          13
4          20
5          13
6          30
7          26
RowSum    149
dtype: int64

In [215]:
df2.loc['ColSum', :] = df2.sum()
df2

Unnamed: 0,0,1,2,3,4,5,6,7,RowSum
0,4.0,6.0,2.0,2.0,4.0,1.0,7.0,8.0,34.0
1,4.0,7.0,2.0,5.0,4.0,2.0,8.0,3.0,35.0
2,2.0,3.0,7.0,1.0,9.0,1.0,8.0,9.0,40.0
3,3.0,3.0,4.0,5.0,3.0,9.0,7.0,6.0,40.0
ColSum,13.0,19.0,15.0,13.0,20.0,13.0,30.0,26.0,149.0


In [216]:
# apply(): run a given function over every column or every row.
df3 = pd.DataFrame({
    'A':[1, 3, 4, 3, 4],
    'B':[2, 3, 1, 2, 3],
    'C':[1, 5, 2, 4, 4],
})
df3

Unnamed: 0,A,B,C
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [217]:
# Range (max minus min) of each column.
#  lambda input : output
# Sketch of the named-function form:
# func = lambda x : x.max() - x.min()
# df3.apply(func, axis=1)
df3.apply(lambda x: x.max() - x.min())

A    3
B    2
C    4
dtype: int64

In [220]:
# Range (max minus min) of each row.
df3.apply(lambda x: x.max() - x.min(), axis=1)

0    1
1    2
2    3
3    2
4    1
dtype: int64

In [225]:
# value_counts per column:
# shows how many times each value occurs in every column
# (NaN where a value never occurs in that column).
# pd.Series.value_counts is used because the top-level pd.value_counts is deprecated.
df3.apply(pd.Series.value_counts)

Unnamed: 0,A,B,C
1,1.0,1.0,1.0
2,,2.0,1.0
3,2.0,2.0,
4,2.0,,2.0
5,,,1.0


In [226]:
# Cleaning: missing values and outliers
# NaN => fillna(value)
# pd.Series.value_counts is used because the top-level pd.value_counts is deprecated.
df3.apply(pd.Series.value_counts).fillna(0.0) # replace every NaN with 0.0

Unnamed: 0,A,B,C
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0


In [227]:
# is~  : checks
# as~  : conversions; astype(dtype) casts the values to the given dtype
# pd.Series.value_counts is used because the top-level pd.value_counts is deprecated.
df3.apply(pd.Series.value_counts).fillna(0.0).astype(int)

Unnamed: 0,A,B,C
1,1,1,1
2,0,2,1
3,2,2,0
4,2,0,2
5,0,0,1


In [231]:
# 매직명령어(magic)  - %%로 시작하는 명령어,,
%%writefile sample.csv
c1, c2, c3
1, 1.11, one
2, 2.22, two
3, 3.33, three

UsageError: Line magic function `%%writefile` not found.


In [233]:
df4 = pd.read_csv('sample1.csv')
df4

Unnamed: 0,c1,c2,c3
0,1,1.11,one
1,2,2.22,two
2,3,3.33,three


In [234]:
# Without header=None the first line of the file is consumed as the column
# names, as the output below shows.
# NOTE(review): no cell visible here writes sample2.csv — confirm the file exists before Run All.
df5 = pd.read_csv('sample2.csv')
df5

Unnamed: 0,1,1.11,one
0,2,2.22,two
1,3,3.33,three


In [235]:
# header=None: every line is data; the columns are numbered 0, 1, 2.
df5 = pd.read_csv('sample2.csv', header = None)
df5

Unnamed: 0,0,1,2
0,1,1.11,one
1,2,2.22,two
2,3,3.33,three


In [236]:
# names=[...]: supply the column names for a file that has no header line.
df6 = pd.read_csv('sample2.csv', names = ['c1', 'c2', 'c3'])
df6

Unnamed: 0,c1,c2,c3
0,1,1.11,one
1,2,2.22,two
2,3,3.33,three
