In [1]:
import pandas as pd
import numpy as np

In [2]:
s = pd.Series([1, 2, 3, np.nan, 6, 8]) # nan = not a number
print(s)

0    1.0
1    2.0
2    3.0
3    NaN
4    6.0
5    8.0
dtype: float64


In [3]:
dates = pd.date_range('20230613', periods=5) # date_range 함수가 str을 date로 포맷
print(dates)

DatetimeIndex(['2023-06-13', '2023-06-14', '2023-06-15', '2023-06-16',
               '2023-06-17'],
              dtype='datetime64[ns]', freq='D')


In [4]:
df = pd.DataFrame(np.random.randn(5, 4), index=dates, columns=('A', 'B', 'C', 'D'))
df

Unnamed: 0,A,B,C,D
2023-06-13,1.367007,-0.089651,1.370948,0.101234
2023-06-14,0.181241,2.01602,0.97099,-1.777475
2023-06-15,0.292653,-0.240686,0.568447,-0.327431
2023-06-16,0.092726,0.240113,0.409826,2.039473
2023-06-17,-1.66958,-0.755662,0.951344,-1.889437


In [5]:
df.head(3) # 제일 위의 column만 출력 가능

Unnamed: 0,A,B,C,D
2023-06-13,1.367007,-0.089651,1.370948,0.101234
2023-06-14,0.181241,2.01602,0.97099,-1.777475
2023-06-15,0.292653,-0.240686,0.568447,-0.327431


In [7]:
print(df.index)

DatetimeIndex(['2023-06-13', '2023-06-14', '2023-06-15', '2023-06-16',
               '2023-06-17'],
              dtype='datetime64[ns]', freq='D')


In [8]:
print(df.columns)

Index(['A', 'B', 'C', 'D'], dtype='object')


In [10]:
print(df.values) # value값을 배열로 출력

[[ 1.36700729 -0.08965102  1.37094814  0.10123357]
 [ 0.18124055  2.01602028  0.97098974 -1.77747543]
 [ 0.29265346 -0.24068556  0.56844665 -0.3274309 ]
 [ 0.09272617  0.24011312  0.40982648  2.03947312]
 [-1.66957962 -0.75566228  0.95134385 -1.88943695]]


In [11]:
df.info() # 데이터 프레임의 정보와 각 데이터의 타입을 보여 준다.

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5 entries, 2023-06-13 to 2023-06-17
Freq: D
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       5 non-null      float64
 1   B       5 non-null      float64
 2   C       5 non-null      float64
 3   D       5 non-null      float64
dtypes: float64(4)
memory usage: 200.0 bytes


In [12]:
df.describe()

Unnamed: 0,A,B,C,D
count,5.0,5.0,5.0,5.0
mean,0.05281,0.234027,0.854311,-0.370727
std,1.091942,1.058908,0.377236,1.606168
min,-1.66958,-0.755662,0.409826,-1.889437
25%,0.092726,-0.240686,0.568447,-1.777475
50%,0.181241,-0.089651,0.951344,-0.327431
75%,0.292653,0.240113,0.97099,0.101234
max,1.367007,2.01602,1.370948,2.039473


In [13]:
# B 컬럼의 값을 기준으로 내림차순 정렬하기
df.sort_values(by='B', ascending=False)

Unnamed: 0,A,B,C,D
2023-06-14,0.181241,2.01602,0.97099,-1.777475
2023-06-16,0.092726,0.240113,0.409826,2.039473
2023-06-13,1.367007,-0.089651,1.370948,0.101234
2023-06-15,0.292653,-0.240686,0.568447,-0.327431
2023-06-17,-1.66958,-0.755662,0.951344,-1.889437


In [15]:
print(df.A)

2023-06-13    1.367007
2023-06-14    0.181241
2023-06-15    0.292653
2023-06-16    0.092726
2023-06-17   -1.669580
Freq: D, Name: A, dtype: float64


In [23]:
print(df['A'])

2023-06-13    1.367007
2023-06-14    0.181241
2023-06-15    0.292653
2023-06-16    0.092726
2023-06-17   -1.669580
Freq: D, Name: A, dtype: float64


In [21]:
df[1:4]

Unnamed: 0,A,B,C,D
2023-06-14,0.181241,2.01602,0.97099,-1.777475
2023-06-15,0.292653,-0.240686,0.568447,-0.327431
2023-06-16,0.092726,0.240113,0.409826,2.039473


In [22]:
df['20130614':'20230616']

Unnamed: 0,A,B,C,D
2023-06-13,1.367007,-0.089651,1.370948,0.101234
2023-06-14,0.181241,2.01602,0.97099,-1.777475
2023-06-15,0.292653,-0.240686,0.568447,-0.327431
2023-06-16,0.092726,0.240113,0.409826,2.039473


In [24]:
# A, B열의 모든 행을 출력하기

df.loc[: ,['A', 'B']]

Unnamed: 0,A,B
2023-06-13,1.367007,-0.089651
2023-06-14,0.181241,2.01602
2023-06-15,0.292653,-0.240686
2023-06-16,0.092726,0.240113
2023-06-17,-1.66958,-0.755662


In [25]:
# 06 14 ~ 15 A, C 컬럼 값

df.loc['20230614':'20230615', ['A', 'C']]

Unnamed: 0,A,C
2023-06-14,0.181241,0.97099
2023-06-15,0.292653,0.568447


In [26]:
df.iloc[3:4, 0:2]

Unnamed: 0,A,B
2023-06-16,0.092726,0.240113


In [27]:
df.iloc[[0,1,3],[0,3]]

Unnamed: 0,A,D
2023-06-13,1.367007,0.101234
2023-06-14,0.181241,-1.777475
2023-06-16,0.092726,2.039473


In [28]:
df.iloc[[0,1,3], 1:3]

Unnamed: 0,B,C
2023-06-13,-0.089651,1.370948
2023-06-14,2.01602,0.97099
2023-06-16,0.240113,0.409826


In [32]:
# A컬럼의 값이 0보다 큰 row만 출력

df[df.A > 0]

Unnamed: 0,A,B,C,D
2023-06-13,1.367007,-0.089651,1.370948,0.101234
2023-06-14,0.181241,2.01602,0.97099,-1.777475
2023-06-15,0.292653,-0.240686,0.568447,-0.327431
2023-06-16,0.092726,0.240113,0.409826,2.039473


In [33]:
df[df > 0] # 데이터 전체에 조건을 걸면 조건에 만족하지 않는 데이터는 NaN 처리

Unnamed: 0,A,B,C,D
2023-06-13,1.367007,,1.370948,0.101234
2023-06-14,0.181241,2.01602,0.97099,
2023-06-15,0.292653,,0.568447,
2023-06-16,0.092726,0.240113,0.409826,2.039473
2023-06-17,,,0.951344,


In [34]:
df2 = df.copy() # 주소값이 아닌 전체 데이터를 복사해 별개의 개체를 만들 수 있다.
print(df is df2)

False


- 새로운 컬럼 추가하기

In [37]:
df2["E"] = ['one', 'two', 'three', 'four', 'five']
df2

Unnamed: 0,A,B,C,D,E
2023-06-13,1.367007,-0.089651,1.370948,0.101234,one
2023-06-14,0.181241,2.01602,0.97099,-1.777475,two
2023-06-15,0.292653,-0.240686,0.568447,-0.327431,three
2023-06-16,0.092726,0.240113,0.409826,2.039473,four
2023-06-17,-1.66958,-0.755662,0.951344,-1.889437,five


In [39]:
# isin(): 컬럼에 해당 값을 가지고 있는 요소만 True 반환
print(df2['E'].isin(['two', 'three']))

2023-06-13    False
2023-06-14     True
2023-06-15     True
2023-06-16    False
2023-06-17    False
Freq: D, Name: E, dtype: bool


In [40]:
df2[df2['E'].isin(['two', 'three'])]

Unnamed: 0,A,B,C,D,E
2023-06-14,0.181241,2.01602,0.97099,-1.777475,two
2023-06-15,0.292653,-0.240686,0.568447,-0.327431,three


In [41]:
df

Unnamed: 0,A,B,C,D
2023-06-13,1.367007,-0.089651,1.370948,0.101234
2023-06-14,0.181241,2.01602,0.97099,-1.777475
2023-06-15,0.292653,-0.240686,0.568447,-0.327431
2023-06-16,0.092726,0.240113,0.409826,2.039473
2023-06-17,-1.66958,-0.755662,0.951344,-1.889437


In [42]:
# 누적 합 구하기
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D
2023-06-13,1.367007,-0.089651,1.370948,0.101234
2023-06-14,1.548248,1.926369,2.341938,-1.676242
2023-06-15,1.840901,1.685684,2.910385,-2.003673
2023-06-16,1.933627,1.925797,3.320211,0.0358
2023-06-17,0.264048,1.170135,4.271555,-1.853637


In [44]:
# 최대값과 최소값의 차 출력하기
print(df.apply(lambda x: x.max() - x.min()))

A    3.036587
B    2.771683
C    0.961122
D    3.928910
dtype: float64
