In [1]:
import pandas as pd
import numpy as np

In [2]:
dates = pd.date_range('20200625', periods=6)
dates

DatetimeIndex(['2020-06-25', '2020-06-26', '2020-06-27', '2020-06-28',
               '2020-06-29', '2020-06-30'],
              dtype='datetime64[ns]', freq='D')

In [3]:
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['A','B','C','D'])
df

Unnamed: 0,A,B,C,D
2020-06-25,-0.363182,0.668161,1.875191,-0.888778
2020-06-26,1.392618,-0.62876,-0.651513,0.214553
2020-06-27,-0.609972,-0.079751,-1.509384,0.500237
2020-06-28,-0.242563,-1.108095,1.975162,-0.810782
2020-06-29,-0.659872,0.503981,2.520272,1.137402
2020-06-30,0.383446,0.762964,-0.790644,0.167684


In [4]:
df2 = df.copy()
df2
# df는 기본적으로 행, 열의 위치값을 가지고 있음
# '기본적으로 가지고 있는 위치값(index)'를 이용하여 조작하는 경우 iloc[]를 사용!
# 일부 그래프나 분석 시 특정항목을 index항목으로 수동으로 설정해야하는 경우가 있음
# 특정 항목을 index항목으로 설정가능한 경우는 unique/not null인 값을 가지는 항목만 설정 가능!
# user_id, bbs_id pk조건과 일치!

Unnamed: 0,A,B,C,D
2020-06-25,-0.363182,0.668161,1.875191,-0.888778
2020-06-26,1.392618,-0.62876,-0.651513,0.214553
2020-06-27,-0.609972,-0.079751,-1.509384,0.500237
2020-06-28,-0.242563,-1.108095,1.975162,-0.810782
2020-06-29,-0.659872,0.503981,2.520272,1.137402
2020-06-30,0.383446,0.762964,-0.790644,0.167684


In [5]:
df2.reset_index(inplace=True)
df2

Unnamed: 0,index,A,B,C,D
0,2020-06-25,-0.363182,0.668161,1.875191,-0.888778
1,2020-06-26,1.392618,-0.62876,-0.651513,0.214553
2,2020-06-27,-0.609972,-0.079751,-1.509384,0.500237
3,2020-06-28,-0.242563,-1.108095,1.975162,-0.810782
4,2020-06-29,-0.659872,0.503981,2.520272,1.137402
5,2020-06-30,0.383446,0.762964,-0.790644,0.167684


In [6]:
df2.set_index('index', inplace=True)
df2

Unnamed: 0_level_0,A,B,C,D
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-06-25,-0.363182,0.668161,1.875191,-0.888778
2020-06-26,1.392618,-0.62876,-0.651513,0.214553
2020-06-27,-0.609972,-0.079751,-1.509384,0.500237
2020-06-28,-0.242563,-1.108095,1.975162,-0.810782
2020-06-29,-0.659872,0.503981,2.520272,1.137402
2020-06-30,0.383446,0.762964,-0.790644,0.167684


In [7]:
df2['E'] = df2['A'] + 1 # 브로드캐스팅

In [8]:
df2.columns

Index(['A', 'B', 'C', 'D', 'E'], dtype='object')

In [9]:
df2['E']

index
2020-06-25    0.636818
2020-06-26    2.392618
2020-06-27    0.390028
2020-06-28    0.757437
2020-06-29    0.340128
2020-06-30    1.383446
Name: E, dtype: float64

In [10]:
df2['F'] = 0
df2['F']

index
2020-06-25    0
2020-06-26    0
2020-06-27    0
2020-06-28    0
2020-06-29    0
2020-06-30    0
Name: F, dtype: int64

In [11]:
df2['G'] = range(0, 6)
df2['G']

index
2020-06-25    0
2020-06-26    1
2020-06-27    2
2020-06-28    3
2020-06-29    4
2020-06-30    5
Name: G, dtype: int32

In [12]:
# 인덱스를 변경하고 싶은 경우, 기존의 인덱스를 살리고 싶으면
# 먼저 reset_index()하고 나서 해야함
# reset을 안하는 경우 기존 index는 사라짐
df2.set_index('G', inplace=True)

In [13]:
df2

Unnamed: 0_level_0,A,B,C,D,E,F
G,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,-0.363182,0.668161,1.875191,-0.888778,0.636818,0
1,1.392618,-0.62876,-0.651513,0.214553,2.392618,0
2,-0.609972,-0.079751,-1.509384,0.500237,0.390028,0
3,-0.242563,-1.108095,1.975162,-0.810782,0.757437,0
4,-0.659872,0.503981,2.520272,1.137402,0.340128,0
5,0.383446,0.762964,-0.790644,0.167684,1.383446,0


In [14]:
df2.columns

Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')

In [15]:
df3 = df.copy()
df3

Unnamed: 0,A,B,C,D
2020-06-25,-0.363182,0.668161,1.875191,-0.888778
2020-06-26,1.392618,-0.62876,-0.651513,0.214553
2020-06-27,-0.609972,-0.079751,-1.509384,0.500237
2020-06-28,-0.242563,-1.108095,1.975162,-0.810782
2020-06-29,-0.659872,0.503981,2.520272,1.137402
2020-06-30,0.383446,0.762964,-0.790644,0.167684


In [16]:
# E항목을 파생변수로 만드세요. 1~10 범위 값
# F항목을 파생변수로 만드세요. A열과 C열의 합
# H항목을 파생변수로 만드세요. 1로 설정
# Z항목을 파생변수로 만드세요. 1~9까지 랜덤한 값으로 설정

In [17]:
import random

In [18]:
# 방법 1. 
data = range(1, 11) # 1 ~ 10
data2 = []
for x in range(6):
    data2.append(random.choice(data))
df3['E'] = data2
# 방법 2. df3['E'] = [1, 3, 5, 7, 9, 4]
df3['E']

2020-06-25    4
2020-06-26    1
2020-06-27    4
2020-06-28    2
2020-06-29    4
2020-06-30    6
Freq: D, Name: E, dtype: int64

In [19]:
df3['F'] = df3['A'] + df3['C']
df3['F']

2020-06-25    1.512009
2020-06-26    0.741104
2020-06-27   -2.119356
2020-06-28    1.732598
2020-06-29    1.860400
2020-06-30   -0.407198
Freq: D, Name: F, dtype: float64

In [20]:
df3['H'] = 1
df3['H']

2020-06-25    1
2020-06-26    1
2020-06-27    1
2020-06-28    1
2020-06-29    1
2020-06-30    1
Freq: D, Name: H, dtype: int64

In [21]:
range(10)

range(0, 10)

In [22]:
list(range(10))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [23]:
# ['반복하면서 한번에 처리할 내용' for i in range(10)]
[i + 1 for i in range(10)]

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [24]:
# ['반복하면서 한번에 처리할 내용' for i in range(10) if i/2 == 0]
[i + 1 for i in range(10) if i%2 == 0]

# data3 = []
# for i in range(10):
#     if i%2 == 0:
#         data3.append(i)
# data3

[1, 3, 5, 7, 9]

In [25]:
# 리스트 내포: 결과가 항상 list
# list comprehension
# 방법 1.
df3['Z'] = [random.randint(1, 10) for _ in range(6)]
df3['Z']
# 방법2 .
# data2 = []
# for _ in range(6):
#     data2.append(random.randint(1,10))
# df3['Z'] = data2

2020-06-25    3
2020-06-26    7
2020-06-27    4
2020-06-28    8
2020-06-29    2
2020-06-30    9
Freq: D, Name: Z, dtype: int64

In [26]:
df3

Unnamed: 0,A,B,C,D,E,F,H,Z
2020-06-25,-0.363182,0.668161,1.875191,-0.888778,4,1.512009,1,3
2020-06-26,1.392618,-0.62876,-0.651513,0.214553,1,0.741104,1,7
2020-06-27,-0.609972,-0.079751,-1.509384,0.500237,4,-2.119356,1,4
2020-06-28,-0.242563,-1.108095,1.975162,-0.810782,2,1.732598,1,8
2020-06-29,-0.659872,0.503981,2.520272,1.137402,4,1.8604,1,2
2020-06-30,0.383446,0.762964,-0.790644,0.167684,6,-0.407198,1,9


In [27]:
name = ["홍길동","김길동","송길동"]
for x in name:
    print(x)

홍길동
김길동
송길동


In [28]:
# name에 들어있는 각각의 모든 데이터에 뒤에 "님 글자를 붙여주세요!

In [29]:
[ x+'님' for x in name ]

['홍길동님', '김길동님', '송길동님']

In [30]:
# name에 들어있는 각각의 모든 데이터에 앞에 "신입" 글자를 붙여주세요!

In [31]:
[ "신입" + x for x in name ]

['신입홍길동', '신입김길동', '신입송길동']

In [32]:
a_list = range(1, 100, 2)

In [33]:
# a_list에 몇 개가 들어있는지 세어보세요.

In [34]:
count = 0
result = [ count + 1 for x in a_list ]
sum(result)

50

In [35]:
# a_list의 각각의 모든 데이터에 0.1을 곱해보세요. 변경된 a_list의 합과 평균

In [36]:
a_list = [x * 0.1 for x in a_list ]
a_list

[0.1,
 0.30000000000000004,
 0.5,
 0.7000000000000001,
 0.9,
 1.1,
 1.3,
 1.5,
 1.7000000000000002,
 1.9000000000000001,
 2.1,
 2.3000000000000003,
 2.5,
 2.7,
 2.9000000000000004,
 3.1,
 3.3000000000000003,
 3.5,
 3.7,
 3.9000000000000004,
 4.1000000000000005,
 4.3,
 4.5,
 4.7,
 4.9,
 5.1000000000000005,
 5.300000000000001,
 5.5,
 5.7,
 5.9,
 6.1000000000000005,
 6.300000000000001,
 6.5,
 6.7,
 6.9,
 7.1000000000000005,
 7.300000000000001,
 7.5,
 7.7,
 7.9,
 8.1,
 8.3,
 8.5,
 8.700000000000001,
 8.9,
 9.1,
 9.3,
 9.5,
 9.700000000000001,
 9.9]

In [37]:
a_list

[0.1,
 0.30000000000000004,
 0.5,
 0.7000000000000001,
 0.9,
 1.1,
 1.3,
 1.5,
 1.7000000000000002,
 1.9000000000000001,
 2.1,
 2.3000000000000003,
 2.5,
 2.7,
 2.9000000000000004,
 3.1,
 3.3000000000000003,
 3.5,
 3.7,
 3.9000000000000004,
 4.1000000000000005,
 4.3,
 4.5,
 4.7,
 4.9,
 5.1000000000000005,
 5.300000000000001,
 5.5,
 5.7,
 5.9,
 6.1000000000000005,
 6.300000000000001,
 6.5,
 6.7,
 6.9,
 7.1000000000000005,
 7.300000000000001,
 7.5,
 7.7,
 7.9,
 8.1,
 8.3,
 8.5,
 8.700000000000001,
 8.9,
 9.1,
 9.3,
 9.5,
 9.700000000000001,
 9.9]

In [38]:
np.sum(a_list) # 합

250.0

In [39]:
np.mean(a_list) # 평균

5.0

In [40]:
np.median(a_list) # 중간값

5.0

In [41]:
np.std(a_list) # 표준편차

2.8861739379323628

In [42]:
[ x * 0.1 for x in a_list][:3]

[0.010000000000000002, 0.030000000000000006, 0.05]

In [43]:
df3

Unnamed: 0,A,B,C,D,E,F,H,Z
2020-06-25,-0.363182,0.668161,1.875191,-0.888778,4,1.512009,1,3
2020-06-26,1.392618,-0.62876,-0.651513,0.214553,1,0.741104,1,7
2020-06-27,-0.609972,-0.079751,-1.509384,0.500237,4,-2.119356,1,4
2020-06-28,-0.242563,-1.108095,1.975162,-0.810782,2,1.732598,1,8
2020-06-29,-0.659872,0.503981,2.520272,1.137402,4,1.8604,1,2
2020-06-30,0.383446,0.762964,-0.790644,0.167684,6,-0.407198,1,9


In [44]:
df3.drop(columns=['E', 'F', 'H', 'Z'], inplace=True) 

In [45]:
df3

Unnamed: 0,A,B,C,D
2020-06-25,-0.363182,0.668161,1.875191,-0.888778
2020-06-26,1.392618,-0.62876,-0.651513,0.214553
2020-06-27,-0.609972,-0.079751,-1.509384,0.500237
2020-06-28,-0.242563,-1.108095,1.975162,-0.810782
2020-06-29,-0.659872,0.503981,2.520272,1.137402
2020-06-30,0.383446,0.762964,-0.790644,0.167684


In [46]:
df3.index

DatetimeIndex(['2020-06-25', '2020-06-26', '2020-06-27', '2020-06-28',
               '2020-06-29', '2020-06-30'],
              dtype='datetime64[ns]', freq='D')

In [47]:
df3.loc['2020-06-25']

A   -0.363182
B    0.668161
C    1.875191
D   -0.888778
Name: 2020-06-25 00:00:00, dtype: float64

In [48]:
df3.reset_index(inplace=True)
df3

Unnamed: 0,index,A,B,C,D
0,2020-06-25,-0.363182,0.668161,1.875191,-0.888778
1,2020-06-26,1.392618,-0.62876,-0.651513,0.214553
2,2020-06-27,-0.609972,-0.079751,-1.509384,0.500237
3,2020-06-28,-0.242563,-1.108095,1.975162,-0.810782
4,2020-06-29,-0.659872,0.503981,2.520272,1.137402
5,2020-06-30,0.383446,0.762964,-0.790644,0.167684


In [49]:
df3.set_index('index', inplace=True)
df3

Unnamed: 0_level_0,A,B,C,D
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-06-25,-0.363182,0.668161,1.875191,-0.888778
2020-06-26,1.392618,-0.62876,-0.651513,0.214553
2020-06-27,-0.609972,-0.079751,-1.509384,0.500237
2020-06-28,-0.242563,-1.108095,1.975162,-0.810782
2020-06-29,-0.659872,0.503981,2.520272,1.137402
2020-06-30,0.383446,0.762964,-0.790644,0.167684


In [50]:
df3.index[0]

Timestamp('2020-06-25 00:00:00')

In [51]:
df3.drop(pd.Timestamp('2020-06-25 00:00:00'), inplace=True)

In [52]:
df3

Unnamed: 0_level_0,A,B,C,D
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-06-26,1.392618,-0.62876,-0.651513,0.214553
2020-06-27,-0.609972,-0.079751,-1.509384,0.500237
2020-06-28,-0.242563,-1.108095,1.975162,-0.810782
2020-06-29,-0.659872,0.503981,2.520272,1.137402
2020-06-30,0.383446,0.762964,-0.790644,0.167684


In [53]:
df3.loc['2020-06-25'] = [1,1,1,1] # 열의 순서대로 들어감

In [54]:
df3

Unnamed: 0_level_0,A,B,C,D
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-06-26 00:00:00,1.392618,-0.62876,-0.651513,0.214553
2020-06-27 00:00:00,-0.609972,-0.079751,-1.509384,0.500237
2020-06-28 00:00:00,-0.242563,-1.108095,1.975162,-0.810782
2020-06-29 00:00:00,-0.659872,0.503981,2.520272,1.137402
2020-06-30 00:00:00,0.383446,0.762964,-0.790644,0.167684
2020-06-25,1.0,1.0,1.0,1.0
