# Numpy

In [1]:
import numpy as np

## np.ndarray

In [2]:
# Build a one-dimensional array from a plain Python list.
v1: np.ndarray = np.array([1, 2, 3, 4])
# ndarray = N-dimensional array

In [3]:
v1

array([1, 2, 3, 4])

In [4]:
# Check the Python type of the object that np.array returned.
type(v1)

numpy.ndarray

In [5]:
v1.dtype
# dtype: the data type of the array's elements

dtype('int64')

In [6]:
v1.shape
# shape: the number of items along each dimension
# (4,): a single dimension holding 4 items

(4,)

In [7]:
# Build a two-dimensional array from a nested list:
# each inner list becomes one row (4 rows of 3 items).
v2 = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
    [10, 11, 12],
])
v2

array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 7,  8,  9],
       [10, 11, 12]])

In [8]:
# v2 was built from 4 inner lists of 3 items each, so shape reports (rows, columns).
v2.shape

(4, 3)

In [9]:
# A single integer index selects along the first axis, i.e. the first row.
v2[0]

array([1, 2, 3])

In [10]:
# Comma-separated indices address (row, column) in one step: row 1, column 2.
v2[1, 2]

6

In [11]:
# Same element via chained indexing: v2[1] yields row 1, then [2] picks its third item.
v2[1][2]

6

In [12]:
# ':' keeps every row; 2 picks the third column.
v2[:, 2]

array([ 3,  6,  9, 12])

In [13]:
# Slice 1:3 keeps rows 1 and 2 (the stop index is excluded); 2 picks the third column.
v2[1:3, 2]

array([6, 9])

## `np.arange`

In [14]:
# arange(stop): integers from 0 up to, but not including, stop.
vr1: np.ndarray = np.arange(5)
# arange(start, stop, step): from 1 up to, but not including, 10 in steps of 2.
vr2: np.ndarray = np.arange(1, 10, 2)

In [15]:
vr1

array([0, 1, 2, 3, 4])

In [16]:
vr2

array([1, 3, 5, 7, 9])

`np.ndarray`에 연산 처리 시 파이썬 리스트와 달리 각 아이템에 연산 적용 (Broadcasting)

In [17]:
# The scalar operand is applied to every element of the array.
vr3: np.ndarray = np.arange(1, 10, 2) * 2
# A negative step counts down (10, 9, ..., 0); each value is then squared.
vr4: np.ndarray = np.arange(10, -1, -1) ** 2

In [18]:
vr3

array([ 2,  6, 10, 14, 18])

In [19]:
vr4

array([100,  81,  64,  49,  36,  25,  16,   9,   4,   1,   0])

In [20]:
# In-place multiplication modifies vr4 itself instead of creating a new array.
# NOTE(review): this cell is not idempotent -- running it again triples vr4 once more.
vr4 *= 3
vr4

array([300, 243, 192, 147, 108,  75,  48,  27,  12,   3,   0])

## `np.reshape`

In [21]:
# 12 consecutive integers, then the same values arranged as 3 rows x 4 columns.
vs1: np.ndarray = np.arange(12)
vs2: np.ndarray = vs1.reshape(3, 4)

In [22]:
vs1

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [23]:
vs2

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [24]:
vs3: np.ndarray = vs1.reshape(4, -1)
# -1: the one remaining dimension is inferred automatically (12 items / 4 rows = 3 columns)
vs3

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11]])

In [25]:
# Without order='F' (compare vs2) the values are laid out row by row.
vs4: np.ndarray = vs1.reshape(2, 6, order='F') # order='F': Fortran-style, values fill the result column by column
vs4

array([[ 0,  2,  4,  6,  8, 10],
       [ 1,  3,  5,  7,  9, 11]])

\* 메서드 체이닝 가능

In [26]:
# Method chaining: each reshape returns an ndarray, so the calls can be strung together.
# The chain is wrapped in parentheses (implicit line continuation) rather than
# trailing backslashes, which are fragile: any whitespace after a backslash is a
# syntax error.
vc1: np.ndarray = (
    np.arange(12)
    .reshape(2, 6)
    .reshape(4, -1)  # -1: the remaining dimension is inferred (12 items / 4 rows = 3)
)
vc1

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11]])

## `np.ndarray`의 통계 메서드

In [27]:
# 2x2 sample matrix [[1, 2], [3, 4]] used by the statistics and arithmetic cells below.
vc: np.ndarray = np.arange(1, 5).reshape(2, -1)
vc

array([[1, 2],
       [3, 4]])

In [28]:
# Without an axis argument the maximum is taken over all elements.
np.max(vc)

4

In [29]:
np.max(vc, axis=1) # axis picks the row/column direction; axis=1 yields the maximum of each row

array([2, 4])

In [30]:
# Minimum over all elements.
np.min(vc)

1

In [31]:
# Arithmetic mean over all elements.
np.mean(vc)

2.5

In [32]:
# Standard deviation over all elements; the recorded value (sqrt(1.25)) is the
# population form, i.e. dividing by N rather than N - 1.
np.std(vc)

1.118033988749895

## `np.ndarray`의 사칙연산

In [33]:
vc

array([[1, 2],
       [3, 4]])

In [34]:
np.add(vc, vc) # element-wise sum

array([[2, 4],
       [6, 8]])

In [35]:
np.subtract(vc, vc) # element-wise difference

array([[0, 0],
       [0, 0]])

In [36]:
np.multiply(vc, vc) # element-wise product (not a matrix product)

array([[ 1,  4],
       [ 9, 16]])

In [37]:
np.divide(vc, vc) # element-wise division; the result is float even for integer inputs

array([[1., 1.],
       [1., 1.]])

In [38]:
np.dot(vc, vc) # for 2-D arrays np.dot is the matrix product (rows times columns), not element-wise

array([[ 7, 10],
       [15, 22]])

# Pandas

In [39]:
import pandas as pd

## Series

In [40]:
s: pd.Series = pd.Series([1, 3, 5, 7]) # Series: a list of values, each paired with an index key
s

0    1
1    3
2    5
3    7
dtype: int64

In [41]:
# The values of the Series as a NumPy array, without the index.
s.values

array([1, 3, 5, 7])

In [42]:
s.index # the index (key) scheme of the Series; here the default 0..3 RangeIndex

RangeIndex(start=0, stop=4, step=1)

In [43]:
# Supply explicit index labels instead of the default integer range.
s2: pd.Series = pd.Series([1, 3, 5, 7], index=['a', 'b', 'c', 'd'])
s2

a    1
b    3
c    5
d    7
dtype: int64

## DataFrame

In [44]:
# Colab-only setup: mount Google Drive so the data file can be read from it.
from google.colab import drive
drive.mount('/content/gdrive')
# Folder that holds this notebook's data files.
# NOTE(review): hard-coded, user-specific Drive path -- adjust to your own layout.
PATH = '/content/gdrive/MyDrive/Workspaces/인공지능 중급과정 1/1. numpy pandas'

Mounted at /content/gdrive


In [45]:
# Load the Excel file under PATH into a DataFrame and preview its first rows.
df: pd.DataFrame = pd.read_excel(f'{PATH}/cosmetics_.xlsx')
df.head()

Unnamed: 0,gender,marriage,edu,job,mincome,aware,count,amount,decision,propensity,skin,promo,location,satisf_b,satisf_i,satisf_al,repurchase
0,1,1,4,1,2,2,1,11000,2,1,1,1,2,5,2,2,2
1,2,1,4,9,2,1,4,30000,1,1,3,2,3,2,3,3,4
2,2,2,4,4,3,1,6,100000,3,2,3,2,2,4,5,4,4
3,2,2,4,7,5,2,6,65000,3,2,5,2,3,3,4,4,4
4,1,2,6,6,5,2,2,50000,2,2,3,2,3,3,3,3,3


In [46]:
# Selecting a single column by name returns it as a Series.
df['amount']

0        11000
1        30000
2       100000
3        65000
4        50000
        ...   
242       4000
243     150000
244     100000
245      20000
246    1000000
Name: amount, Length: 247, dtype: int64

In [47]:
df.loc[0]['amount'] # df.loc[key] => the row of df labelled key; ['amount'] then picks that field from the row

11000

In [48]:
df.loc[0, 'amount'] # loc[row, column] => NumPy-style indexing in a single step

11000

In [49]:
df.at[0, 'amount'] # similar to loc, for a single cell; works as both getter and setter, and is fast

11000

In [50]:
df.iloc[0, 7] # index-location: integer positions instead of keys (column position 7 is 'amount')

11000

**DataFrame 각 행에 대해 연산 처리 가능**

연산 처리 및 새 열 추가

In [51]:
# Column arithmetic is applied row by row; assigning to a new key appends the
# result as the last column.
df['total_payment'] = df['amount'] * df['count']
df.head()

Unnamed: 0,gender,marriage,edu,job,mincome,aware,count,amount,decision,propensity,skin,promo,location,satisf_b,satisf_i,satisf_al,repurchase,total_payment
0,1,1,4,1,2,2,1,11000,2,1,1,1,2,5,2,2,2,11000
1,2,1,4,9,2,1,4,30000,1,1,3,2,3,2,3,3,4,120000
2,2,2,4,4,3,1,6,100000,3,2,3,2,2,4,5,4,4,600000
3,2,2,4,7,5,2,6,65000,3,2,5,2,3,3,4,4,4,390000
4,1,2,6,6,5,2,2,50000,2,2,3,2,3,3,3,3,3,100000


연산 처리 및 새 열 삽입

In [52]:
df.insert(8, 'total_payment_', df['amount'] * df['count']) # insert AT column position 8, i.e. right after 'amount' (position 7)
df.head()

Unnamed: 0,gender,marriage,edu,job,mincome,aware,count,amount,total_payment_,decision,propensity,skin,promo,location,satisf_b,satisf_i,satisf_al,repurchase,total_payment
0,1,1,4,1,2,2,1,11000,11000,2,1,1,1,2,5,2,2,2,11000
1,2,1,4,9,2,1,4,30000,120000,1,1,3,2,3,2,3,3,4,120000
2,2,2,4,4,3,1,6,100000,600000,3,2,3,2,2,4,5,4,4,600000
3,2,2,4,7,5,2,6,65000,390000,3,2,5,2,3,3,4,4,4,390000
4,1,2,6,6,5,2,2,50000,100000,2,2,3,2,3,3,3,3,3,100000


열 제거

In [53]:
# Remove the two demonstration columns so df is back to its original columns.
del df['total_payment']
del df['total_payment_']

**필터링**  
인덱스 키에 조건식 추가 시 조건식에 해당하는 행만 선택

In [54]:
# A boolean condition inside [] keeps only the rows where it is True (here: gender == 1).
df_gender: pd.DataFrame = df[ df['gender'] == 1 ]
df_gender

Unnamed: 0,gender,marriage,edu,job,mincome,aware,count,amount,decision,propensity,skin,promo,location,satisf_b,satisf_i,satisf_al,repurchase
0,1,1,4,1,2,2,1,11000,2,1,1,1,2,5,2,2,2
4,1,2,6,6,5,2,2,50000,2,2,3,2,3,3,3,3,3
7,1,1,6,4,5,4,10,39000,3,2,2,1,2,4,4,4,4
11,1,1,2,5,3,2,1,30000,3,2,3,2,2,3,3,3,3
14,1,2,4,4,6,2,2,60000,3,2,1,2,5,3,3,3,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
237,1,2,4,4,3,2,2,50000,1,2,5,2,2,3,4,3,3
238,1,2,4,4,5,2,1,80000,2,2,1,2,3,3,3,3,3
241,1,1,2,1,2,2,5,3000,1,1,2,1,1,1,1,1,1
242,1,1,4,2,3,2,6,4000,1,1,1,1,4,2,1,1,1


In [55]:
df_gender.describe() # summary statistics for `df_gender`

Unnamed: 0,gender,marriage,edu,job,mincome,aware,count,amount,decision,propensity,skin,promo,location,satisf_b,satisf_i,satisf_al,repurchase
count,132.0,132.0,132.0,132.0,132.0,132.0,132.0,132.0,132.0,132.0,132.0,132.0,132.0,132.0,132.0,132.0,132.0
mean,1.0,1.704545,4.590909,4.098485,4.136364,4.219697,3.227273,127757.6,2.371212,1.909091,2.621212,1.992424,2.545455,2.840909,3.378788,3.439394,3.462121
std,0.0,0.457985,1.666713,2.157394,1.614688,6.782684,3.242351,448480.4,0.775634,0.63552,1.42245,0.88668,1.043801,0.789526,0.796273,0.712565,0.755694
min,1.0,1.0,2.0,1.0,1.0,1.0,1.0,3000.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,1.0,1.0,4.0,3.0,3.0,2.0,1.0,30000.0,2.0,1.75,1.0,1.0,2.0,2.0,3.0,3.0,3.0
50%,1.0,2.0,4.0,4.0,4.0,2.0,2.0,50000.0,3.0,2.0,3.0,2.0,3.0,3.0,3.0,3.0,4.0
75%,1.0,2.0,6.0,4.25,6.0,2.0,3.0,100000.0,3.0,2.0,4.0,2.0,3.0,3.0,4.0,4.0,4.0
max,1.0,2.0,8.0,10.0,6.0,31.0,20.0,5000000.0,3.0,3.0,5.0,4.0,5.0,5.0,5.0,5.0,5.0


In [56]:
df_reacount: pd.DataFrame = df[ (df['count'] < 10) & (df['count'] >= 1) ] # combine conditions with & instead of `and`; each condition sits in its own parentheses
df_reacount.head()

Unnamed: 0,gender,marriage,edu,job,mincome,aware,count,amount,decision,propensity,skin,promo,location,satisf_b,satisf_i,satisf_al,repurchase
0,1,1,4,1,2,2,1,11000,2,1,1,1,2,5,2,2,2
1,2,1,4,9,2,1,4,30000,1,1,3,2,3,2,3,3,4
2,2,2,4,4,3,1,6,100000,3,2,3,2,2,4,5,4,4
3,2,2,4,7,5,2,6,65000,3,2,5,2,3,3,4,4,4
4,1,2,6,6,5,2,2,50000,2,2,3,2,3,3,3,3,3


In [57]:
# Rows whose amount is at least 1,000,000.
df[ df['amount'] >= 1000000 ]

Unnamed: 0,gender,marriage,edu,job,mincome,aware,count,amount,decision,propensity,skin,promo,location,satisf_b,satisf_i,satisf_al,repurchase
16,1,2,8,3,2,5,3,1000000,1,3,1,2,2,3,3,3,3
17,2,1,3,8,5,1,6,1500000,3,3,2,4,1,4,4,4,4
53,2,2,4,3,3,14,1,2500000,1,2,1,2,2,4,4,4,4
54,1,2,4,4,5,4,2,1000000,3,2,1,1,1,3,3,3,3
90,2,2,6,3,6,1,4,1000000,3,3,2,2,1,5,1,4,4
166,2,2,8,3,3,1,5,1500000,3,3,5,2,1,5,4,4,4
193,1,2,6,3,6,2,1,5000000,1,3,5,4,1,3,4,4,4
246,2,2,6,10,1,1,10,1000000,3,2,3,1,3,2,3,3,3


In [58]:
df['edu'].value_counts() # works like a Counter over the edu column; most frequent value first

4    136
2     30
6     29
8     26
7     15
3      9
5      2
Name: edu, dtype: int64

In [59]:
# sort=False: do not order the result by count.
df['edu'].value_counts(sort=False)

4    136
6     29
2     30
7     15
8     26
3      9
5      2
Name: edu, dtype: int64

In [60]:
from collections import Counter
# The same tally built by hand: count the edu values with collections.Counter,
# then turn the counts into a Series indexed by the counted values.
counted: Counter = Counter(df['edu'].values) # Counter (a dict)
series: pd.Series = pd.Series(data=counted.values(), index=counted.keys()) # pd.Series

series

4    136
6     29
2     30
7     15
8     26
3      9
5      2
dtype: int64

`map` / `apply` 메서드 기능: Series의 각 원소에 함수를 적용

In [61]:
def num_to_gender(val: int) -> str:
    """Translate a numeric gender code into its text label.

    Args:
        val: Gender code; 1 means male and 2 means female.

    Returns:
        'male' for 1, 'female' for 2, and the placeholder '__nil__' for
        any other value.
    """
    # Guard clauses: return as soon as a known code matches.
    if val == 1:
        return 'male'
    if val == 2:
        return 'female'
    # Anything else is an unknown code.
    return '__nil__'

In [62]:
# Series.apply calls num_to_gender on every element of the gender column;
# the returned labels are stored as a new column.
df['gender_str'] = df['gender'].apply(num_to_gender)
df['gender_str'].head()

0      male
1    female
2    female
3    female
4      male
Name: gender_str, dtype: object