# Pandas 

- 구조화된 데이터 처리를 지원하는 라이브러리
- 고성능 배열(Array) 계산 라이브러리인 NumPy와 통합되어 강력한 "스프레드시트" 처리 기능을 제공함


## 데이터 불러오기 및 확인하기

In [1]:
import pandas as pd # 라이브러리 호출 

In [2]:
# Housing data file hosted in the UCI machine-learning repository (needs network access).
data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data'
# data_url = './housing.data'  # alternative: a local copy of the same file
# Load the file as CSV: fields are separated by runs of whitespace and there is
# no header row. The separator is a regex, so it is written as a raw string to
# keep `\s` from being treated as an (invalid) string escape sequence.
df_data = pd.read_csv(data_url, sep=r'\s+', header=None)

In [4]:
df_data.head() # show the first five rows

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [5]:
# Assign the column header names, then preview the relabelled frame.
column_names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV',
]
df_data.columns = column_names
df_data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [7]:
df_data.values # extracts the data in NumPy form
type(df_data.values)

numpy.ndarray

## 시리즈 개념

- Series는 Column Vector를 표현하는 object 

In [10]:
from pandas import Series, DataFrame
import pandas as pd
# NumPy must be bound to `np`: later cells use np.float32, and `pd` has to stay
# bound to pandas rather than being rebound to NumPy.
import numpy as np

In [13]:
# Build a Series from a plain list (a dict works as well).
list_data = [1, 2, 3, 4, 5]

example_obj = Series(list_data)
example_obj

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [14]:
# Index labels can be supplied alongside the values.

list_data = [1, 2, 3, 4, 5]
list_name = list("abcde")
example_obj = Series(list_data, index=list_name)
example_obj

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [15]:
# Alternatively, a dict supplies the index -> value pairs directly.

dict_data = dict(a=1, b=2, c=3, d=4, e=5)
example_obj = Series(data=dict_data, dtype=np.float32, name="example_data")
example_obj

a    1.0
b    2.0
c    3.0
d    4.0
e    5.0
Name: example_data, dtype: float32

In [16]:
# Look up a value by its index label
example_obj["a"] 

1.0

In [18]:
# Assign a value to an index label
example_obj["a"] = 3.2
example_obj

a    3.2
b    2.0
c    3.0
d    4.0
e    5.0
Name: example_data, dtype: float32

In [19]:
# Select the values greater than 2 with a boolean mask
example_obj[example_obj > 2] 

a    3.2
c    3.0
d    4.0
e    5.0
Name: example_data, dtype: float32

In [20]:
# Multiplying the Series multiplies every value
example_obj * 2 

a     6.4
b     4.0
c     6.0
d     8.0
e    10.0
Name: example_data, dtype: float32

## DataFrame

In [21]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np

In [22]:
# Example from - https://chrisalbon.com/python/pandas_map_values_to_values.html
# Column-oriented records: each key is a column name, each list holds its values.
raw_data = {
    'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
    'last_name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'],
    'age': [42, 52, 36, 24, 73],
    'city': ['San Francisco', 'Baltimore', 'Miami', 'Douglas', 'Boston'],
}
# The explicit columns list fixes the column order of the resulting frame.
df = pd.DataFrame(
    data=raw_data,
    columns=['first_name', 'last_name', 'age', 'city'],
)
df

Unnamed: 0,first_name,last_name,age,city
0,Jason,Miller,42,San Francisco
1,Molly,Jacobson,52,Baltimore
2,Tina,Ali,36,Miami
3,Jake,Milner,24,Douglas
4,Amy,Cooze,73,Boston


In [25]:
# Show the data of specific columns only
DataFrame(raw_data, columns = ["age","city"])

Unnamed: 0,age,city
0,42,San Francisco
1,52,Baltimore
2,36,Miami
3,24,Douglas
4,73,Boston


In [37]:
# Add a new column: "debt" is listed in columns but is not a key of raw_data.
df = DataFrame(
    data=raw_data,
    columns=["first_name", "last_name", "age", "city", "debt"],
)

In [29]:
# Fetch the values of a single column

df.first_name  # attribute-style access
df['first_name']  # bracket-style access

0    Jason
1    Molly
2     Tina
3     Jake
4      Amy
Name: first_name, dtype: object

In [30]:
# loc (= location)
# Accesses a specific position by its index label

df.loc[1]

first_name        Molly
last_name      Jacobson
age                  52
city          Baltimore
Name: 1, dtype: object

In [32]:
# iloc = index position
# Accesses by index number (integer position)
df["age"].iloc[1:]

1    52
2    36
3    24
4    73
Name: age, dtype: int64

In [33]:
# Example from - https://stackoverflow.com/questions/31593201/pandas-iloc-vs-ix-vs-loc-explanation
# An all-NaN Series whose integer labels are deliberately out of positional order.
s = pd.Series(
    data=np.nan,
    index=[49, 48, 47, 46, 45, 1, 2, 3, 4, 5],
)
s

49   NaN
48   NaN
47   NaN
46   NaN
45   NaN
1    NaN
2    NaN
3    NaN
4    NaN
5    NaN
dtype: float64

In [34]:
# loc slices by label: every row up to and including the label 3
s.loc[:3]

49   NaN
48   NaN
47   NaN
46   NaN
45   NaN
1    NaN
2    NaN
3    NaN
dtype: float64

In [35]:
# iloc slices by position: the first three rows, whatever their labels are
s.iloc[:3]

49   NaN
48   NaN
47   NaN
dtype: float64

In [38]:
# Assign new data to a column.
# Bracket indexing is used instead of `df.debt = ...`: attribute assignment only
# updates a column that already exists; otherwise it sets a plain Python
# attribute on the object and no column is created.
df["debt"] = df["age"] > 40  # a condition can be used to derive a new boolean feature
df

Unnamed: 0,first_name,last_name,age,city,debt
0,Jason,Miller,42,San Francisco,True
1,Molly,Jacobson,52,Baltimore,True
2,Tina,Ali,36,Miami,False
3,Jake,Milner,24,Douglas,False
4,Amy,Cooze,73,Boston,True


In [39]:
df.T # transpose 

Unnamed: 0,0,1,2,3,4
first_name,Jason,Molly,Tina,Jake,Amy
last_name,Miller,Jacobson,Ali,Milner,Cooze
age,42,52,36,24,73
city,San Francisco,Baltimore,Miami,Douglas,Boston
debt,True,True,False,False,True


In [40]:
df.values # show the underlying values as an array

array([['Jason', 'Miller', 42, 'San Francisco', True],
       ['Molly', 'Jacobson', 52, 'Baltimore', True],
       ['Tina', 'Ali', 36, 'Miami', False],
       ['Jake', 'Milner', 24, 'Douglas', False],
       ['Amy', 'Cooze', 73, 'Boston', True]], dtype=object)

In [41]:
df.to_csv() # convert to CSV; with no path given, the CSV text comes back as a string

',first_name,last_name,age,city,debt\r\n0,Jason,Miller,42,San Francisco,True\r\n1,Molly,Jacobson,52,Baltimore,True\r\n2,Tina,Ali,36,Miami,False\r\n3,Jake,Milner,24,Douglas,False\r\n4,Amy,Cooze,73,Boston,True\r\n'

In [42]:
# Delete a column

del df["debt"]

## Selection & Drop 

In [43]:
# selection with column name 

In [46]:
df

Unnamed: 0,first_name,last_name,age,city
0,Jason,Miller,42,San Francisco
1,Molly,Jacobson,52,Baltimore
2,Tina,Ali,36,Miami
3,Jake,Milner,24,Douglas
4,Amy,Cooze,73,Boston


In [47]:
# Show the data of several columns at once

df[['first_name','last_name']].head(5)

Unnamed: 0,first_name,last_name
0,Jason,Miller
1,Molly,Jacobson
2,Tina,Ali
3,Jake,Milner
4,Amy,Cooze


In [49]:
# A slice of index numbers without a column name selects rows
df[:3]

Unnamed: 0,first_name,last_name,age,city
0,Jason,Miller,42,San Francisco
1,Molly,Jacobson,52,Baltimore
2,Tina,Ali,36,Miami


In [52]:
# Using a row slice together with a column name returns only that column's rows
df["first_name"][:3]

0    Jason
1    Molly
2     Tina
Name: first_name, dtype: object