In [7]:
import pandas as pd

# UCI housing data file: whitespace-delimited text with no header row.
data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data"
# Raw string so the regex \s+ reaches pandas intact; in a plain literal "\s" is an
# invalid escape sequence that newer Python versions warn about.
df_data = pd.read_csv(data_url, sep=r"\s+", header=None)
# Preview the first rows of the loaded DataFrame.
df_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [8]:
type(df_data.values)

numpy.ndarray

### Pandas의 구성
- Series: DataFrame 중 하나의 Column에 해당하는 데이터의 모음 object
- DataFrame: Data Table 전체를 포함하는 object

In [9]:
from pandas import Series, DataFrame
import numpy as np

In [11]:
# Build a Series from a plain Python list; no index is given, so the default one is used.
list_data = list(range(1, 6))
example_obj = Series(data=list_data)
example_obj

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [14]:
# Series(data, dtype=..., name=...): the dict keys become the index labels.
dict_data = dict(zip("abcde", range(1, 6)))
example_obj = Series(data=dict_data, dtype=np.float32, name="example_data")
example_obj

a    1.0
b    2.0
c    3.0
d    4.0
e    5.0
Name: example_data, dtype: float32

In [18]:
# Column-oriented input: each key maps to the values of one column.
raw_data = {
    "first_name": ["Jason", "Moly", "Tina", "Jake", "Amy"],
    "last_name": ["Miller", "Jacobson", "Ali", "Milner", "Cooze"],
    "age": [42, 52, 36, 24, 73],
    "city": ["San Francisco", "Baltimore", "Miami", "Doouglas", "Boston"],
}
# "debt" is not a key of raw_data, so that column is created empty (all NaN).
df = pd.DataFrame(data=raw_data, columns=[*raw_data, "debt"])
df

Unnamed: 0,first_name,last_name,age,city,debt
0,Jason,Miller,42,San Francisco,
1,Moly,Jacobson,52,Baltimore,
2,Tina,Ali,36,Miami,
3,Jake,Milner,24,Doouglas,
4,Amy,Cooze,73,Boston,


In [19]:
df.first_name # select one column via attribute access - yields a Series

0    Jason
1     Moly
2     Tina
3     Jake
4      Amy
Name: first_name, dtype: object

In [20]:
df["first_name"] # select one column by name - yields a Series

0    Jason
1     Moly
2     Tina
3     Jake
4      Amy
Name: first_name, dtype: object

### loc과 iloc의 차이

In [28]:
df.loc[1] #index location

first_name         Moly
last_name      Jacobson
age                  52
city          Baltimore
debt                NaN
Name: 1, dtype: object

In [83]:
# NOTE(review): the output recorded for this cell (Length: 1378) does not match the 5-row
# frame built above; it appears to come from the wages data loaded later (execution
# count 83 is out of order). Re-run the notebook top to bottom to refresh it.
df["age"].iloc[1:] #index position, series

1       62
2       33
3       95
4       43
5       30
        ..
1374    33
1375    86
1376    37
1377    54
1378    31
Name: age, Length: 1378, dtype: int64

In [29]:
# All-NaN Series whose integer labels are deliberately not in positional order,
# so label-based (.loc) and position-based (.iloc) slicing give different results.
s = pd.Series(data=np.nan, index=[*range(49, 44, -1), *range(1, 6)])
s

49   NaN
48   NaN
47   NaN
46   NaN
45   NaN
1    NaN
2    NaN
3    NaN
4    NaN
5    NaN
dtype: float64

In [30]:
s.loc[:3]

49   NaN
48   NaN
47   NaN
46   NaN
45   NaN
1    NaN
2    NaN
3    NaN
dtype: float64

In [31]:
s.iloc[:3]

49   NaN
48   NaN
47   NaN
dtype: float64

### DataFrame 몇 가지 기능들
- Column에 새로운 데이터 할당

In [32]:
# Assign through bracket syntax: `df.debt = ...` only updates a column that already
# exists; for a new name it would set a plain Python attribute instead of a column.
df["debt"] = df["age"] > 40  # boolean flag per row: True where age is above 40
df

Unnamed: 0,first_name,last_name,age,city,debt
0,Jason,Miller,42,San Francisco,True
1,Moly,Jacobson,52,Baltimore,True
2,Tina,Ali,36,Miami,False
3,Jake,Milner,24,Doouglas,False
4,Amy,Cooze,73,Boston,True


In [33]:
df.T #transpose

Unnamed: 0,0,1,2,3,4
first_name,Jason,Moly,Tina,Jake,Amy
last_name,Miller,Jacobson,Ali,Milner,Cooze
age,42,52,36,24,73
city,San Francisco,Baltimore,Miami,Doouglas,Boston
debt,True,True,False,False,True


In [34]:
df.values # the frame's contents as a NumPy array

array([['Jason', 'Miller', 42, 'San Francisco', True],
       ['Moly', 'Jacobson', 52, 'Baltimore', True],
       ['Tina', 'Ali', 36, 'Miami', False],
       ['Jake', 'Milner', 24, 'Doouglas', False],
       ['Amy', 'Cooze', 73, 'Boston', True]], dtype=object)

In [35]:
df.to_csv() # serialize to CSV; with no path given the CSV text is returned as a string

',first_name,last_name,age,city,debt\r\n0,Jason,Miller,42,San Francisco,True\r\n1,Moly,Jacobson,52,Baltimore,True\r\n2,Tina,Ali,36,Miami,False\r\n3,Jake,Milner,24,Doouglas,False\r\n4,Amy,Cooze,73,Boston,True\r\n'

In [36]:
del df["debt"] # delete the column from df itself
df

Unnamed: 0,first_name,last_name,age,city
0,Jason,Miller,42,San Francisco
1,Moly,Jacobson,52,Baltimore
2,Tina,Ali,36,Miami
3,Jake,Milner,24,Doouglas
4,Amy,Cooze,73,Boston


### Selection with column names

In [40]:
df[["first_name","last_name"]].head(3)

Unnamed: 0,first_name,last_name
0,Jason,Miller
1,Moly,Jacobson
2,Tina,Ali


In [41]:
df[:3]

Unnamed: 0,first_name,last_name,age,city
0,Jason,Miller,42,San Francisco
1,Moly,Jacobson,52,Baltimore
2,Tina,Ali,36,Miami


In [43]:
series = df["age"]
series[series < 50] #Boolean index

0    42
2    36
3    24
Name: age, dtype: int64

In [46]:
df.loc[[0,3],["first_name","age"]] # select by index labels and column names

Unnamed: 0,first_name,age
0,Jason,42
3,Jake,24


In [47]:
df.iloc[:2,:2] # select by integer row and column positions

Unnamed: 0,first_name,last_name
0,Jason,Miller
1,Moly,Jacobson


In [50]:
df.drop(1) # drop the row with index label 1; returns a new frame, df is unchanged

Unnamed: 0,first_name,last_name,age,city
0,Jason,Miller,42,San Francisco
2,Tina,Ali,36,Miami
3,Jake,Milner,24,Doouglas
4,Amy,Cooze,73,Boston


In [51]:
df.drop("age", axis = 1) # axis=1 drops along the column axis
                         # inplace=True would modify the original frame instead of returning a new one

Unnamed: 0,first_name,last_name,city
0,Jason,Miller,San Francisco
1,Moly,Jacobson,Baltimore
2,Tina,Ali,Miami
3,Jake,Milner,Doouglas
4,Amy,Cooze,Boston


### Series Operation

In [61]:
# s1 carries the labels a, b, c, e, d; s2 carries b, c, e, d, e, f.
# The label "e" occurs twice in s2, which is why label-aligned arithmetic
# between the two produces two "e" rows.
s1 = Series(data=range(1, 6), index=["a", "b", "c", "e", "d"])
s2 = Series(data=range(5, 11), index=["b", "c", "e", "d", "e", "f"])

In [62]:
s1.add(s2) # same as s1 + s2
           # s1.add(s2, fill_value = 0): a label missing from one Series is treated as 0 instead of yielding NaN

a     NaN
b     7.0
c     9.0
d    13.0
e    11.0
e    13.0
f     NaN
dtype: float64

### Lambda & map
- Pandas의 series type의 데이터에도 map 함수 사용 가능
- function 대신 dict, sequence형 자료 등으로 대체 가능

In [64]:
s1 = Series(np.arange(10))
s1.map(lambda x: x**2).head(5)

0     0
1     1
2     4
3     9
4    16
dtype: int64

In [66]:
df = pd.read_csv("https://raw.githubusercontent.com/rstudio/Intro/master/data/wages.csv")
df.head()

Unnamed: 0,earn,height,sex,race,ed,age
0,79571.299011,73.89,male,white,16,49
1,96396.988643,66.23,female,white,16,62
2,48710.666947,63.77,female,white,16,33
3,80478.096153,63.22,female,other,16,95
4,82089.345498,63.08,female,white,17,43


In [67]:
df.sex.unique()

array(['male', 'female'], dtype=object)

In [71]:
df["sex_code"] =  df.sex.map({"male":0, "female":1})
df.head()

Unnamed: 0,earn,height,sex,race,ed,age,sex_code
0,79571.299011,73.89,male,white,16,49,0
1,96396.988643,66.23,female,white,16,62,1
2,48710.666947,63.77,female,white,16,33,1
3,80478.096153,63.22,female,other,16,95,1
4,82089.345498,63.08,female,white,17,43,1


### Apply for dataframe 
- map과 달리, DataFrame의 각 column(Series) 전체에 해당 함수를 적용
- 함수의 입력값을 Series 단위로 받아 handling 가능

In [69]:
df_info = df[["earn","height","age"]]
df_info.head()

Unnamed: 0,earn,height,age
0,79571.299011,73.89,49
1,96396.988643,66.23,62
2,48710.666947,63.77,33
3,80478.096153,63.22,95
4,82089.345498,63.08,43


In [70]:
f = lambda x: x.max()-x.min()
df_info.apply(f)

earn      318047.708444
height        19.870000
age           73.000000
dtype: float64

### Applymap for dataframe
- series 단위가 아닌 element 단위로 함수를 적용
- 하나의 Series에 apply를 적용할 때(예: `df_info["earn"].apply(f)`)와 같은 효과

In [73]:
# Element-wise function: negate a single value.
f = lambda x: -x
# NOTE(review): DataFrame.applymap is deprecated in pandas >= 2.1 in favour of
# DataFrame.map — confirm against the pandas version this notebook targets.
df_info.applymap(f).head(5)

Unnamed: 0,earn,height,age
0,-79571.299011,-73.89,-49
1,-96396.988643,-66.23,-62
2,-48710.666947,-63.77,-33
3,-80478.096153,-63.22,-95
4,-82089.345498,-63.08,-43


In [74]:
df_info["earn"].apply(f).head(5)

0   -79571.299011
1   -96396.988643
2   -48710.666947
3   -80478.096153
4   -82089.345498
Name: earn, dtype: float64

### Describe
- Numeric type 데이터의 요약 정보를 보여줌

In [75]:
df.describe()

Unnamed: 0,earn,height,ed,age,sex_code
count,1379.0,1379.0,1379.0,1379.0,1379.0
mean,32446.292622,66.59264,13.354605,45.328499,0.622915
std,31257.070006,3.818108,2.438741,15.789715,0.484832
min,-98.580489,57.34,3.0,22.0,0.0
25%,10538.790721,63.72,12.0,33.0,0.0
50%,26877.870178,66.05,13.0,42.0,1.0
75%,44506.215336,69.315,15.0,55.0,1.0
max,317949.127955,77.21,18.0,95.0,1.0


### Unique
- Series data의 유일한 값들을 배열(numpy array)로 반환

In [76]:
df.race.unique()

array(['white', 'other', 'hispanic', 'black'], dtype=object)

In [77]:
np.array(dict(enumerate(df["race"].unique())))

array({0: 'white', 1: 'other', 2: 'hispanic', 3: 'black'}, dtype=object)

In [80]:
# Derive label names and their integer codes directly from the distinct values,
# instead of building an enumerate -> ndarray round-trip twice and parsing the
# codes back from strings.
key = [str(label) for label in df["race"].unique()]  # label names as plain str
value = list(range(len(key)))                         # matching codes 0..n-1
value, key  # (label codes, label names)

([0, 1, 2, 3], ['white', 'other', 'hispanic', 'black'])