##### Pandas
- 데이터 분석을 위한 사용이 쉽고 성능이 좋은 오픈소스 python 라이브러리
- `$ pip install pandas`
- 크게 두 가지의 데이터 타입
    - Series
        - Index와 Value로 이루어진 데이터 타입
    - DataFrame
        - Index와 Value와 Column으로 이루어진 데이터 타입
        - Column은 Series로 이루어져 있음
        - 엑셀의 테이블 형태로 구성이 되고, Column 별로 같은 데이터 타입을 가짐

In [1]:
import pandas as pd

In [3]:
import numpy as np

##### Series

In [4]:
# 0-9까지 랜덤한 5개의 데이터를 Series 생성
data = pd.Series(np.random.randint(10, size=(5)))
data
# 왼쪽이 Index, 오른쪽이 Value. 하나의 Series(Column)는 같은 데이터 타입을 가짐

0    2
1    2
2    4
3    3
4    4
dtype: int32

In [6]:
# index 설정
data = pd.Series(np.random.randint(10, size=5), index=["A", "B", "C", "D", "E"])
data

A    4
B    2
C    3
D    3
E    4
dtype: int32

In [7]:
data.index, data.values

(Index(['A', 'B', 'C', 'D', 'E'], dtype='object'), array([4, 2, 3, 3, 4]))

In [8]:
# value 값 확인
data.A, data.D
#숫자로는 x. data.1하면 invalid syntax라고 나옴

(4, 3)

In [9]:
# series에 이름과 인덱스에 이름을 설정할 수 있음
data.name = "random_number"
data.index.name = "index_number"
data

index_number
A    4
B    2
C    3
D    3
E    4
Name: random_number, dtype: int32

In [10]:
data = pd.Series(np.random.randint(10, size=5), index=["A", "B", "C", "D", "E"])
data

A    8
B    4
C    2
D    4
E    1
dtype: int32

In [11]:
data*10

A    80
B    40
C    20
D    40
E    10
dtype: int32

In [12]:
data[["B","C","D"]]

B    4
C    2
D    4
dtype: int32

In [13]:
data[1]

4

In [14]:
data[1:]

B    4
C    2
D    4
E    1
dtype: int32

In [15]:
data[1::2]

B    4
D    4
dtype: int32

In [16]:
data[::-1]

E    1
D    4
C    2
B    4
A    8
dtype: int32

In [19]:
data > 5

A     True
B    False
C    False
D    False
E    False
dtype: bool

In [20]:
data[data > 5]

A    8
dtype: int32

In [23]:
# for문 사용 - list comprehension으로도 사용이 가능
# [(idx, val) for idx, val in data.items()]
data
for idx, val in data.items():
    print(idx, val)

A 8
B 4
C 2
D 4
E 1


In [25]:
# dictionary 데이터 타입의 데이터로 series 생성 가능
dic = {"D":3, "E":5, "F":7}
data2 = pd.Series(dic) #앞에 있는 키값으로 자동 sorting 됨
data2

D    3
E    5
F    7
dtype: int64

In [26]:
data

A    8
B    4
C    2
D    4
E    1
dtype: int32

In [27]:
data2

D    3
E    5
F    7
dtype: int64

In [29]:
result = data + data2
result
# 한쪽에만 있는 인덱스는 더할 수 없어서 NaN이 됨. NaN이 포함되면 datatype이 float 형태로 바뀜

A    NaN
B    NaN
C    NaN
D    7.0
E    6.0
F    NaN
dtype: float64

In [30]:
# NaN 데이터 제거
result.notnull()
# 데이터가 있는 경우에는 True, 반대의 경우에는 False

A    False
B    False
C    False
D     True
E     True
F    False
dtype: bool

In [31]:
print(result.notnull())
result[result.notnull()]

A    False
B    False
C    False
D     True
E     True
F    False
dtype: bool


D    7.0
E    6.0
dtype: float64

##### Dataframe
- row(index), column(value)으로 이루어져 있음
- make
- insert
    - row
    - column
- append
- concat
- groupby, aggregate
- select
- merge

##### make


In [33]:
# 컬럼을 만들고 
df = pd.DataFrame(columns=["Email", "Name"])
df

Unnamed: 0,Email,Name


In [34]:
#컬럼에 리스트 데이터를 추가해서 만드는 방법
df["Name"] = ["fcamp", "dss"]
df["Email"] = ["fcamp@gmail.com", "dss@gmail.com"]
df

Unnamed: 0,Email,Name
0,fcamp@gmail.com,fcamp
1,dss@gmail.com,dss


In [35]:
df["Name"] # Series임. Series 여러 개가 모여서 데이터 프레임을 이루고 있는 것

0    fcamp
1      dss
Name: Name, dtype: object

In [36]:
df["Email"]

0    fcamp@gmail.com
1      dss@gmail.com
Name: Email, dtype: object

In [38]:
# 딕셔너리 데이터 타입을 Dataframe으로 만들기
name = ["fcamp", "dss"]
email = ["fcamp@gmail.com", "dss@gmail.com"]
dic = {"Name":name, "Email":email}
df = pd.DataFrame(dic)
df

Unnamed: 0,Email,Name
0,fcamp@gmail.com,fcamp
1,dss@gmail.com,dss


In [41]:
# 인덱스를 추가해서 만들기
index_list = ["one", "two"]
df = pd.DataFrame(dic, index=index_list)
df

Unnamed: 0,Email,Name
one,fcamp@gmail.com,fcamp
two,dss@gmail.com,dss


In [42]:
df.index, df.columns, df.values

(Index(['one', 'two'], dtype='object'),
 Index(['Email', 'Name'], dtype='object'),
 array([['fcamp@gmail.com', 'fcamp'],
        ['dss@gmail.com', 'dss']], dtype=object))

##### Insert
- row
- column

In [6]:
name = ["fcamp", "dss"]
email = ["fcamp@gmail.com", "dss@gmail.com"]
dic = {"Name":name, "Email":email}
df = pd.DataFrame(dic)
df

Unnamed: 0,Email,Name
0,fcamp@gmail.com,fcamp
1,dss@gmail.com,dss


In [7]:
df.loc[0]

Email    fcamp@gmail.com
Name               fcamp
Name: 0, dtype: object

In [8]:
df.loc[1]

Email    dss@gmail.com
Name               dss
Name: 1, dtype: object

In [9]:
# loc 지정해서 데이터를 넣는 방법
df.loc[2] = {"Email":"data@gmail.com", "Name":"data"}
df

Unnamed: 0,Email,Name
0,fcamp@gmail.com,fcamp
1,dss@gmail.com,dss
2,data@gmail.com,data


In [10]:
len(df)

3

In [11]:
# loc 이용해서 항상 가장 마지막에 넣는 방법
print(len(df))
df.loc[len(df)] = {"Email":"data@gmail.com", "Name":"data"}
df

3


Unnamed: 0,Email,Name
0,fcamp@gmail.com,fcamp
1,dss@gmail.com,dss
2,data@gmail.com,data
3,data@gmail.com,data


In [12]:
# column
df["Address"] = ""
df

Unnamed: 0,Email,Name,Address
0,fcamp@gmail.com,fcamp,
1,dss@gmail.com,dss,
2,data@gmail.com,data,
3,data@gmail.com,data,


In [13]:
df["Address"] = "5"
df

Unnamed: 0,Email,Name,Address
0,fcamp@gmail.com,fcamp,5
1,dss@gmail.com,dss,5
2,data@gmail.com,data,5
3,data@gmail.com,data,5


In [14]:
df["Address"] = ["Seoul", "Busan", "Jeju", "Daegu"] #row와 데이터 갯수가 맞지 않으면 에러남
df

Unnamed: 0,Email,Name,Address
0,fcamp@gmail.com,fcamp,Seoul
1,dss@gmail.com,dss,Busan
2,data@gmail.com,data,Jeju
3,data@gmail.com,data,Daegu


##### apply

In [15]:
# apply
# 함수를 사용해서 함수의 리턴값이 데이터로 들어감
def count_char(name):
    """Return *name* followed by its length in parentheses, e.g. ``"dss(3)"``.

    Args:
        name: Any sized value; it is rendered with ``str.format``.

    Returns:
        A string of the form ``"<name>(<len(name)>)"``.
    """
    size = len(name)
    labelled = "{}({})".format(name, size)
    return labelled
#name의 len을 체크해서...
df["Name_Count"] = df["Name"].apply(count_char)
df

Unnamed: 0,Email,Name,Address,Name_Count
0,fcamp@gmail.com,fcamp,Seoul,fcamp(5)
1,dss@gmail.com,dss,Busan,dss(3)
2,data@gmail.com,data,Jeju,data(4)
3,data@gmail.com,data,Daegu,data(4)


In [16]:
df["Address_Count"] = df["Address"].apply(lambda addr:"{}({})".format(addr, len(addr)))
df

Unnamed: 0,Email,Name,Address,Name_Count,Address_Count
0,fcamp@gmail.com,fcamp,Seoul,fcamp(5),Seoul(5)
1,dss@gmail.com,dss,Busan,dss(3),Busan(5)
2,data@gmail.com,data,Jeju,data(4),Jeju(4)
3,data@gmail.com,data,Daegu,data(4),Daegu(5)


##### append

In [87]:
# append
# 사람의 이름과 나이가 들어간 데이터를 만들기
import random, string

def get_name():
    """Return one first name chosen at random from a fixed pool of ten names."""
    candidates = [
        "Adam", "Alan", "Alex", "Alvin", "Andrew",
        "Anthony", "Arnold", "Jin", "Billy", "Anchal",
    ]
    picked = random.choice(candidates)
    return picked

get_name()

'Alvin'

In [90]:
def get_age(start=20, end=40):
    """Return a random integer age between *start* and *end*, both inclusive.

    Args:
        start: Lowest age that may be returned.
        end: Highest age that may be returned.
    """
    age = random.randint(start, end)
    return age

get_age()

27

In [91]:
# list
def make_data(rows=10):
    """Build a list of *rows* random person records.

    Each record is a dict with an ``"Age"`` taken from ``get_age()`` and a
    ``"Name"`` taken from ``get_name()``; the age is drawn before the name.

    Args:
        rows: Number of records to generate.

    Returns:
        A list of ``{"Age": ..., "Name": ...}`` dicts.
    """
    return [{"Age": get_age(), "Name": get_name()} for _ in range(rows)]

make_data()

[{'Age': 33, 'Name': 'Alex'},
 {'Age': 21, 'Name': 'Anchal'},
 {'Age': 33, 'Name': 'Anthony'},
 {'Age': 24, 'Name': 'Alex'},
 {'Age': 21, 'Name': 'Jin'},
 {'Age': 34, 'Name': 'Adam'},
 {'Age': 35, 'Name': 'Alan'},
 {'Age': 30, 'Name': 'Andrew'},
 {'Age': 28, 'Name': 'Jin'},
 {'Age': 26, 'Name': 'Alvin'}]

In [92]:
df1 = pd.DataFrame(make_data())
df1

Unnamed: 0,Age,Name
0,33,Alvin
1,37,Andrew
2,22,Adam
3,30,Alvin
4,37,Alvin
5,34,Alex
6,30,Jin
7,37,Anthony
8,28,Anchal
9,39,Alex


In [93]:
data2 = make_data()
df2 = pd.DataFrame(make_data())
df2

Unnamed: 0,Age,Name
0,24,Jin
1,31,Jin
2,24,Andrew
3,30,Andrew
4,29,Jin
5,22,Alex
6,23,Arnold
7,37,Billy
8,27,Andrew
9,37,Alex


In [95]:
#df1과 df2를 합치고 싶을 때 append를 이용할 수 있음 (DataFrame.append는 pandas 1.4에서 deprecated, 2.0에서 제거되었으므로 최신 버전에서는 pd.concat을 사용)
df3 = df1.append(df2)
df3

Unnamed: 0,Age,Name
0,33,Alvin
1,37,Andrew
2,22,Adam
3,30,Alvin
4,37,Alvin
5,34,Alex
6,30,Jin
7,37,Anthony
8,28,Anchal
9,39,Alex


In [96]:
# index 리셋하기
df3.reset_index()

Unnamed: 0,index,Age,Name
0,0,33,Alvin
1,1,37,Andrew
2,2,22,Adam
3,3,30,Alvin
4,4,37,Alvin
5,5,34,Alex
6,6,30,Jin
7,7,37,Anthony
8,8,28,Anchal
9,9,39,Alex


In [98]:
df4 = df3.reset_index(drop=True)
df4

Unnamed: 0,Age,Name
0,33,Alvin
1,37,Andrew
2,22,Adam
3,30,Alvin
4,37,Alvin
5,34,Alex
6,30,Jin
7,37,Anthony
8,28,Anchal
9,39,Alex


In [100]:
#drop=True - 새롭게 생성되는 인덱스 컬럼을 삭제
#inplace=True - 함수를 사용하는 객체 자체의 인덱스를 리셋함
# + inplace=True는 수정된 데이터가 해당 변수에 바로 적용된다
# inplace=True를 사용하지 않으면 결과 데이터를 받아서 저장해야함
df3.reset_index(drop=True, inplace=True)
df3

Unnamed: 0,Age,Name
0,33,Alvin
1,37,Andrew
2,22,Adam
3,30,Alvin
4,37,Alvin
5,34,Alex
6,30,Jin
7,37,Anthony
8,28,Anchal
9,39,Alex


In [101]:
# append를 할 때 인덱스를 리셋
df3 = df1.append(df2, ignore_index=True)
df3

Unnamed: 0,Age,Name
0,33,Alvin
1,37,Andrew
2,22,Adam
3,30,Alvin
4,37,Alvin
5,34,Alex
6,30,Jin
7,37,Anthony
8,28,Anchal
9,39,Alex


##### concat
- rows
- columns

In [102]:
# concat rows
df1

Unnamed: 0,Age,Name
0,33,Alvin
1,37,Andrew
2,22,Adam
3,30,Alvin
4,37,Alvin
5,34,Alex
6,30,Jin
7,37,Anthony
8,28,Anchal
9,39,Alex


In [103]:
df2

Unnamed: 0,Age,Name
0,24,Jin
1,31,Jin
2,24,Andrew
3,30,Andrew
4,29,Jin
5,22,Alex
6,23,Arnold
7,37,Billy
8,27,Andrew
9,37,Alex


In [105]:
df3 = pd.concat([df1,df2])
df3

Unnamed: 0,Age,Name
0,33,Alvin
1,37,Andrew
2,22,Adam
3,30,Alvin
4,37,Alvin
5,34,Alex
6,30,Jin
7,37,Anthony
8,28,Anchal
9,39,Alex


In [106]:
df3 = pd.concat([df1,df2]).reset_index(drop=True) # 체이닝 가능
df3

Unnamed: 0,Age,Name
0,33,Alvin
1,37,Andrew
2,22,Adam
3,30,Alvin
4,37,Alvin
5,34,Alex
6,30,Jin
7,37,Anthony
8,28,Anchal
9,39,Alex


In [107]:
# concat columns
# axis
pd.concat([df3, df1], axis=1)

Unnamed: 0,Age,Name,Age.1,Name.1
0,33,Alvin,33.0,Alvin
1,37,Andrew,37.0,Andrew
2,22,Adam,22.0,Adam
3,30,Alvin,30.0,Alvin
4,37,Alvin,37.0,Alvin
5,34,Alex,34.0,Alex
6,30,Jin,30.0,Jin
7,37,Anthony,37.0,Anthony
8,28,Anchal,28.0,Anchal
9,39,Alex,39.0,Alex


In [108]:
df4 = pd.concat([df3, df1], axis=1, join='inner')
df4

Unnamed: 0,Age,Name,Age.1,Name.1
0,33,Alvin,33,Alvin
1,37,Andrew,37,Andrew
2,22,Adam,22,Adam
3,30,Alvin,30,Alvin
4,37,Alvin,37,Alvin
5,34,Alex,34,Alex
6,30,Jin,30,Jin
7,37,Anthony,37,Anthony
8,28,Anchal,28,Anchal
9,39,Alex,39,Alex


##### Group by
- 이름별 평균 나이를 나타내는 데이터 프레임을 만들 거

In [109]:
# 20명에 대한 이름과 나이를 나타내는 데이터 프레임을 만들어
g_df = pd.DataFrame(make_data(20))
g_df

Unnamed: 0,Age,Name
0,28,Anchal
1,33,Alvin
2,20,Andrew
3,23,Arnold
4,35,Anchal
5,28,Anthony
6,22,Alex
7,35,Anchal
8,25,Anthony
9,30,Anchal


In [110]:
g_df.tail()

Unnamed: 0,Age,Name
15,24,Andrew
16,26,Alvin
17,36,Arnold
18,31,Andrew
19,34,Billy


In [111]:
# 이름을 unique로 출력
g_df["Name"]

0      Anchal
1       Alvin
2      Andrew
3      Arnold
4      Anchal
5     Anthony
6        Alex
7      Anchal
8     Anthony
9      Anchal
10      Billy
11      Billy
12     Anchal
13      Billy
14       Alex
15     Andrew
16      Alvin
17     Arnold
18     Andrew
19      Billy
Name: Name, dtype: object

In [112]:
g_df["Name"].values

array(['Anchal', 'Alvin', 'Andrew', 'Arnold', 'Anchal', 'Anthony', 'Alex',
       'Anchal', 'Anthony', 'Anchal', 'Billy', 'Billy', 'Anchal', 'Billy',
       'Alex', 'Andrew', 'Alvin', 'Arnold', 'Andrew', 'Billy'],
      dtype=object)

In [113]:
list(set(g_df["Name"].values))

['Alvin', 'Anthony', 'Andrew', 'Anchal', 'Arnold', 'Alex', 'Billy']

In [114]:
result1 = np.array(list(set(g_df["Name"].values)))
len(result1), result1

(7, array(['Alvin', 'Anthony', 'Andrew', 'Anchal', 'Arnold', 'Alex', 'Billy'],
       dtype='<U7'))

In [118]:
# pandas의 unique를 이용하여 유니크한 이름을 출력
result2 = g_df["Name"].unique()
len(result2), result2

(7, array(['Anchal', 'Alvin', 'Andrew', 'Arnold', 'Anthony', 'Alex', 'Billy'],
       dtype=object))

In [121]:
# group by(column명)
g_df.groupby("Name").size()

Name
Alex       2
Alvin      2
Anchal     5
Andrew     3
Anthony    2
Arnold     2
Billy      4
dtype: int64

In [122]:
result_df = g_df.groupby("Name").size().reset_index(name="counts")
result_df

Unnamed: 0,Name,counts
0,Alex,2
1,Alvin,2
2,Anchal,5
3,Andrew,3
4,Anthony,2
5,Arnold,2
6,Billy,4


In [128]:
# sort values
result_df.sort_values(by=["counts"], ascending=False) #descending 없어서..

Unnamed: 0,Name,counts
2,Anchal,5
6,Billy,4
3,Andrew,3
0,Alex,2
1,Alvin,2
4,Anthony,2
5,Arnold,2


In [131]:
result_df.reset_index(drop=True, inplace=True)
result_df
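# sort_values는 정렬된 새 DataFrame을 반환할 뿐 result_df 자체는 바꾸지 않음.
# 정렬 결과를 다시 대입하지 않았기 때문에 reset_index 후에도 이름순 그대로임
# (result_df = result_df.sort_values(...) 로 대입하거나 inplace=True 를 사용해야 함)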

Unnamed: 0,Name,counts
0,Alex,2
1,Alvin,2
2,Anchal,5
3,Andrew,3
4,Anthony,2
5,Arnold,2
6,Billy,4


In [132]:
# agg: min
# 나이가 제일 어린 나이로 name 그루핑합니다.
g_df.groupby("Name").agg("min")

Unnamed: 0_level_0,Age
Name,Unnamed: 1_level_1
Alex,22
Alvin,26
Anchal,28
Andrew,20
Anthony,25
Arnold,23
Billy,28


In [133]:
g_df.groupby("Name").agg("min").reset_index()

Unnamed: 0,Name,Age
0,Alex,22
1,Alvin,26
2,Anchal,28
3,Andrew,20
4,Anthony,25
5,Arnold,23
6,Billy,28


In [134]:
# 가장 나이가 많은 이름으로 그룹핑
g_df.groupby("Name").agg("max").reset_index()

Unnamed: 0,Name,Age
0,Alex,32
1,Alvin,33
2,Anchal,35
3,Andrew,31
4,Anthony,28
5,Arnold,36
6,Billy,37


In [135]:
# agg: mean
g_df.groupby("Name").agg("mean").reset_index()

Unnamed: 0,Name,Age
0,Alex,27.0
1,Alvin,29.5
2,Anchal,32.4
3,Andrew,25.0
4,Anthony,26.5
5,Arnold,29.5
6,Billy,33.0


In [137]:
g_df.groupby("Name").agg("sum").reset_index()

Unnamed: 0,Name,Age
0,Alex,54
1,Alvin,59
2,Anchal,162
3,Andrew,75
4,Anthony,53
5,Arnold,59
6,Billy,132


In [138]:
g_df.groupby("Name").agg("median").reset_index()

Unnamed: 0,Name,Age
0,Alex,27.0
1,Alvin,29.5
2,Anchal,34.0
3,Andrew,24.0
4,Anthony,26.5
5,Arnold,29.5
6,Billy,33.5


In [140]:
# agg로 여러 개 컬럼 생성
df = g_df.groupby("Name").agg(["min","max","mean"]).reset_index()
df

Unnamed: 0_level_0,Name,Age,Age,Age
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean
0,Alex,22,32,27.0
1,Alvin,26,33,29.5
2,Anchal,28,35,32.4
3,Andrew,20,31,25.0
4,Anthony,25,28,26.5
5,Arnold,23,36,29.5
6,Billy,28,37,33.0


In [None]:
# select

In [141]:
df.head()

Unnamed: 0_level_0,Name,Age,Age,Age
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean
0,Alex,22,32,27.0
1,Alvin,26,33,29.5
2,Anchal,28,35,32.4
3,Andrew,20,31,25.0
4,Anthony,25,28,26.5


In [143]:
df.tail(3)

Unnamed: 0_level_0,Name,Age,Age,Age
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean
4,Anthony,25,28,26.5
5,Arnold,23,36,29.5
6,Billy,28,37,33.0


In [144]:
df.tail(n=7)

Unnamed: 0_level_0,Name,Age,Age,Age
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean
0,Alex,22,32,27.0
1,Alvin,26,33,29.5
2,Anchal,28,35,32.4
3,Andrew,20,31,25.0
4,Anthony,25,28,26.5
5,Arnold,23,36,29.5
6,Billy,28,37,33.0


In [145]:
df[3:6]

Unnamed: 0_level_0,Name,Age,Age,Age
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean
3,Andrew,20,31,25.0
4,Anthony,25,28,26.5
5,Arnold,23,36,29.5


In [146]:
df[3:]

Unnamed: 0_level_0,Name,Age,Age,Age
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean
3,Andrew,20,31,25.0
4,Anthony,25,28,26.5
5,Arnold,23,36,29.5
6,Billy,28,37,33.0


In [None]:
# 행은 슬라이스(df[3:6], df[3:])로 선택해야 함. df[3]은 이름이 3인 컬럼을 찾기 때문에 에러남 (행 하나는 df.loc[3] 사용)

In [150]:
df

Unnamed: 0_level_0,Name,Age,Age,Age
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean
0,Alex,22,32,27.0
1,Alvin,26,33,29.5
2,Anchal,28,35,32.4
3,Andrew,20,31,25.0
4,Anthony,25,28,26.5
5,Arnold,23,36,29.5
6,Billy,28,37,33.0


In [147]:
df.loc[3]

Name          Andrew
Age   min         20
      max         31
      mean        25
Name: 3, dtype: object

In [148]:
df.loc[2]["Age"]["min"]

28

In [149]:
df.loc[3]["Name"][""]

'Andrew'

In [152]:
data = {
    "Name":df["Name"],
    "Min":df["Age"]["min"],
    "Max":df["Age"]["max"],
    "Mean":df["Age"]["mean"],
    
}
n_df = pd.DataFrame(data)
n_df
# Name의 순서는 오름차순으로 정렬됨. 순서는 보장 안 됨

Unnamed: 0,Max,Mean,Min,Name
0,32,27.0,22,Alex
1,33,29.5,26,Alvin
2,35,32.4,28,Anchal
3,31,25.0,20,Andrew
4,28,26.5,25,Anthony
5,36,29.5,23,Arnold
6,37,33.0,28,Billy


In [153]:
n_df[n_df["Mean"]>30]

Unnamed: 0,Max,Mean,Min,Name
2,35,32.4,28,Anchal
6,37,33.0,28,Billy


In [155]:
n_df[n_df["Mean"]>30].sort_values(by=["Mean"], ascending=False)

Unnamed: 0,Max,Mean,Min,Name
6,37,33.0,28,Billy
2,35,32.4,28,Anchal


In [156]:
# 평균나이가 30세 이상인 데이터를 내림차순으로 정렬하고 인덱스를 재설정
n_df[n_df["Mean"]>30].sort_values(by=["Mean"], ascending=False).reset_index(drop=True)

Unnamed: 0,Max,Mean,Min,Name
0,37,33.0,28,Billy
1,35,32.4,28,Anchal


In [157]:
g_df

Unnamed: 0,Age,Name
0,28,Anchal
1,33,Alvin
2,20,Andrew
3,23,Arnold
4,35,Anchal
5,28,Anthony
6,22,Alex
7,35,Anchal
8,25,Anthony
9,30,Anchal


In [159]:
list(g_df.groupby("Name").size())

[2, 2, 5, 3, 2, 2, 4]

In [160]:
n_df["Count"] = list(g_df.groupby("Name").size())
n_df

Unnamed: 0,Max,Mean,Min,Name,Count
0,32,27.0,22,Alex,2
1,33,29.5,26,Alvin,2
2,35,32.4,28,Anchal,5
3,31,25.0,20,Andrew,3
4,28,26.5,25,Anthony,2
5,36,29.5,23,Arnold,2
6,37,33.0,28,Billy,4


In [161]:
# drop - mean 데이터를 가장 뒤로 이동시키겠어
mean = n_df["Mean"] #옮기고자하는 데이터를 어떤 변수에 저장하고 삭제했다가 맨뒤에 column으로 붙이기
mean

0    27.0
1    29.5
2    32.4
3    25.0
4    26.5
5    29.5
6    33.0
Name: Mean, dtype: float64

In [162]:
n_df.drop("Mean", axis=1, inplace=True) 
n_df

Unnamed: 0,Max,Min,Name,Count
0,32,22,Alex,2
1,33,26,Alvin,2
2,35,28,Anchal,5
3,31,20,Andrew,3
4,28,25,Anthony,2
5,36,23,Arnold,2
6,37,28,Billy,4


In [164]:
n_df["Mean"] = mean

In [166]:
n_df

Unnamed: 0,Max,Min,Name,Count,Mean
0,32,22,Alex,2,27.0
1,33,26,Alvin,2,29.5
2,35,28,Anchal,5,32.4
3,31,20,Andrew,3,25.0
4,28,25,Anthony,2,26.5
5,36,23,Arnold,2,29.5
6,37,28,Billy,4,33.0


In [167]:
# rename column
n_df.rename(columns={"Max":"Maximum","Name":"Unique_Name"})

Unnamed: 0,Maximum,Min,Unique_Name,Count,Mean
0,32,22,Alex,2,27.0
1,33,26,Alvin,2,29.5
2,35,28,Anchal,5,32.4
3,31,20,Andrew,3,25.0
4,28,25,Anthony,2,26.5
5,36,23,Arnold,2,29.5
6,37,28,Billy,4,33.0


##### Merge - sql에서의 join과 같은 거
- user_df = 아이디, 이름, 나이 데이터 프레임 생성
- money_df = 아이디, 돈 데이터 프레임을 생성

In [169]:
# Build user_df with one row per user: UserID 1..8, a unique Name, and an Age.
user_df = pd.DataFrame(columns=["UserID", "Name", "Age"])
for idx in range(1,9):
    name = get_name()
    
    # Re-draw until the name is not already present in user_df
    while name in list(user_df["Name"]):
        name = get_name()
        
    # Insert the new row into user_df at the next integer label
    data = {"Name": name, "UserID": idx, "Age":get_age()}
    user_df.loc[len(user_df)] = data
    
user_df

Unnamed: 0,UserID,Name,Age
0,1,Alan,22
1,2,Jin,30
2,3,Adam,39
3,4,Billy,32
4,5,Alex,39
5,6,Arnold,25
6,7,Andrew,23
7,8,Alvin,31


In [171]:
# Build money_df with 15 rows, each pairing a random ID with a random Money amount.
money_df = pd.DataFrame(columns=["ID", "Money"])

for idx in range(15):
    # Money is a multiple of 1000 between 1000 and 20000
    money = random.randint(1,20) * 1000
    data = {"Money": money, "ID":random.randint(1,8)} # randint includes the upper bound, so ID is in 1..8
    money_df.loc[len(money_df)] = data

money_df

Unnamed: 0,ID,Money
0,1,8000
1,7,7000
2,1,19000
3,5,14000
4,5,11000
5,5,3000
6,6,7000
7,7,3000
8,7,4000
9,1,14000


In [179]:
# merge - user_df, money_df - key:ID, UserID
# money 데이터 기준으로 merge
money_df.merge(user_df, left_on="ID", right_on="Money")
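# KeyError 원인: right_on에는 오른쪽 프레임(user_df)의 키 컬럼을 써야 하는데 'Money'는 money_df의 컬럼임.
# right_on="UserID" (컬럼명을 ID로 rename한 뒤라면 right_on="ID")로 지정해야 함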

KeyError: 'Money'

In [174]:
# user 데이터 기준으로 merge
user_df.merge(money_df, left_on="ID", right_on="UserID")
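# KeyError 원인: left_on은 왼쪽 프레임(user_df), right_on은 오른쪽 프레임(money_df)의 컬럼명이어야 함.
# left_on="UserID", right_on="ID" 로 지정해야 함 (키가 서로 뒤바뀌어 있음)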

KeyError: 'UserID'

In [175]:
user_df.rename(columns={"UserID":"ID"},inplace=True)
user_df

Unnamed: 0,ID,Name,Age
0,1,Alan,22
1,2,Jin,30
2,3,Adam,39
3,4,Billy,32
4,5,Alex,39
5,6,Arnold,25
6,7,Andrew,23
7,8,Alvin,31


In [176]:
result_df = pd.merge(money_df,user_df)
result_df

Unnamed: 0,ID,Money,Name,Age
0,1,8000,Alan,22
1,1,19000,Alan,22
2,1,14000,Alan,22
3,7,7000,Andrew,23
4,7,3000,Andrew,23
5,7,4000,Andrew,23
6,7,8000,Andrew,23
7,5,14000,Alex,39
8,5,11000,Alex,39
9,5,3000,Alex,39


In [178]:
money_list = result_df.groupby("Name").sum()["Money"].reset_index()
money_list # 왜 줄음? 특정사람에 대한 money 정보가 없어서 그럼. inner join! 중복된 것만 뽑아내고 안 겹치는 건 빼 버림

Unnamed: 0,Name,Money
0,Alan,41000
1,Alex,34000
2,Alvin,15000
3,Andrew,22000
4,Arnold,11000
5,Billy,17000


In [181]:
result_df[result_df["Age"] == 23]

Unnamed: 0,ID,Money,Name,Age
3,7,7000,Andrew,23
4,7,3000,Andrew,23
5,7,4000,Andrew,23
6,7,8000,Andrew,23


In [182]:
money_list

Unnamed: 0,Name,Money
0,Alan,41000
1,Alex,34000
2,Alvin,15000
3,Andrew,22000
4,Arnold,11000
5,Billy,17000


In [183]:
# merge - outer
result = pd.merge(user_df, money_list, how='outer')
result

Unnamed: 0,ID,Name,Age,Money
0,1,Alan,22,41000.0
1,2,Jin,30,
2,3,Adam,39,
3,4,Billy,32,17000.0
4,5,Alex,39,34000.0
5,6,Arnold,25,11000.0
6,7,Andrew,23,22000.0
7,8,Alvin,31,15000.0


In [185]:
#fillna - NaN을 특정 데이터로 채워줌
result = pd.merge(user_df, money_list, how='outer').fillna(value=0)
result

Unnamed: 0,ID,Name,Age,Money
0,1,Alan,22,41000.0
1,2,Jin,30,0.0
2,3,Adam,39,0.0
3,4,Billy,32,17000.0
4,5,Alex,39,34000.0
5,6,Arnold,25,11000.0
6,7,Andrew,23,22000.0
7,8,Alvin,31,15000.0


In [186]:
# change data type
result["Money"]


0    41000.0
1        0.0
2        0.0
3    17000.0
4    34000.0
5    11000.0
6    22000.0
7    15000.0
Name: Money, dtype: float64

In [187]:
result["Money"].astype("int")

0    41000
1        0
2        0
3    17000
4    34000
5    11000
6    22000
7    15000
Name: Money, dtype: int32

In [189]:
result["Money"] = result["Money"].astype("int")
result

Unnamed: 0,ID,Name,Age,Money
0,1,Alan,22,41000
1,2,Jin,30,0
2,3,Adam,39,0
3,4,Billy,32,17000
4,5,Alex,39,34000
5,6,Arnold,25,11000
6,7,Andrew,23,22000
7,8,Alvin,31,15000


##### Dataframe Input / Output
- csv, excel
- `$ pip3 install xlrd`
- `$ pip3 install openpyxl`

In [191]:
result

Unnamed: 0,ID,Name,Age,Money
0,1,Alan,22,41000
1,2,Jin,30,0
2,3,Adam,39,0
3,4,Billy,32,17000
4,5,Alex,39,34000
5,6,Arnold,25,11000
6,7,Andrew,23,22000
7,8,Alvin,31,15000


In [192]:
# save csv
result.to_csv('foo.csv', index=False) #index를 같이 저장하고 싶으면 True로 저장하면 됨. 만약 날짜 데이터가 index값이면 index값 True로 해서 저장해야지

In [17]:
# load csv
df = pd.read_csv('foo.csv')
df

Unnamed: 0,ID,Name,Age,Money
0,1,Alan,22,41000
1,2,Jin,30,0
2,3,Adam,39,0
3,4,Billy,32,17000
4,5,Alex,39,34000
5,6,Arnold,25,11000
6,7,Andrew,23,22000
7,8,Alvin,31,15000


In [18]:
!pwd

/c/Users/Administrator/Desktop/데이터 사이언스 스쿨/dss8


In [19]:
# excel 은 저장 되는 인코딩 타입을 확인해야 합니다.(utf-8을 사용하지 않습니다.)
df.to_excel('../ttt/foo.xlsx', sheet_name='Sheet1')

FileNotFoundError: [Errno 2] No such file or directory: '../ttt/foo.xlsx'

In [None]:
path = "/Users/rada/Documents/fastcampus/dss8/02_numpy_pandas/B/test/"
df.to_excel(path+'foo.xlsx', sheet_name='Sheet1')

In [20]:
# load excel
df = pd.read_excel('foo.xlsx', 'Sheet1')
df

Unnamed: 0,ID,Name,Age,Money
0,1,Alan,22,41000
1,2,Jin,30,0
2,3,Adam,39,0
3,4,Billy,32,17000
4,5,Alex,39,34000
5,6,Arnold,25,11000
6,7,Andrew,23,22000
7,8,Alvin,31,15000
