# Pandas

In [1]:
import pandas as pd

In [3]:
# Small sample list used as input data for a Series.
list_tmp = [1, 2, 3]

In [4]:
list_tmp

[1, 2, 3]

## Series

In [5]:
# Build a Series from the list via the public pd.Series constructor
# (pd.core.* is an internal module path and not part of the public API).
s1 = pd.Series(list_tmp)

In [6]:
# Build a Series of strings via the public pd.Series constructor
# (pd.core.* is an internal module path and not part of the public API).
s2 = pd.Series(['one', 'two', 'three'])

In [7]:
# Combine the two Series into a DataFrame; each dict key becomes a column label.
pd.DataFrame({'num': s1, 'word': s2})

Unnamed: 0,num,word
0,1,one
1,2,two
2,3,three


## File I/O

In [8]:
# Check the installed pandas version (IPython shell escape: lists the conda packages and greps for pandas)
!conda list | grep pandas

pandas                    1.1.3            py38hb1e8313_0  


In [9]:
import pandas as pd

In [18]:
# Load the student data from a comma-separated file; the first line supplies the column names.
df = pd.read_csv('data/student.csv')

In [19]:
df

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager
5,Chris,25,intern


In [20]:
# Show the first n rows (default n = 5)
df.head()

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager


In [21]:
# Show the last n rows (default n = 5)
df.tail()

Unnamed: 0,name,age,job
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager
5,Chris,25,intern


In [28]:
# Files with other extensions (here .txt) can be loaded the same way
df_txt = pd.read_csv('data/student2.txt')
df_txt

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager
5,Chris,25,intern


In [29]:
# Load data whose fields are separated by a different delimiter (a tab in this file)
df3 = pd.read_csv('data/student3.txt', delimiter = '\t')
df3

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager
5,Chris,25,intern


In [30]:
# When the first line of the data holds no header information:
# with the default settings the first data row is taken as the column names
df4 = pd.read_csv('data/student4.txt')
df4

Unnamed: 0,John,20,student
0,Jenny,30,developer
1,Nate,30,teacher
2,Julia,40,dentist
3,Brian,45,manager
4,Chris,25,intern


In [31]:
# header=None keeps the first line as data; the columns are labelled 0, 1, 2
df4 = pd.read_csv('data/student4.txt', header = None)
df4

Unnamed: 0,0,1,2
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager
5,Chris,25,intern


In [32]:
# Replace the numeric column labels with names after loading
df4.columns = ['name', 'age', 'job']
df4

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager
5,Chris,25,intern


In [34]:
# Supply the column names at read time instead of assigning them afterwards
df4 = pd.read_csv('data/student4.txt', header = None, names = ['name', 'age', 'job'])
df4

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager
5,Chris,25,intern


## DataFrame 생성

In [35]:
import pandas as pd

In [37]:
# Row-oriented sample data: one dict per person.
friend_dict_list = [
    dict(name='John', age=25, job='student'),
    dict(name='Nate', age=30, job='teacher'),
]

In [38]:
# Each dict becomes one row; the dict keys become the column labels
df = pd.DataFrame(friend_dict_list)
df

Unnamed: 0,name,age,job
0,John,25,student
1,Nate,30,teacher


In [39]:
# Select the columns explicitly to pin their order, in case it differs from the order the data was written in
df = df[['name', 'age', 'job']]
df

Unnamed: 0,name,age,job
0,John,25,student
1,Nate,30,teacher


In [40]:
# OrderedDict guarantees the order of its keys
from collections import OrderedDict

In [43]:
# Column-oriented sample data; keys are inserted in the intended column order.
friend_ordered_dict = OrderedDict()
friend_ordered_dict['name'] = ['Jone', 'Nate']
friend_ordered_dict['age'] = [25, 30]
friend_ordered_dict['job'] = ['Student', 'Teacher']

In [44]:
# Column-oriented construction: each key becomes a column and its list supplies the values
df = pd.DataFrame.from_dict(friend_ordered_dict)
df

Unnamed: 0,name,age,job
0,Jone,25,Student
1,Nate,30,Teacher


In [45]:
# Row-oriented sample data: one [name, age, job] list per person.
friend_list = [
    [name, age, job]
    for name, age, job in (
        ('John', 20, 'Student'),
        ('Nate', 30, 'Teacher'),
    )
]

In [46]:
# Column labels to attach to the records
column_name = ['name', 'age', 'job']

In [49]:
# Each inner list becomes one row; columns= supplies the column labels
df = pd.DataFrame.from_records(friend_list, columns = column_name)
df

Unnamed: 0,name,age,job
0,John,20,Student
1,Nate,30,Teacher


In [50]:
# Column-oriented sample data: one [column label, column values] pair per column.
friend_list = [
    [label, values]
    for label, values in (
        ('name', ['John', 'Nate']),
        ('age', [20, 30]),
        ('job', ['Student', 'Teacher']),
    )
]

In [54]:
# DataFrame.from_items(friend_list) was the older spelling; it is not available in
# recent pandas releases (removed in 1.0), so the pairs are turned into a dict first.
df = pd.DataFrame.from_dict(dict(friend_list))
df

Unnamed: 0,name,age,job
0,John,20,Student
1,Nate,30,Teacher


## Write DataFrame To File

In [57]:
# Sample data with a missing value (Jenny has no job) to show how None is written to file.
friends = [
    dict(name='John', age=20, job='student'),
    dict(name='Jenny', age=30, job=None),
    dict(name='Nate', age=30, job='teacher'),
]
# Build the frame and pin the column order in one step.
df = pd.DataFrame(friends)[['name', 'age', 'job']]
df

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,
2,Nate,30,teacher


In [60]:
# Save to CSV files; index=True and header=True are the default values
# None values are written as empty fields by default; na_rep sets the text written instead
# NOTE: the third call writes to data/friends.csv again, replacing the file produced by the first call
df.to_csv('data/friends.csv', index= True, header = True)
df.to_csv('data/friends2.csv', index= False, header = False)
df.to_csv('data/friends.csv', index=True, header=True, na_rep='None')

## Pandas - Select, Filter Rows or Columns

In [62]:
# Column-oriented sample data: one [column label, column values] pair per column.
friend_list = [
    ['name', ['John', 'Jenny', 'Nate']],
    ['age', [20, 30, 30]],
    ['job', ['student', 'developer', 'teacher']],
]
# Turn the pairs into a dict and hand it to the DataFrame constructor.
columns_by_label = dict(friend_list)
df = pd.DataFrame(columns_by_label)
del columns_by_label  # keep the notebook namespace as it was
df

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher


In [63]:
# Slice rows by position: returns the rows at positions 1 and 2
df[1:3]

Unnamed: 0,name,age,job
1,Jenny,30,developer
2,Nate,30,teacher


In [64]:
# Extract specific, non-contiguous rows by index label
df.loc[[0, 2], :]

Unnamed: 0,name,age,job
0,John,20,student
2,Nate,30,teacher


### by column condition

In [65]:
# Keep the rows whose age is above 25, using a boolean mask
df[df['age'] > 25]

Unnamed: 0,name,age,job
1,Jenny,30,developer
2,Nate,30,teacher


In [66]:
# The same condition written as a query string
df.query('age>25')

Unnamed: 0,name,age,job
1,Jenny,30,developer
2,Nate,30,teacher


In [71]:
# Combine two conditions with &; each comparison needs its own parentheses
df[(df['age'] > 25) & (df['name'] == 'Nate')]

Unnamed: 0,name,age,job
2,Nate,30,teacher


### Filter Columns

In [73]:
# Row-oriented sample data: one [name, age, job] list per person
friend_list = [
    ['John', 20, 'student'],
    ['Jenny', 30, 'developer'],
    ['Nate', 30, 'teacher']
]
# No column names are supplied, so the columns are labelled 0, 1, 2
df = pd.DataFrame.from_records(friend_list)
df

Unnamed: 0,0,1,2
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher


In [74]:
# Select columns by position
# df.iloc[row, col]: here every row and the columns at positions 0 and 1
df.iloc[:,0:2]

Unnamed: 0,0,1
0,John,20
1,Jenny,30
2,Nate,30


In [75]:
# Rows at positions 0 and 1, columns at positions 0 and 1
df.iloc[0:2, 0:2]

Unnamed: 0,0,1
0,John,20
1,Jenny,30


In [76]:
# by column name
df = pd.read_csv('data/student4.txt', header=None, names=['name', 'age', 'job'])
df

Unnamed: 0,name,age,job
0,John,20,student
1,Jenny,30,developer
2,Nate,30,teacher
3,Julia,40,dentist
4,Brian,45,manager
5,Chris,25,intern


In [77]:
# Keep only the listed columns
df_filtered = df[['name', 'age']]

In [78]:
df_filtered

Unnamed: 0,name,age
0,John,20
1,Jenny,30
2,Nate,30
3,Julia,40
4,Brian,45
5,Chris,25


In [80]:
# filter(items=...) keeps the columns whose labels are listed
df.filter(items=['age', 'job'])

Unnamed: 0,age,job
0,20,student
1,30,developer
2,30,teacher
3,40,dentist
4,45,manager
5,25,intern


In [81]:
# like='a' keeps the labels that contain 'a' (here name and age)
df.filter(like='a', axis=1)
# axis: direction of the filter; column = 1, row = 0

Unnamed: 0,name,age
0,John,20
1,Jenny,30
2,Nate,30
3,Julia,40
4,Brian,45
5,Chris,25


In [84]:
# regex='b$' keeps the column labels that end in 'b' (here only job)
df.filter(regex='b$', axis=1)

Unnamed: 0,job
0,student
1,developer
2,teacher
3,dentist
4,manager
5,intern


## Drop row or column

In [85]:
# Sample records without a name field; the names are used as the row index instead.
friends = [
    dict(age=15, job='student'),
    dict(age=25, job='developer'),
    dict(age=30, job='teacher'),
]

df = pd.DataFrame(
    data=friends,
    index=['John', 'Jenny', 'Nate'],
    columns=['age', 'job'],
)

In [86]:
df

Unnamed: 0,age,job
John,15,student
Jenny,25,developer
Nate,30,teacher


In [87]:
# drop() returns a new DataFrame; to change the original data, assign the result back to df
df.drop(['John', 'Nate'])

Unnamed: 0,age,job
Jenny,25,developer


In [89]:
# Alternatively, pass inplace=True to drop() to modify df directly
df.drop(['John', 'Nate'], inplace=True)
df

Unnamed: 0,age,job
Jenny,25,developer


In [92]:
# Sample records with a name field; the rows get the default integer index.
friends = [
    dict(name='John', age=15, job='student'),
    dict(name='Ben', age=25, job='developer'),
    dict(name='Jenny', age=30, job='teacher'),
]

df = pd.DataFrame(data=friends, columns=['name', 'age', 'job'])
df

Unnamed: 0,name,age,job
0,John,15,student
1,Ben,25,developer
2,Jenny,30,teacher


In [93]:
# Drop rows by position: df.index[[0, 2]] looks up the labels at positions 0 and 2, which drop() then removes
df.drop(df.index[[0,2]])

Unnamed: 0,name,age,job
1,Ben,25,developer


In [95]:
# "Drop" rows by column value: keep only the rows whose age is above 20
df[df.age > 20]

Unnamed: 0,name,age,job
1,Ben,25,developer
2,Jenny,30,teacher


In [96]:
# Drop a column by label; the result is a new DataFrame and df itself is left as it was
df.drop(columns='age')

Unnamed: 0,name,job
0,John,student
1,Ben,developer
2,Jenny,teacher
