## Pandas

In [1]:
import pandas as pd

## Series 생성 및 속성

In [2]:
# Build a Series from a plain list; no index is given, so pandas supplies
# the default 0..n-1 RangeIndex.
data = [10, 30, 20]
sr = pd.Series(data=data)
sr

Unnamed: 0,0
0,10
1,30
2,20


In [3]:
(sr.index, sr.values)  # two comma-separated expressions are displayed as one tuple

(RangeIndex(start=0, stop=3, step=1), array([10, 30, 20]))

In [4]:
sr.name = 'Score' # assigning to .name renames the Series
sr

Unnamed: 0,Score
0,10
1,30
2,20


In [5]:
# A Series is one-dimensional, so its shape is a one-element tuple.
sr.shape

(3,)

In [6]:
# Same values as before, this time labelled with an explicit string index.
data = [10, 30, 20]
sr = pd.Series(data, index=list('abc'))
sr

Unnamed: 0,0
a,10
b,30
c,20


In [7]:
# A dict converts directly: keys become the index labels, values the data.
data = dict(a=10, b=20, c=20)
pd.Series(data)

Unnamed: 0,0
a,10
b,20
c,20


## DataFrame 생성과 속성



In [8]:
# Column-oriented input: each key is a column name, each list holds that
# column's values. None marks a missing value.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'John'],
    Age=[25, 30, None, 40],
    City=['New York', 'San Francisco', 'Los Angeles', 'Seoul'],
)
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,City
0,Alice,25.0,New York
1,Bob,30.0,San Francisco
2,Charlie,,Los Angeles
3,John,40.0,Seoul


In [9]:
# (number of rows, number of columns)
df.shape

(4, 3)

In [10]:
df.head() # first 5 rows by default; this frame only has 4, so all are shown

Unnamed: 0,Name,Age,City
0,Alice,25.0,New York
1,Bob,30.0,San Francisco
2,Charlie,,Los Angeles
3,John,40.0,Seoul


In [11]:
# Print a summary: index range, per-column non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Name    4 non-null      object 
 1   Age     3 non-null      float64
 2   City    4 non-null      object 
dtypes: float64(1), object(2)
memory usage: 228.0+ bytes


## 필터링 및 슬라이싱

In [12]:
# Show the full frame as the starting point for the selections below.
df

Unnamed: 0,Name,Age,City
0,Alice,25.0,New York
1,Bob,30.0,San Francisco
2,Charlie,,Los Angeles
3,John,40.0,Seoul


In [13]:
# A single string key selects one column.
df['Name']

Unnamed: 0,Name
0,Alice
1,Bob
2,Charlie
3,John


In [14]:
type(df['Name']) # a single label returns a Series

In [15]:
# A list of labels selects several columns at once.
df[['Name','Age']]

Unnamed: 0,Name,Age
0,Alice,25.0
1,Bob,30.0
2,Charlie,
3,John,40.0


In [16]:
type(df[['Name','Age']]) # a list of labels returns a DataFrame

In [17]:
type(df[['Name']]) # still a DataFrame: the key is a list, even though it holds a single label

In [18]:
# Attribute access (df.Name) only works when the column name is a valid Python
# identifier that does not clash with an existing DataFrame attribute, so a
# name containing a space cannot be reached this way. Bracket access always works.
df.Name, df['Name']

(0      Alice
 1        Bob
 2    Charlie
 3       John
 Name: Name, dtype: object,
 0      Alice
 1        Bob
 2    Charlie
 3       John
 Name: Name, dtype: object)

In [19]:
# Show the frame again before applying row filters.
df

Unnamed: 0,Name,Age,City
0,Alice,25.0,New York
1,Bob,30.0,San Francisco
2,Charlie,,Los Angeles
3,John,40.0,Seoul


In [20]:
# Element-wise comparison yields a boolean Series; the missing Age compares as False.
df['Age']>29

Unnamed: 0,Age
0,False
1,True
2,False
3,True


In [21]:
df[df['Age']>29] # boolean indexing: keep only the rows where the mask is True

Unnamed: 0,Name,Age,City
1,Bob,30.0,San Francisco
3,John,40.0,Seoul


In [22]:
df.isna() # True wherever a value is missing

Unnamed: 0,Name,Age,City
0,False,False,False
1,False,False,False
2,False,True,False
3,False,False,False


In [23]:
# Rows whose Age is missing.
df[df['Age'].isna()]

Unnamed: 0,Name,Age,City
2,Charlie,,Los Angeles


In [24]:
df[df['Age'].notna()] # keep only the rows whose Age is not missing (not NaN)

Unnamed: 0,Name,Age,City
0,Alice,25.0,New York
1,Bob,30.0,San Francisco
3,John,40.0,Seoul


In [25]:
df[(df['Age'].notna()) & (df['City']=='Seoul')] # rows with a non-missing Age AND City equal to 'Seoul'
# OR variant (either condition is enough): df[(df['Age'].notna()) | (df['City']=='Seoul')]

Unnamed: 0,Name,Age,City
3,John,40.0,Seoul


- `iloc`: 정수 위치(0부터 시작) 기반 선택 — 슬라이싱 시 끝 위치는 포함되지 않음
- `loc`: 라벨(인덱스/컬럼 이름) 기반 선택 — 슬라이싱 시 끝 라벨도 포함됨

In [26]:
df.iloc[0, 0] # positional access: (row position, column position)

'Alice'

In [27]:
df.loc[0:2, ["Name","Age"]] # in NumPy-style positional slicing the end index 2 would be excluded
# .loc slices by label, so the end label 2 is included in the result

Unnamed: 0,Name,Age
0,Alice,25.0
1,Bob,30.0
2,Charlie,


In [28]:
df.loc[df["Age"]>29, ["Name","Age"]] # .loc accepts a boolean mask as the row selector

Unnamed: 0,Name,Age
1,Bob,30.0
3,John,40.0


In [29]:
# Arithmetic is applied element-wise; the missing value stays missing.
df['Age']*2

Unnamed: 0,Age
0,50.0
1,60.0
2,
3,80.0


In [30]:
# Rebuild the example frame so the column edits below start from a known state.
data = {
    "Name": ["Alice", "Bob", "Charlie", "John"],
    "Age": [25, 30, None, 40],  # None marks a missing value
    "City": ["New York", "San Francisco", "Los Angeles", "Seoul"],
}
df = pd.DataFrame(data=data)
df

Unnamed: 0,Name,Age,City
0,Alice,25.0,New York
1,Bob,30.0,San Francisco
2,Charlie,,Los Angeles
3,John,40.0,Seoul


In [31]:
# Assigning to a key that does not exist yet adds a new column.
df['New Age'] = df['Age']*2
df

Unnamed: 0,Name,Age,City,New Age
0,Alice,25.0,New York,50.0
1,Bob,30.0,San Francisco,60.0
2,Charlie,,Los Angeles,
3,John,40.0,Seoul,80.0


In [32]:
df.drop(['Age'], axis=1) # the axis must be given to drop a column: axis=1 means columns

Unnamed: 0,Name,City,New Age
0,Alice,New York,50.0
1,Bob,San Francisco,60.0
2,Charlie,Los Angeles,
3,John,Seoul,80.0


In [33]:
df # the dropped column is still present: drop() returned a new frame

Unnamed: 0,Name,Age,City,New Age
0,Alice,25.0,New York,50.0
1,Bob,30.0,San Francisco,60.0
2,Charlie,,Los Angeles,
3,John,40.0,Seoul,80.0


In [34]:
# Persist the drop by rebinding `df` to the frame returned by drop().
# This replaces `inplace=True`, which mutates the existing object, returns
# None (so it cannot be chained) and is best avoided.
df = df.drop(columns=['Age'])
df # the 'Age' column is now gone from df

Unnamed: 0,Name,City,New Age
0,Alice,New York,50.0
1,Bob,San Francisco,60.0
2,Charlie,Los Angeles,
3,John,Seoul,80.0


In [35]:
# Drop every row that contains a missing value; the result is a new frame.
df.dropna()

Unnamed: 0,Name,City,New Age
0,Alice,New York,50.0
1,Bob,San Francisco,60.0
3,John,Seoul,80.0


In [36]:
df # the original is unchanged

Unnamed: 0,Name,City,New Age
0,Alice,New York,50.0
1,Bob,San Francisco,60.0
2,Charlie,Los Angeles,
3,John,Seoul,80.0


In [37]:
# df.fillna(0) would put 0 into every NaN in the frame; here only one column is filled.
df['New Age'].fillna(0)

Unnamed: 0,New Age
0,50.0
1,60.0
2,0.0
3,80.0


In [38]:
# fillna() returned a new Series, so df itself still holds the NaN.
df

Unnamed: 0,Name,City,New Age
0,Alice,New York,50.0
1,Bob,San Francisco,60.0
2,Charlie,Los Angeles,
3,John,Seoul,80.0


In [39]:
# Persist the fill inside the frame. The previous form rebound `df` to the
# Series returned by fillna(), which discarded the Name and City columns and
# made the cell fail when run a second time. A dict passed to
# DataFrame.fillna fills only the listed column and keeps the cell re-runnable.
df = df.fillna({'New Age': 0})
df['New Age'] # show the filled column; df keeps all of its columns

Unnamed: 0,New Age
0,50.0
1,60.0
2,0.0
3,80.0


## 데이터 집계 및 그룹화

In [40]:
# Small frame with a numeric column (Age) and a categorical key (Gender)
# to group by.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'Jane'],
    Age=[25, 30, 28, 40],
    Gender=[1, 0, 1, 0],
)
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,Gender
0,Alice,25,1
1,Bob,30,0
2,Charlie,28,1
3,Jane,40,0


In [41]:
df.groupby('Gender')['Age'].sum() # group rows by Gender, then sum Age within each group

Unnamed: 0_level_0,Age
Gender,Unnamed: 1_level_1
0,70
1,53


In [42]:
df.groupby('Gender')['Age'].mean() # group rows by Gender, then average Age within each group

Unnamed: 0_level_0,Age
Gender,Unnamed: 1_level_1
0,35.0
1,26.5


In [43]:
# Same people as before plus an Area column, so rows can be grouped by two keys.
data = {
    "Name": ["Alice", "Bob", "Charlie", "Jane"],
    "Age": [25, 30, 28, 40],
    "Gender": [1, 0, 1, 0],
    "Area": ["A", "B", "A", "B"],
}
df = pd.DataFrame(data=data)
df

Unnamed: 0,Name,Age,Gender,Area
0,Alice,25,1,A
1,Bob,30,0,B
2,Charlie,28,1,A
3,Jane,40,0,B


In [44]:
df.groupby(['Gender', 'Area'])['Age'].sum() # group by each (Gender, Area) pair, then sum Age within each group

Unnamed: 0_level_0,Unnamed: 1_level_0,Age
Gender,Area,Unnamed: 2_level_1
0,B,70
1,A,53
