In [135]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from warnings import filterwarnings

plt.rc('font', family = 'Malgun Gothic')
filterwarnings('ignore')

# 전체 데이터 출력 및 확인 옵션 설정
# pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.options.display.max_info_columns

100

In [136]:
df = pd.read_csv("dataset/2023bigcontest_advanced_sac.csv")[['seat','discount_type']]
df.sample(3)

Unnamed: 0,seat,discount_type
1672796,1층 11열 1,일반
1236017,2층 B블록1열 5,일반
292305,2층 3열 16,초대권


In [137]:
df['seat_floor'] = df['seat'].str.extract(r'(\d+)층', expand=False)
df[['seat','seat_floor']]

Unnamed: 0,seat,seat_floor
0,3층 BOX9 10,3
1,1층 7열 5,1
2,1층 C블록 16열 3,1
3,1층 2열 3,1
4,1층 B블록12열 7,1
...,...,...
1920863,1층 A블록 14열 6,1
1920864,3층 B블록5열 4,3
1920865,1층 7열 3,1
1920866,1층 B블록 9열 5,1


In [138]:
#층수의 unique를 찍었을 때 nan 값 존재
df['seat_floor'].unique()

array(['3', '1', '2', nan], dtype=object)

In [139]:
# 빈 값들은 '합창석'
df.query('`seat_floor` != `seat_floor`')[['seat','seat_floor']]

Unnamed: 0,seat,seat_floor
90,합창석 G블록2열 24,
222,합창석 G블록3열 9,
247,합창석 H블록3열 9,
259,합창석 H블록3열 13,
344,합창석 H블록4열 6,
...,...,...
1920666,합창석 G블록4열 19,
1920681,합창석 H블록3열 19,
1920745,합창석 H블록4열 6,
1920811,합창석 G블록1열 20,


In [140]:
#fillna를 이용해 합창석으로 채워주기
df['seat_floor'] = df['seat_floor'].fillna("합창석")
df.sample(5)

Unnamed: 0,seat,discount_type,seat_floor
975153,1층 2열 15,초대권,1
1590365,2층 BOX4 1열 7,초대권,2
1265544,1층 C블록 15열 6,기획사,1
644346,1층 D블록22열 8,기획사판매,1
689829,1층 A블록12열 6,초대권,1


In [141]:
df.sample(5)

Unnamed: 0,seat,discount_type,seat_floor
1157420,1층 E블록18열 8,기획사판매,1
1663591,2층 BOX3 1열 10,초대권,2
778390,합창석 H블록2열 5,블루회원 할인5%,합창석
1003231,1층 B블록 16열 16,기획사판매,1
51172,2층 BOX2 3,BC카드 할인15%,2


In [148]:
df = df.copy()
#seat 뒤에 공백을 추가해서 정규표현식으로 추출
df['seat'] = df['seat'].astype(str) + ' '
df['seat_num'] = df['seat'].str.extract(r'\s(\d+)\s', expand=False)
df[['seat','seat_floor','seat_num']]

Unnamed: 0,seat,seat_floor,seat_num
0,3층 BOX9 10,3,10
1,1층 7열 5,1,5
2,1층 C블록 16열 3,1,3
3,1층 2열 3,1,3
4,1층 B블록12열 7,1,7
...,...,...,...
1920863,1층 A블록 14열 6,1,6
1920864,3층 B블록5열 4,3,4
1920865,1층 7열 3,1,3
1920866,1층 B블록 9열 5,1,5


In [160]:
#이제 블록, 열이 남았음 : 이것만 따로 선택해서 추출
df1 = df.copy()
df1['seat_blockcol'] = df1['seat'].str.split(" ")
df1.head()


Unnamed: 0,seat,discount_type,seat_floor,seat_num,seat_blockcol
0,3층 BOX9 10,일반,3,10,"[3층, BOX9, 10, ]"
1,1층 7열 5,초대권,1,5,"[1층, 7열, 5, ]"
2,1층 C블록 16열 3,초대권,1,3,"[1층, C블록, 16열, 3, ]"
3,1층 2열 3,초대권,1,3,"[1층, 2열, 3, ]"
4,1층 B블록12열 7,일반,1,7,"[1층, B블록12열, 7, ]"


In [156]:
# 여기서 deque을 써볼까?
import collections as co
d = co.deque([])
for i in range():
    d.append(df1['seat_blockcol'])

1


In [46]:
# seat_floor : (\d+)층|합창석
# seat_block : BOX*(\s*)\w*|[A-Z]*(\s*)블록
# seat_col : (\d+)열
# seat_num : (\d+)

def separate_seat_new(df):
    # 정규표현식 수정
    # df[['seat_floor', 'seat_block', 'seat_col', 'seat_num']] = df['seat'].str.extract(r'(\d+)층|([A-Z가-힣]+) *(\d+)열 *(\d+)')
    df[['seat_floor', 'seat_block_col', 'seat_num']] = df['seat'].str.extract(r'(\d+)층\s*([A-Z]*\w[가-힣]+)?(\s*\d+열)?\s*(\d+)')
    # '합창석' 처리
    df['seat_floor'] = df['seat_floor'].fillna('합창석')
    
    # '열' 처리
    df['seat_col'] = df['seat_col'].str.extract(r'(\d+)', expand=False)
    
    return df

df_seat = separate_seat_new(df)
df_seat[['seat','seat_floor','seat_block','seat_col','seat_num']]

Unnamed: 0,seat,seat_floor,seat_block,seat_col,seat_num
0,3층 BOX9 10,합창석,,,
1,1층 7열 5,1,7열,,5
2,1층 C블록 16열 3,1,C블록,16,3
3,1층 2열 3,1,2열,,3
4,1층 B블록12열 7,1,B블록,12,7
...,...,...,...,...,...
1920863,1층 A블록 14열 6,1,A블록,14,6
1920864,3층 B블록5열 4,3,B블록,5,4
1920865,1층 7열 3,1,7열,,3
1920866,1층 B블록 9열 5,1,B블록,9,5


In [62]:
# df['seat']의 데이터야.
# '2층 우측 1열 14', '3층 G블록7열 5', '2층 A블록 1열 12', '1층 BOX1 1열 9', 
# '2층 B블록7열 3', '1층 C블록16열 12', '1층 BOX1 1열 15',
# '합창석 G블록1열 19', '1층 C블록9열 11', '2층 BOX5 7', '1층 1열 16'

# ---  
# 이 데이터를 아래와 같이 나누어 데이터프레임의 열에 담고 싶어.  seat_floor의 합창석, seat_block의 우측, 알파벳, BOX숫자에 유의해줘.
# #seat_floor : 2,3,2,1,2,1,1,합창석,1,2,1
# #seat_block : 우측,G,A,BOX1,B,C,BOX1,G,C,BOX5,
# #seat_col : 1,7,1,1,7,16,1,1,9,,1
# #seat_num : 14,5,12,9,3,12,15,19,11,7,16

# ---
# 내 코드를 수정해줘.

# def separate_seat(df):    
#     df[['seat_floor', 'seat_block', 'seat_col', 'seat_num']] = df['seat'].str.extract(r'(\d+)층|합창석 BOX*(\s*)\w*|[A-Z]*(\s*)블록 (\d+)열 (\d+)')
#     df['seat_col'] = df['seat_block'].str.extract(r'(\d+열)', expand=False)
#     df['seat_block'] = df['seat_block'].str.extract(r'[A-Z]*\d+[가-힣]', expand=False)
#     #마지막으로 seat_block에서는 '블록', seat_col에서는 '열'이라는 글자 제거하기
#     df['seat_block'] = df['seat_block'].str.replace('블록', '', regex=True)
#     df['seat_col'] = df['seat_col'].str.replace('열', '', regex=True)
#     # df_new = df.drop(['seat'], axis=1)
#     return df

# df_seat = separate_seat(df)
# df_seat[['seat','seat_floor','seat_block','seat_col','seat_num']]