In [1]:
import pandas as pd

# Number of scraped CSV parts to merge (files are named beer_n_0.csv, beer_n_1.csv, ...).
num_files = 2

# Start from an empty frame so the expected columns (and their order) exist
# even when no part file can be read.
frames = [pd.DataFrame(data=[], columns=['beer_info', 'search_name', 'beer_name'])]

for i in range(num_files):
    # Read each part; a file that cannot be read is reported and skipped so the
    # remaining parts are still merged.
    try:
        frames.append(pd.read_csv(f'beer_n_{i}.csv', index_col=0))
    # `Exception` (not a bare except) keeps the best-effort behaviour but no longer
    # swallows KeyboardInterrupt/SystemExit, and the cause is shown to the reader.
    except Exception as err:
        print(i, 'error.. move to the next file.', repr(err))

# Concatenate once instead of growing the frame inside the loop.
data = pd.concat(frames)

# Save the merged data.
data.to_csv('final_data.csv', encoding='utf-8')

In [2]:
# Reload the merged file and keep only the beer name and the raw review text.
data = (
    pd.read_csv('final_data.csv', index_col=0, encoding='utf-8')
    .loc[:, ['beer_name', 'beer_info']]
)

data

Unnamed: 0,beer_name,beer_info
0,San Miguel,"czgr(819)\n🇵🇱Opole, Poland\n1.6January 27, 202..."
1,San Miguel,"ResinousMaestro55(268)\n🇬🇧Hampton, England\n2...."
2,San Miguel,"nimbleprop(11,986)\n🇺🇸Southeast, Washington, U..."
3,San Miguel,"troopie(4,238)\n🇺🇸Munchen an der Willamette, U..."
4,San Miguel,"BubbleTamer35(1)\n1.9May 28, 2019\nStrong tast..."
...,...,...
3950,Newcastle Brown Ale (Non-US Version),"Frog King99(51)\n🇺🇸L.A., United States\n4.2May..."
3951,Newcastle Brown Ale (Non-US Version),"zbrew19(31)\n🇺🇸Philadelphia, United States\n3...."
3952,Newcastle Brown Ale (Non-US Version),"Aubrey(3,516)\n🇺🇸Bellingham, United States\n3...."
3953,Newcastle Brown Ale (Non-US Version),"wade(59)\n🇺🇸Decatur, United States\n3.9April 2..."


In [3]:
# Show the row count, column dtypes and non-null counts of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4148 entries, 0 to 3954
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   beer_name  4148 non-null   object
 1   beer_info  4148 non-null   object
dtypes: object(2)
memory usage: 97.2+ KB


# Transform

In [4]:
# Look at the first raw review string to see how its fields are laid out.
data['beer_info'].iat[0]

'czgr(819)\n🇵🇱Opole, Poland\n1.6January 27, 2021\nPite jakiś czas temu. Ocena w celu uzupełnienia profilu. \nhttps://www.instagram.com/czgr_\nAroma\n-\nAppearance\n-\nFlavor\n-\nMouthfeel\n-\nOverall\n-'

In [5]:
# Build a new frame (the original `data` is left untouched) in which each review
# string is split on the '\n' newline character into a list of fields.
temp = data.assign(beer_info=data['beer_info'].str.split('\n'))
temp['beer_info']

0       [czgr(819), 🇵🇱Opole, Poland, 1.6January 27, 20...
1       [ResinousMaestro55(268), 🇬🇧Hampton, England, 2...
2       [nimbleprop(11,986), 🇺🇸Southeast, Washington, ...
3       [troopie(4,238), 🇺🇸Munchen an der Willamette, ...
4       [BubbleTamer35(1), 1.9May 28, 2019, Strong tas...
                              ...                        
3950    [Frog King99(51), 🇺🇸L.A., United States, 4.2Ma...
3951    [zbrew19(31), 🇺🇸Philadelphia, United States, 3...
3952    [Aubrey(3,516), 🇺🇸Bellingham, United States, 3...
3953    [wade(59), 🇺🇸Decatur, United States, 3.9April ...
3954    [billb(245), 🇺🇸Acworth, United States, 4.2Apri...
Name: beer_info, Length: 4148, dtype: object

In [6]:
# Check the split result: the string has been turned into a list.
temp['beer_info'].iat[0]

['czgr(819)',
 '🇵🇱Opole, Poland',
 '1.6January 27, 2021',
 'Pite jakiś czas temu. Ocena w celu uzupełnienia profilu. ',
 'https://www.instagram.com/czgr_',
 'Aroma',
 '-',
 'Appearance',
 '-',
 'Flavor',
 '-',
 'Mouthfeel',
 '-',
 'Overall',
 '-']

In [7]:
# Preprocessing continues on a separate copy named ttmp.
ttmp = temp.copy(deep=True)
# Print one review's field list (the author describes it as user info with a
# recorded like count).
ttmp['beer_info'].iat[10]

['drfabulous(13,307)',
 '🇺🇸Lexington, United States',
 '2.5July 27, 2018',
 '12 ounce can on a plane. Nothing special here. Same old plain tasteless or slightly sweet cheap lager.',
 'Aroma',
 '5',
 'Appearance',
 '2',
 'Flavor',
 '2',
 'Mouthfeel',
 '3',
 'Overall',
 '13']

In [8]:
def _strip_trailing_extra(fields):
    """Return `fields` as-is when 'Overall' is second to last, otherwise without its last item.

    Per the notebook author, the extra trailing item is a like count.
    """
    if fields[-2] == 'Overall':
        return fields
    return fields[:-1]


# Apply to every review so each list ends with the 'Overall' label and its value.
ttmp['beer_info'] = ttmp['beer_info'].apply(_strip_trailing_extra)

In [9]:
def _head_and_reversed_tail(fields):
    """Return the first four fields followed by the last ten fields in reverse order."""
    leading = fields[:4]
    trailing = fields[-10:]  # slicing copies, so reversing does not touch `fields`
    trailing.reverse()
    return leading + trailing


# Keep elements 0-3 plus the ten trailing elements (the score labels and values).
ttmp['beer_info'] = ttmp['beer_info'].apply(_head_and_reversed_tail)

# Inspect the same review as before to confirm the result.
ttmp['beer_info'].iat[10]

['drfabulous(13,307)',
 '🇺🇸Lexington, United States',
 '2.5July 27, 2018',
 '12 ounce can on a plane. Nothing special here. Same old plain tasteless or slightly sweet cheap lager.',
 '13',
 'Overall',
 '3',
 'Mouthfeel',
 '2',
 'Flavor',
 '2',
 'Appearance',
 '5',
 'Aroma']

In [10]:
# Where each value sits in the reordered field list: the user id is the first
# element, the score values are read from the end of the list.
field_positions = {
    'user': 0,
    'aroma': -2,
    'appearance': -4,
    'flavor': -6,
    'mouthfeel': -8,
    'overall': -10,
}
for column, position in field_positions.items():
    # `position=position` binds the current index to this lambda.
    ttmp[column] = ttmp['beer_info'].apply(lambda fields, position=position: fields[position])

# Keep only list elements 1-3 (the rating/date string or other leftover values).
ttmp['beer_info'] = ttmp['beer_info'].apply(lambda fields: fields[1:4])
ttmp['length'] = ttmp['beer_info'].apply(len)

# Check the result.
ttmp.head(3)

Unnamed: 0,beer_name,beer_info,user,aroma,appearance,flavor,mouthfeel,overall,length
0,San Miguel,"[🇵🇱Opole, Poland, 1.6January 27, 2021, Pite ja...",czgr(819),-,-,-,-,-,3
1,San Miguel,"[🇬🇧Hampton, England, 2.5January 15, 2021, Fizz...",ResinousMaestro55(268),5,3,5,3,9,3
2,San Miguel,"[🇺🇸Southeast, Washington, United States, 2.5Fe...","nimbleprop(11,986)",5,3,5,2,10,3


In [11]:
# The rating and the review date are scraped as one string
# (e.g. '4.0December 28, 2020'); a regular expression recognises that field.

import re

# Rating digits, a literal dot, decimal digits, then the letters of the month name.
# The dot is escaped on purpose: an unescaped '.+' matches arbitrary text, so any
# string that starts with a digit and contains another digit later (for example a
# review comment such as '12 ounce can ... 2018') would be accepted as well.
reg = re.compile(r'[0-9]+\.[0-9]+[A-Za-z]+')

In [12]:
def _rating_date_field(candidates):
    """Return the first of the first two candidates that `reg` matches, else the third.

    `reg.match()` returns a match object when the start of the string fits the
    pattern and None otherwise, so its result is used directly as the condition.
    """
    for candidate in candidates[:2]:
        if reg.match(candidate):
            return candidate
    return candidates[2]


# Replace each list of candidates with the single string selected above.
ttmp['beer_info'] = ttmp['beer_info'].apply(_rating_date_field)

# Check the result.
ttmp.head()

Unnamed: 0,beer_name,beer_info,user,aroma,appearance,flavor,mouthfeel,overall,length
0,San Miguel,"1.6January 27, 2021",czgr(819),-,-,-,-,-,3
1,San Miguel,"2.5January 15, 2021",ResinousMaestro55(268),5,3,5,3,9,3
2,San Miguel,"2.5February 9, 2020","nimbleprop(11,986)",5,3,5,2,10,3
3,San Miguel,"1.3June 13, 2019","troopie(4,238)",2,3,3,1,4,3
4,San Miguel,"1.9May 28, 2019",BubbleTamer35(1),2,4,3,4,6,3


In [13]:
# The first three characters are the rating; everything after them is the date.
rating_and_date = ttmp['beer_info']
ttmp['rating'] = rating_and_date.str[:3]
ttmp['date'] = rating_and_date.str[3:]

ttmp.head()

Unnamed: 0,beer_name,beer_info,user,aroma,appearance,flavor,mouthfeel,overall,length,rating,date
0,San Miguel,"1.6January 27, 2021",czgr(819),-,-,-,-,-,3,1.6,"January 27, 2021"
1,San Miguel,"2.5January 15, 2021",ResinousMaestro55(268),5,3,5,3,9,3,2.5,"January 15, 2021"
2,San Miguel,"2.5February 9, 2020","nimbleprop(11,986)",5,3,5,2,10,3,2.5,"February 9, 2020"
3,San Miguel,"1.3June 13, 2019","troopie(4,238)",2,3,3,1,4,3,1.3,"June 13, 2019"
4,San Miguel,"1.9May 28, 2019",BubbleTamer35(1),2,4,3,4,6,3,1.9,"May 28, 2019"


In [14]:
# The combined rating/date string and the helper length column are no longer needed.
ttmp = ttmp.drop(columns=['beer_info', 'length'])
ttmp.head()

Unnamed: 0,beer_name,user,aroma,appearance,flavor,mouthfeel,overall,rating,date
0,San Miguel,czgr(819),-,-,-,-,-,1.6,"January 27, 2021"
1,San Miguel,ResinousMaestro55(268),5,3,5,3,9,2.5,"January 15, 2021"
2,San Miguel,"nimbleprop(11,986)",5,3,5,2,10,2.5,"February 9, 2020"
3,San Miguel,"troopie(4,238)",2,3,3,1,4,1.3,"June 13, 2019"
4,San Miguel,BubbleTamer35(1),2,4,3,4,6,1.9,"May 28, 2019"


In [15]:
# List the distinct rating strings.
ttmp['rating'].unique()

array(['1.6', '2.5', '1.3', '1.9', '2.1', '2.2', '2.6', '1.8', '2.0',
       '1.4', '3.3', '1.1', '3.0', '5.0', '1.0', '2.7', '0.5', '1.2',
       '0.8', '3.1', '1.5', '1.7', '2.9', '2.4', '0.9', '2.3', '3.2',
       '4.3', '2.8', '3.4', '3.9', '3.5', '4.5', '3.7', '4.4', '3.6',
       '4.1', '4.0', '3.8', '4.2', '4.7', '4.8', '4.9', '4.6', '0.7',
       '0.6'], dtype=object)

In [16]:
# List the distinct aroma values.
ttmp['aroma'].unique()

array(['-', '5', '2', '4', '3', '7', '10', '1', '6', '8', '9'],
      dtype=object)

In [17]:
# Keep only the reviews in which none of the detailed scores is the '-' placeholder.
score_columns = ['aroma', 'appearance', 'flavor', 'mouthfeel', 'overall']
has_all_scores = ttmp[score_columns].ne('-').all(axis=1)
# .copy() makes the filtered result an independent frame, so the column
# assignments made later do not write into a slice of the unfiltered data.
ttmp = ttmp[has_all_scores].copy()

# Sanity check: selecting the '-' rows again should now display an empty frame.
ttmp[ttmp['aroma'] == '-']

Unnamed: 0,beer_name,user,aroma,appearance,flavor,mouthfeel,overall,rating,date


In [18]:
# Show the row count and column dtypes after the '-' rows were filtered out.
ttmp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4124 entries, 1 to 3954
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   beer_name   4124 non-null   object
 1   user        4124 non-null   object
 2   aroma       4124 non-null   object
 3   appearance  4124 non-null   object
 4   flavor      4124 non-null   object
 5   mouthfeel   4124 non-null   object
 6   overall     4124 non-null   object
 7   rating      4124 non-null   object
 8   date        4124 non-null   object
dtypes: object(9)
memory usage: 322.2+ KB


In [19]:
# Convert the rating and the detailed scores from strings to numbers with
# pd.to_numeric(). assign() builds a new frame (column order is kept), so nothing
# is written into a filtered slice and nothing is mutated in place.
numeric_columns = ['rating', 'aroma', 'appearance', 'flavor', 'mouthfeel', 'overall']
ttmp = (
    ttmp
    .assign(**{column: pd.to_numeric(ttmp[column]) for column in numeric_columns})
    # Remove duplicated rows, keeping the first occurrence.
    .drop_duplicates(keep='first')
)

# Check the final data.
ttmp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4124 entries, 1 to 3954
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   beer_name   4124 non-null   object 
 1   user        4124 non-null   object 
 2   aroma       4124 non-null   int64  
 3   appearance  4124 non-null   int64  
 4   flavor      4124 non-null   int64  
 5   mouthfeel   4124 non-null   int64  
 6   overall     4124 non-null   int64  
 7   rating      4124 non-null   float64
 8   date        4124 non-null   object 
dtypes: float64(1), int64(5), object(3)
memory usage: 322.2+ KB


In [20]:
# Check the value distribution of the final data.
ttmp.describe()

Unnamed: 0,aroma,appearance,flavor,mouthfeel,overall,rating
count,4124.0,4124.0,4124.0,4124.0,4124.0,4124.0
mean,5.332687,3.370999,5.827837,2.981329,11.898642,2.941222
std,1.615739,0.834377,1.667757,0.835712,3.345119,0.707003
min,1.0,1.0,1.0,1.0,1.0,0.5
25%,4.0,3.0,5.0,2.0,10.0,2.5
50%,5.0,3.0,6.0,3.0,12.0,3.0
75%,6.0,4.0,7.0,3.0,14.0,3.3
max,10.0,5.0,10.0,5.0,20.0,5.0


In [21]:
# Save the cleaned frame (the index is written as the first CSV column).
# NOTE(review): 'final_data.csv' is also the name of the merged raw file that this
# notebook writes and then reads near the top, so this call overwrites it; re-running
# from the read step without re-running the merge would load the cleaned data instead.
ttmp.to_csv('final_data.csv', encoding='utf-8')