# Tent review 데이터 클렌징

In [58]:
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark import SparkConf, SparkContext
from pyspark.sql.functions import mean, col, split, regexp_extract, when, lit
# ML
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, IndexToString
from pyspark.ml.feature import QuantileDiscretizer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

import os
import pandas as pd
import numpy as np
import re

In [61]:
# Spark 설정
conf = SparkConf().setMaster("local").setAppName("uber-date-trips")
sc = SparkContext(conf=conf)

In [62]:
sc.version

'3.2.1'

## 스파크 세션 만들기

In [77]:
os.chdir('/home/ec2-user/notebook-work/miran/cleansing')

In [78]:
spark = SparkSession\
        .builder\
        .appName('tent-review-cleansing')\
        .getOrCreate()

In [79]:
df = spark.read.csv(os.getcwd() + '/tent-review.csv', inferSchema = True, header = True)

In [5]:
# df.toPandas()

Unnamed: 0,_c0,prd_id,prd_url,user_id,review,uploaded_date,quality_score,star_score,image_url
0,0,14981947918,https://smartstore.naver.com/main/products/307...,nana****,원래 원터치텐트만 구입하려고 마음먹은 사람입니다. 기존 패스트캠프 테라6 원터치 텐...,2022-05-02,0.894785,5,['https://phinf.pstatic.net/checkout/20220502_...
1,1,14981947918,https://smartstore.naver.com/main/products/307...,pyun****,좋아요.한여름에 창이 4개가 아니라 고생했지많요..<br>확실히 두개라서 바람이 잘...,2019-10-07,0.892367,3,
2,2,14981947918,https://smartstore.naver.com/main/products/307...,eori****,1.배송이 빨리왔어요^^<br>2.펼치고 접기 너무 쉬워요~<br> ㅡ사실 저희부...,2021-08-04,0.88553,5,['https://phinf.pstatic.net/checkout/20210804_...
3,3,14981947918,https://smartstore.naver.com/main/products/307...,ider****,구매하기전에 정말 고민 많이했어요~~ <br>아이와 캠핑가기 위해 설치가 편한 제품...,2022-05-24,0.880135,5,
4,4,14981947918,https://smartstore.naver.com/main/products/307...,rudc****,첫 캠핑이라 너무 설레서 <em>고민고민하다가 무엇을 사야하나 하고 </em>매장도...,2021-09-02,0.880015,5,['https://phinf.pstatic.net/checkout/20210902_...
...,...,...,...,...,...,...,...,...,...
181343,107236,2471807188,https://smartstore.naver.com/main/products/247...,1you******,가격대비 가성비 좋아요,2019-09-11 13:59:02,0.0,4,
181344,107237,2471807188,https://smartstore.naver.com/main/products/247...,hong*****,상품좋습니다 크기도생각보다큼,2019-09-08 00:46:23,0.0,5,
181345,107238,2471807188,https://smartstore.naver.com/main/products/247...,kooh******,색깔이 흰색인줄 알았는데 회색이더군요,,,,
181346,"아직 펴보진 않았는데 일단 가볍긴하네요""",2019-09-01 23:33:03,0.0,4,,,,,


In [94]:
df.show()

+---+-----------+--------------------+--------+----------------------------------+-------------+-------------+----------+--------------------+
|_c0|     prd_id|             prd_url| user_id|                            review|uploaded_date|quality_score|star_score|           image_url|
+---+-----------+--------------------+--------+----------------------------------+-------------+-------------+----------+--------------------+
|  0|14981947918|https://smartstor...|nana****|원래 원터치텐트만 구입하려고 마...|   2022-05-02|     0.894785|         5|['https://phinf.p...|
|  1|14981947918|https://smartstor...|pyun****|  좋아요.한여름에 창이 4개가 아...|   2019-10-07|     0.892367|         3|                 NaN|
|  2|14981947918|https://smartstor...|eori****|      1.배송이 빨리왔어요^^<br>...|   2021-08-04|      0.88553|         5|['https://phinf.p...|
|  3|14981947918|https://smartstor...|ider****|구매하기전에 정말 고민 많이했어...|   2022-05-24|     0.880135|         5|                 NaN|
|  4|14981947918|https://smartstor...|rudc****|

# 데이터 클렌징

## 정규식으로 클렌징

In [95]:
#한글만 남기기

from pyspark.sql.functions import regexp_replace

r = df.select(regexp_replace(col('review'), '[^\uAC00-\uD7A3]', ' ').alias('clean1'))

r.show()

+----------------------------------+
|                            clean1|
+----------------------------------+
|원래 원터치텐트만 구입하려고 마...|
|  좋아요 한여름에 창이  개가 아...|
|        배송이 빨리왔어요      ...|
|구매하기전에 정말 고민 많이했어...|
|    첫 캠핑이라 너무 설레서    ...|
|  캠핑 한번도 한적 없어요 설치 ...|
|  추석 이후 올줄 알았는데 완전 ...|
|이걸로 얼마전 태풍같이 바람불고...|
| 같은날 주문한 품목들중 일부 품...|
| 택배는 중간에 광복절 휴무가 끼...|
|             너무 좋네요       ...|
|  여름에 잘 썼습니다  그런데 육...|
| 제가 사용 해보니 간편하고 튼튼...|
|   일단 텐트가 아이둘 있는  인 ...|
|여름이라 편하게 다니려고 원터치...|
|간단하게 후기를 남기자면 텐트가...|
|    아이들도 크고     날씨도 좋...|
|남편이 체력거지 게으름뱅이 귀차...|
|    그냥 저냥    문이 두개다 보...|
|        캠프는 내취향이 아니야 ...|
+----------------------------------+
only showing top 20 rows



In [96]:
# 줄바꿈 제거

r = r.select(regexp_replace(col('clean1'),'\n', "").alias('clean1'))
                            
r.show()

+----------------------------------+
|                            clean1|
+----------------------------------+
|원래 원터치텐트만 구입하려고 마...|
|  좋아요 한여름에 창이  개가 아...|
|        배송이 빨리왔어요      ...|
|구매하기전에 정말 고민 많이했어...|
|    첫 캠핑이라 너무 설레서    ...|
|  캠핑 한번도 한적 없어요 설치 ...|
|  추석 이후 올줄 알았는데 완전 ...|
|이걸로 얼마전 태풍같이 바람불고...|
| 같은날 주문한 품목들중 일부 품...|
| 택배는 중간에 광복절 휴무가 끼...|
|             너무 좋네요       ...|
|  여름에 잘 썼습니다  그런데 육...|
| 제가 사용 해보니 간편하고 튼튼...|
|   일단 텐트가 아이둘 있는  인 ...|
|여름이라 편하게 다니려고 원터치...|
|간단하게 후기를 남기자면 텐트가...|
|    아이들도 크고     날씨도 좋...|
|남편이 체력거지 게으름뱅이 귀차...|
|    그냥 저냥    문이 두개다 보...|
|        캠프는 내취향이 아니야 ...|
+----------------------------------+
only showing top 20 rows



In [None]:
# 이중 스페이스 제거

r = r.select(regexp_replace(col('clean1'), '\s+', ' ').alias('clean1'))
                            
r.show()

In [97]:
#한글만 남기기
r = df.select(regexp_replace(col('review'), '[^\uAC00-\uD7A3]', ' ').alias('clean1'))

# 줄바꿈 제거
r = r.select(regexp_replace(col('clean1'),'\n', "").alias('clean1'))

# 이중 스페이스 제거
r = r.select(regexp_replace(col('clean1'), '\s+', ' ').alias('clean1'))

r.show

+----------------------------------+
|                            clean1|
+----------------------------------+
|원래 원터치텐트만 구입하려고 마...|
| 좋아요 한여름에 창이 개가 아니...|
|  배송이 빨리왔어요 펼치고 접기...|
|구매하기전에 정말 고민 많이했어...|
| 첫 캠핑이라 너무 설레서 고민고...|
|  캠핑 한번도 한적 없어요 설치 ...|
|  추석 이후 올줄 알았는데 완전 ...|
|이걸로 얼마전 태풍같이 바람불고...|
| 같은날 주문한 품목들중 일부 품...|
| 택배는 중간에 광복절 휴무가 끼...|
|   너무 좋네요 하자없는지 택배 ...|
| 여름에 잘 썼습니다 그런데 육각...|
| 제가 사용 해보니 간편하고 튼튼...|
|  일단 텐트가 아이둘 있는 인 가...|
|여름이라 편하게 다니려고 원터치...|
|간단하게 후기를 남기자면 텐트가...|
| 아이들도 크고 날씨도 좋아져서 ...|
|남편이 체력거지 게으름뱅이 귀차...|
| 그냥 저냥 문이 두개다 보미확실...|
|   캠프는 내취향이 아니야 라는 ...|
+----------------------------------+
only showing top 20 rows



### 저장

In [98]:
r.write.text('./tent-review')

In [65]:
# r.write.mode("overwrite").option("compression","bzip2").format("text").save("./tent-review.txt")

Py4JJavaError: An error occurred while calling o93.save.
: org.apache.spark.SparkException: Job aborted.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.jobAbortedError(QueryExecutionErrors.scala:496)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:251)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:186)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:113)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:111)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.executeCollect(commands.scala:125)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:110)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:110)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:106)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:481)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:82)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:481)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:457)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:106)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:93)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:91)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:128)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:848)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:382)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:355)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:239)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:750)
Caused by: java.lang.IllegalStateException: Cannot call methods on a stopped SparkContext.
This stopped SparkContext was created at:

org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
java.lang.reflect.Constructor.newInstance(Constructor.java:423)
py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
py4j.Gateway.invoke(Gateway.java:238)
py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
py4j.ClientServerConnection.run(ClientServerConnection.java:106)
java.lang.Thread.run(Thread.java:750)

The currently active SparkContext was created at:

org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
java.lang.reflect.Constructor.newInstance(Constructor.java:423)
py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
py4j.Gateway.invoke(Gateway.java:238)
py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
py4j.ClientServerConnection.run(ClientServerConnection.java:106)
java.lang.Thread.run(Thread.java:750)
         
	at org.apache.spark.SparkContext.assertNotStopped(SparkContext.scala:118)
	at org.apache.spark.SparkContext.broadcast(SparkContext.scala:1512)
	at org.apache.spark.sql.execution.datasources.csv.CSVFileFormat.buildReader(CSVFileFormat.scala:103)
	at org.apache.spark.sql.execution.datasources.FileFormat.buildReaderWithPartitionValues(FileFormat.scala:131)
	at org.apache.spark.sql.execution.datasources.FileFormat.buildReaderWithPartitionValues$(FileFormat.scala:122)
	at org.apache.spark.sql.execution.datasources.TextBasedFileFormat.buildReaderWithPartitionValues(FileFormat.scala:177)
	at org.apache.spark.sql.execution.FileSourceScanExec.inputRDD$lzycompute(DataSourceScanExec.scala:426)
	at org.apache.spark.sql.execution.FileSourceScanExec.inputRDD(DataSourceScanExec.scala:417)
	at org.apache.spark.sql.execution.FileSourceScanExec.doExecute(DataSourceScanExec.scala:504)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:184)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:222)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:219)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:180)
	at org.apache.spark.sql.execution.InputAdapter.inputRDD(WholeStageCodegenExec.scala:526)
	at org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs(WholeStageCodegenExec.scala:454)
	at org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs$(WholeStageCodegenExec.scala:453)
	at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:497)
	at org.apache.spark.sql.execution.ProjectExec.inputRDDs(basicPhysicalOperators.scala:50)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:750)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:184)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:222)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:219)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:180)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:186)
	... 41 more


In [100]:
file_name = r"/home/ec2-user/notebook-work/miran/cleansing/tent-review/part-00000-ce1beab4-b9f2-4185-a763-d98fdfb3bbe2-c000.txt"

with open(file_name, 'r', encoding='utf-8') as f:
    review = f.readlines()

    print(review)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [107]:
reveiw = review[0]

# 형태소 분석

In [24]:
from konlpy.tag import Hannanum, Okt

In [106]:
tokenizer = Okt()
tokenizer.morphs(review[0])

['원래',
 '원',
 '터치',
 '텐트',
 '만',
 '구입',
 '하려고',
 '마음먹은',
 '사람',
 '입니다',
 '기존',
 '패스트',
 '캠프',
 '테라',
 '원',
 '터치',
 '텐트',
 '사용',
 '하다가',
 '좀',
 '작은',
 '사이즈',
 '도',
 '추가',
 '로',
 '구입',
 '하고',
 '싶어서',
 '오토',
 '로',
 '하나',
 '더',
 '구입',
 '했습니다',
 '손',
 '에',
 '익었을',
 '때',
 '펴는데',
 '분',
 '정도',
 '걸려요',
 '접는데도',
 '그',
 '정도',
 '는',
 '걸리는거',
 '같구요',
 '가방',
 '에',
 '넣었을',
 '때',
 '부피',
 '가',
 '작고',
 '가벼워서',
 '좋습니다',
 '미니',
 '멀',
 '캠핑',
 '이라',
 '구',
 '성품',
 '에',
 '있는',
 '번',
 '들',
 '팩',
 '으로도',
 '사용',
 '이',
 '충분하고',
 '추가',
 '팩',
 '구입',
 '할',
 '필요',
 '없을듯',
 '해',
 '요',
 '나무',
 '데크',
 '에서',
 '주로',
 '야영',
 '하니',
 '오징어',
 '팩',
 '만',
 '쓸',
 '때',
 '고',
 '있구요',
 '천고',
 '는',
 '테라',
 '가',
 '이어서',
 '엄청',
 '편했는데',
 '이번',
 '에',
 '산',
 '오토',
 '도',
 '그것',
 '보다는',
 '낮',
 '지만',
 '전혀',
 '불편',
 '감',
 '없네요',
 '레인',
 '플라이',
 '레인',
 '커버',
 '전용',
 '카페트',
 '그',
 '라운드',
 '시트',
 '익스',
 '텐션',
 '월',
 '추가',
 '구매',
 '했습니다',
 '타프',
 '가지',
 '고',
 '있는',
 '분',
 '은',
 '익스',
 '텐션',
 '월',
 '추가',
 '구매',
 '불필요할거',
 '같구요',
 '딱',
 '비',
 '피하

In [110]:
from koalanlp import API
from koalanlp.proc import Parser

In [112]:
from koalanlp.Util import initializeinitialize(hnn='LATEST')

SyntaxError: invalid syntax (<ipython-input-112-db412500eb88>, line 1)

In [111]:
parser = Parser(API.HNN)
parsed = parser(review[0])
print(parsed[0].getSyntaxTree().getTreeString())

Exception: 사용 전 초기화 과정이 필요합니다. 사용법의 Util.initialize 문서를 참고하여 초기화를 해주세요.사용하신 코드를 토대로는 다음 코드의 실행을 추천해드립니다.
from koalanlp.Util import initializeinitialize(hnn='LATEST')