In [1]:
import os
import sys 

import glob

# Directory holding the Python libraries under SPARK_HOME/python/lib.
# Raises KeyError if SPARK_HOME is not set in the environment.
os.environ["PYLIB"]=os.path.join(os.environ["SPARK_HOME"],'python','lib')

# Locate the py4j source zip by pattern instead of pinning one version, so the
# cell keeps working when SPARK_HOME points at a different Spark release.
# Falls back to the originally hard-coded file name when nothing matches.
_py4jZips = sorted(glob.glob(os.path.join(os.environ["PYLIB"],'py4j-*-src.zip')))
_py4jZip = _py4jZips[0] if _py4jZips else os.path.join(os.environ["PYLIB"],'py4j-0.10.1-src.zip')
_pysparkZip = os.path.join(os.environ["PYLIB"],'pyspark.zip')

# Put both zips at the front of sys.path (pyspark.zip ends up first, as before).
# The membership test keeps re-running this cell from stacking duplicate entries.
for _zip in (_py4jZip, _pysparkZip):
    if _zip not in sys.path:
        sys.path.insert(0, _zip)

# Must come after the sys.path changes above: pyspark is imported from the zips.
import pyspark

# NOTE(review): myConf is not passed to the builder below; kept in case other
# cells use it.
myConf=pyspark.SparkConf()

# Local-mode session named "myApp".
# NOTE(review): the warehouse dir is an absolute, user-specific Windows path;
# confirm it exists on the machine running this notebook.
spark = pyspark.sql.SparkSession.builder\
    .master("local")\
    .appName("myApp")\
    .config("spark.sql.warehouse.dir", "C:/Users/JSM")\
    .getOrCreate()

In [8]:
from pyspark.sql.types import StructType, StructField, StringType

# Read the speech as plain text: one row per line of the file.
# The single-field schema names that one string column "sent".
# The former "header", "delimiter" and "inferSchema" options were dropped:
# they are CSV reader options and have no effect on the text source.
policeDf = spark.read\
    .schema(
        StructType([
            StructField("sent", StringType()),
        ])
    )\
    .text(os.path.join("data", "policeAddress.txt"))

# Pass False as the second argument (truncate) of show() to see full cell contents.

In [12]:
# Display only the first line of the speech, without truncating the text.
policeDf.show(n=1, truncate=False)

+---------------------------------------+
|sent                                   |
+---------------------------------------+
|존경하는 국민 여러분, 경찰관 여러분, 일흔네 돌 ‘경찰의 날’입니다.|
+---------------------------------------+
only showing top 1 row



In [15]:
from pyspark.ml.feature import Tokenizer

# Tokenize each sentence in "sent" into an array column "words".
pToken = Tokenizer(
    inputCol="sent",
    outputCol="words",
)
toDf = pToken.transform(policeDf)

# Preview the first two rows (long cells are truncated by default).
toDf.show(n=2)

+--------------------+--------------------+
|                sent|               words|
+--------------------+--------------------+
|존경하는 국민 여러분, 경찰관 ...|[존경하는, 국민, 여러분,, ...|
|                    |                  []|
+--------------------+--------------------+
only showing top 2 rows



In [18]:
from pyspark.ml.feature import StopWordsRemover

# Remover that reads the token arrays in "words" and writes the filtered
# arrays to "nostopP".
pStop = StopWordsRemover(inputCol = "words", outputCol = "nostopP")

# Build ONE combined stop-word list and set it once.
# Previously the first setStopWords() call (pronouns) was overwritten by a
# second call that contained only the custom words, so the pronouns were never
# filtered out.
stopwords = [u"나", u"우리", u"너"]

# Additional corpus-specific stop words.
_myStopWords = [u"돌",u"등",u"이미"]
stopwords.extend(_myStopWords)

pStop.setStopWords(stopwords)

StopWordsRemover_47d395e84c41a482d3c4

In [20]:
# Apply the stop-word remover to the tokenized frame; adds the "nostopP" column.
stopDf = pStop.transform(toDf)

# Preview the first seven rows.
stopDf.show(n=7)

+--------------------+--------------------+--------------------+
|                sent|               words|             nostopP|
+--------------------+--------------------+--------------------+
|존경하는 국민 여러분, 경찰관 ...|[존경하는, 국민, 여러분,, ...|[존경하는, 국민, 여러분,, ...|
|                    |                  []|                  []|
|국민의 안전을 위해 밤낮없이 애...|[국민의, 안전을, 위해, 밤낮...|[국민의, 안전을, 위해, 밤낮...|
|                    |                  []|                  []|
|오늘 홍조근정훈장을 받으신 중앙...|[오늘, 홍조근정훈장을, 받으신...|[오늘, 홍조근정훈장을, 받으신...|
|                    |                  []|                  []|
|       사랑하는 경찰관 여러분,|   [사랑하는, 경찰관, 여러분,]|   [사랑하는, 경찰관, 여러분,]|
+--------------------+--------------------+--------------------+
only showing top 7 rows



In [21]:
# Print the filtered token list of the first three rows.
# Only "nostopP" is selected, so each Row has a single field and the inner
# loop prints one list per row.
# print(...) with one argument behaves the same on Python 2 and is valid on
# Python 3, unlike the former `print e` statement.
for r in stopDf.select("nostopP").take(3):
    for e in r:
        print(e)

[u'\uc874\uacbd\ud558\ub294', u'\uad6d\ubbfc', u'\uc5ec\ub7ec\ubd84,', u'\uacbd\ucc30\uad00', u'\uc5ec\ub7ec\ubd84,', u'\uc77c\ud754\ub124', u'\u2018\uacbd\ucc30\uc758', u'\ub0a0\u2019\uc785\ub2c8\ub2e4.']
[]
[u'\uad6d\ubbfc\uc758', u'\uc548\uc804\uc744', u'\uc704\ud574', u'\ubc24\ub0ae\uc5c6\uc774', u'\uc560\uc4f0\uc2dc\ub294', u'\uc804\uad6d\uc758', u'15\ub9cc', u'\uacbd\ucc30\uad00', u'\uc5ec\ub7ec\ubd84\uaed8', u'\uba3c\uc800', u'\uac10\uc0ac\ub97c', u'\ub4dc\ub9bd\ub2c8\ub2e4.', u'\uc804\ubab0\xb7\uc21c\uc9c1', u'\uacbd\ucc30\uad00\ub4e4\uc758', u'\uace0\uadc0\ud55c', u'\ud76c\uc0dd\uc5d0', u'\uacbd\uc758\ub97c', u'\ud45c\ud569\ub2c8\ub2e4.', u'\uc720\uac00\uc871', u'\uc5ec\ub7ec\ubd84\uaed8', u'\uc704\ub85c\uc758', u'\ub9c8\uc74c\uc744', u'\uc804\ud569\ub2c8\ub2e4.']


In [25]:
from pyspark.ml.feature import HashingTF
from pyspark.ml.feature import IDF

# Hash the filtered tokens into a 50-dimensional term-frequency vector
# stored in the "hashP" column.
pHash = HashingTF(
    inputCol="nostopP",
    outputCol="hashP",
    numFeatures=50,
)
hashDf = pHash.transform(stopDf)


+---------------------------------------+-------------------------------------------------+----------------------------------------------+--------------------------------------------------------+
|sent                                   |words                                            |nostopP                                       |hashP                                                   |
+---------------------------------------+-------------------------------------------------+----------------------------------------------+--------------------------------------------------------+
|존경하는 국민 여러분, 경찰관 여러분, 일흔네 돌 ‘경찰의 날’입니다.|[존경하는, 국민, 여러분,, 경찰관, 여러분,, 일흔네, 돌, ‘경찰의, 날’입니다.]|[존경하는, 국민, 여러분,, 경찰관, 여러분,, 일흔네, ‘경찰의, 날’입니다.]|(50,[9,15,17,23,36,39,46],[2.0,1.0,1.0,1.0,1.0,1.0,1.0])|
|                                       |[]                                               |[]                                            |(50,[],[])                                              |
+-------------------

In [34]:
# Fit IDF on the hashed term-frequency vectors ("hashP"), then apply the fitted
# model to the same frame, adding the "IDF_P" column.
pIDF = IDF(
    inputCol="hashP",
    outputCol="IDF_P",
)
pModel = pIDF.fit(hashDf)
pDf = pModel.transform(hashDf)

# Preview the first four rows with long cells truncated.
pDf.show(n=4, truncate=True)

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                sent|               words|             nostopP|               hashP|               IDF_P|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|존경하는 국민 여러분, 경찰관 ...|[존경하는, 국민, 여러분,, ...|[존경하는, 국민, 여러분,, ...|(50,[9,15,17,23,3...|(50,[9,15,17,23,3...|
|                    |                  []|                  []|          (50,[],[])|          (50,[],[])|
|국민의 안전을 위해 밤낮없이 애...|[국민의, 안전을, 위해, 밤낮...|[국민의, 안전을, 위해, 밤낮...|(50,[0,1,3,4,7,12...|(50,[0,1,3,4,7,12...|
|                    |                  []|                  []|          (50,[],[])|          (50,[],[])|
+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 4 rows

