In [1]:
import glob
import os
import sys

# Make the PySpark sources bundled with the Spark installation importable.
# SPARK_HOME must be set in the environment; a missing variable raises KeyError.
os.environ["PYLIB"] = os.path.join(os.environ["SPARK_HOME"], 'python', 'lib')
# The py4j archive name embeds a version number that differs between Spark
# releases, so match it by pattern instead of pinning 'py4j-0.10.1-src.zip'.
for _py4jZip in glob.glob(os.path.join(os.environ["PYLIB"], 'py4j-*-src.zip')):
    sys.path.insert(0, _py4jZip)
sys.path.insert(0, os.path.join(os.environ["PYLIB"], 'pyspark.zip'))

import pyspark
# NOTE(review): myConf is created but never passed to the session builder below.
myConf = pyspark.SparkConf()
# NOTE(review): the warehouse directory is an absolute, machine-specific path.
spark = (
    pyspark.sql.SparkSession.builder
    .master("local")
    .appName("myApp")
    .config("spark.sql.warehouse.dir", "C:/Users/G312")
    .getOrCreate()
)

from pyspark.sql.types import StructType, StructField, StringType

# Load the address text file as a DataFrame with a single string column "sent".
# The text reader has no header handling, so the former "header" option
# (which had no effect on this reader) is not set.
police = (
    spark.read
    .schema(StructType([StructField("sent", StringType())]))
    .text(os.path.join("data", "20191021_policeAddress.txt"))
)
police.show(truncate=False)

+-------------------------------------------------------------------------------------------------------------------------------------+
|sent                                                                                                                                 |
+-------------------------------------------------------------------------------------------------------------------------------------+
|존경하는 국민 여러분, 경찰관 여러분, 일흔네 돌 ‘경찰의 날’입니다.                                                                                              |
|                                                                                                                                     |
|국민의 안전을 위해 밤낮없이 애쓰시는 전국의 15만 경찰관 여러분께 먼저 감사를 드립니다. 전몰·순직 경찰관들의 고귀한 희생에 경의를 표합니다. 유가족 여러분께 위로의 마음을 전합니다.                              |
|                                                                                                                                     |
|오늘 홍조근정훈장을 받으신 중앙경찰학교장 이은정 치안감님, 근정포장을 받으신 광주남부

In [2]:
from pyspark.ml.feature import StringIndexer

# Encode each distinct "sent" string as a numeric index in column "sentLabel".
# In the output shown for this cell the repeated blank separator rows get 0.0.
labelIndexer = StringIndexer(outputCol="sentLabel", inputCol="sent")

# Fit on the full DataFrame, then apply the fitted model to the same data.
model = labelIndexer.fit(police)
siDf = model.transform(police)

siDf.show()

+--------------------+---------+
|                sent|sentLabel|
+--------------------+---------+
|존경하는 국민 여러분, 경찰관 ...|     15.0|
|                    |      0.0|
|국민의 안전을 위해 밤낮없이 애...|     20.0|
|                    |      0.0|
|오늘 홍조근정훈장을 받으신 중앙...|     18.0|
|                    |      0.0|
|       사랑하는 경찰관 여러분,|      3.0|
|                    |      0.0|
|여러분의 헌신적 노력으로 우리의...|      9.0|
|                    |      0.0|
|치안의 개선은 국민의 체감으로 ...|     22.0|
|                    |      0.0|
|한국을 찾는 외국 관광객들도 우...|     13.0|
|                    |      0.0|
|올해는 ‘경찰의 날’에 맞춰 국...|      6.0|
|                    |      0.0|
|      자랑스러운 경찰관 여러분,|     16.0|
|                    |      0.0|
|경찰헌장은 “나라와 겨레를 위하...|     10.0|
|                    |      0.0|
+--------------------+---------+
only showing top 20 rows



In [3]:
from pyspark.ml.feature import Tokenizer

# Split each "sent" string into an array column "words".
# NOTE(review): in this cell's output the tokens keep attached punctuation
# (e.g. a trailing comma), which matters for exact stop-word matching later.
tokenizer = Tokenizer(inputCol="sent", outputCol="words")
tokDf = tokenizer.transform(police)

# Show the first three rows. print(...) with a single argument prints the same
# text under Python 2 and is also valid Python 3, unlike the print statement.
for r in tokDf.select("sent", "words").take(3):
    print(r)

Row(sent=u'\uc874\uacbd\ud558\ub294 \uad6d\ubbfc \uc5ec\ub7ec\ubd84, \uacbd\ucc30\uad00 \uc5ec\ub7ec\ubd84, \uc77c\ud754\ub124 \ub3cc \u2018\uacbd\ucc30\uc758 \ub0a0\u2019\uc785\ub2c8\ub2e4.', words=[u'\uc874\uacbd\ud558\ub294', u'\uad6d\ubbfc', u'\uc5ec\ub7ec\ubd84,', u'\uacbd\ucc30\uad00', u'\uc5ec\ub7ec\ubd84,', u'\uc77c\ud754\ub124', u'\ub3cc', u'\u2018\uacbd\ucc30\uc758', u'\ub0a0\u2019\uc785\ub2c8\ub2e4.'])
Row(sent=u' ', words=[])
Row(sent=u'\uad6d\ubbfc\uc758 \uc548\uc804\uc744 \uc704\ud574 \ubc24\ub0ae\uc5c6\uc774 \uc560\uc4f0\uc2dc\ub294 \uc804\uad6d\uc758 15\ub9cc \uacbd\ucc30\uad00 \uc5ec\ub7ec\ubd84\uaed8 \uba3c\uc800 \uac10\uc0ac\ub97c \ub4dc\ub9bd\ub2c8\ub2e4. \uc804\ubab0\xb7\uc21c\uc9c1 \uacbd\ucc30\uad00\ub4e4\uc758 \uace0\uadc0\ud55c \ud76c\uc0dd\uc5d0 \uacbd\uc758\ub97c \ud45c\ud569\ub2c8\ub2e4. \uc720\uac00\uc871 \uc5ec\ub7ec\ubd84\uaed8 \uc704\ub85c\uc758 \ub9c8\uc74c\uc744 \uc804\ud569\ub2c8\ub2e4.', words=[u'\uad6d\ubbfc\uc758', u'\uc548\uc804\uc744', u'\uc704\u

In [27]:
from pyspark.ml.feature import StopWordsRemover

# Remover that reads the "words" array and writes the filtered "nostops" array.
stop = StopWordsRemover(inputCol="words", outputCol="nostops")

# Built-in stop words supplied by the remover, plus custom additions.
_stopwords = stop.getStopWords()
_mystopwords = [u'돌', u'참', u'날', u'더', u'그', u'등', u'큰', u'될', u'있습니다', u'드립니다']

# Combined list: the built-in words first, then the custom ones, in order.
stopwords = list(_stopwords)
stopwords.extend(_mystopwords)
stop.setStopWords(stopwords)

# Apply the remover to the tokenized DataFrame and display every column in full.
stopDf = stop.transform(tokDf)
stopDf.show(truncate=False)

+-------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
|sent                                                                                                                                 |words                                                                                                                                                            |nostops                                                                                                                                                          |
+-------------------------------------------------------------------------------

+-------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
|sent                                                                                                                                 |words                                                                                                                                                            |nostops                                                                                                                                                          |
+-------------------------------------------------------------------------------

In [26]:
from pyspark.ml.feature import HashingTF, IDF, VectorAssembler
from pyspark.ml.linalg import Vectors

# Term frequencies: hash the stop-word-filtered tokens into 50 buckets.
hashTF = HashingTF(numFeatures=50, inputCol="nostops", outputCol="hash")
hashDf = hashTF.transform(stopDf)

# Fit IDF weights on the hashed term frequencies and apply them to the same data.
idf = IDF(outputCol="idf", inputCol="hash")
idfModel = idf.fit(hashDf)
idfDf = idfModel.transform(hashDf)

# Assemble the single "idf" vector column into the "features" column.
va = VectorAssembler(outputCol="features", inputCols=["idf"])
vaDf = va.transform(idfDf)

vaDf.printSchema()
vaDf.show(truncate=False)

root
 |-- sent: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- nostops: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- hash: vector (nullable = true)
 |-- idf: vector (nullable = true)
 |-- features: vector (nullable = true)

+-------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------

In [29]:
# Additionally, the CountVectorizer output ("cv") is used as the features column.

from pyspark.ml.feature import CountVectorizer, VectorAssembler
from pyspark.ml.linalg import Vectors

# Count-based term vectors over the stop-word-filtered tokens, with the
# vocabulary capped at 60 terms and a minimum document frequency of 1.0.
cv = CountVectorizer(
    inputCol="nostops",
    outputCol="cv",
    vocabSize=60,
    minDF=1.0,
)
cvModel = cv.fit(stopDf)
cvDf = cvModel.transform(stopDf)

# Assemble the single "cv" vector column into "features".
# Note: this rebinds `va` and `vaDf`, replacing the values from the TF-IDF cell.
va = VectorAssembler(outputCol="features", inputCols=["cv"])
vaDf = va.transform(cvDf)

vaDf.printSchema()
vaDf.show(truncate=False)

root
 |-- sent: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- nostops: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- cv: vector (nullable = true)
 |-- features: vector (nullable = true)

+-------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------+----------------------------------------------------------------------------+
|sent                                                                                               