In [1]:
import glob
import os
import sys

# Expose the Python libraries bundled with the Spark installation.
# os.environ["SPARK_HOME"] raises KeyError when the variable is not set.
os.environ["PYLIB"] = os.path.join(os.environ["SPARK_HOME"], 'python', 'lib')

# Use whichever py4j archive ships with this Spark build instead of pinning a
# single version; fall back to the previously hard-coded name if none is found.
_py4j_zips = sorted(glob.glob(os.path.join(os.environ["PYLIB"], 'py4j-*-src.zip')))
if _py4j_zips:
    _py4j_zip = _py4j_zips[-1]
else:
    _py4j_zip = os.path.join(os.environ["PYLIB"], 'py4j-0.10.1-src.zip')

# Insertion order matters: pyspark.zip ends up first on sys.path, py4j second.
sys.path.insert(0, _py4j_zip)
sys.path.insert(0, os.path.join(os.environ["PYLIB"], 'pyspark.zip'))

import pyspark

# The warehouse directory can be overridden per machine through the
# SPARK_WAREHOUSE_DIR environment variable; the default keeps the original path.
_warehouse_dir = os.environ.get("SPARK_WAREHOUSE_DIR", "C:/Users/JSM")

myConf = pyspark.SparkConf()
spark = (
    pyspark.sql.SparkSession.builder
    .master("local")
    .appName("myApp")
    .config("spark.sql.warehouse.dir", _warehouse_dir)
    .getOrCreate()
)

# Spark를 이용한 텍스트 변환

In [2]:


# Sample corpus: one sentence per entry, mixing English and Korean text.
_sentences = [
    "When I find myself in times of trouble",
    "Mother Mary comes to me",
    "Speaking words of wisdom, let it be",
    "And in my hour of darkness",
    "She is standing right in front of me",
    "Speaking words of wisdom, let it be",
    u"우리 Let it be",
    u"나 Let it be",
    u"너 Let it be",
    "Let it be",
    "Whisper words of wisdom, let it be",
]

# Wrap every sentence in its own list so each one becomes a single-column row.
doc2d = [[_sentence] for _sentence in _sentences]

In [5]:
# Build a DataFrame with a single string column named 'sent', one row per sentence.
myDf = spark.createDataFrame(data=doc2d, schema=['sent'])
# Print the rows as a table; long cell values are truncated.
myDf.show(n=20, truncate=True)

+--------------------+
|                sent|
+--------------------+
|When I find mysel...|
|Mother Mary comes...|
|Speaking words of...|
|And in my hour of...|
|She is standing r...|
|Speaking words of...|
|        우리 Let it be|
|         나 Let it be|
|         너 Let it be|
|           Let it be|
|Whisper words of ...|
+--------------------+



In [8]:
# Tokenizer: split each sentence into words.
from pyspark.ml.feature import Tokenizer

# Create the tokenizer: reads the 'sent' column, writes the 'words' column.
tokenizer = Tokenizer()
tokenizer.setParams(inputCol="sent", outputCol="words")

# Apply the tokenizer to the DataFrame (transform).
# As the printed rows show, tokens are lowercased and punctuation stays
# attached to the word (e.g. 'wisdom,').
tokDf = tokenizer.transform(myDf)
tokDf.show(n=3, truncate=False)

+--------------------------------------+-----------------------------------------------+
|sent                                  |words                                          |
+--------------------------------------+-----------------------------------------------+
|When I find myself in times of trouble|[when, i, find, myself, in, times, of, trouble]|
|Mother Mary comes to me               |[mother, mary, comes, to, me]                  |
|Speaking words of wisdom, let it be   |[speaking, words, of, wisdom,, let, it, be]    |
+--------------------------------------+-----------------------------------------------+
only showing top 3 rows



In [9]:
# Print the first three rows as Row objects to inspect the raw token lists.
# print(...) is used in call form so the cell matches the other print calls in
# this notebook and is valid under both Python 2 and Python 3.
for r in tokDf.select("sent", "words").take(3):
    print(r)

Row(sent=u'When I find myself in times of trouble', words=[u'when', u'i', u'find', u'myself', u'in', u'times', u'of', u'trouble'])
Row(sent=u'Mother Mary comes to me', words=[u'mother', u'mary', u'comes', u'to', u'me'])
Row(sent=u'Speaking words of wisdom, let it be', words=[u'speaking', u'words', u'of', u'wisdom,', u'let', u'it', u'be'])


In [15]:
# StopWordsRemover: filters stop words out of a token column.
from pyspark.ml.feature import StopWordsRemover
stop = StopWordsRemover(inputCol="words", outputCol="nostops")

# Start from the remover's built-in stop words and extend them with
# custom Korean stop words.
_stopwords = stop.getStopWords()
_mystopwords = [u"가", u"는", u"우리", u"나", u"너"]
stopwords = list(_stopwords) + _mystopwords
stop.setStopWords(stopwords)  # register the combined list on the remover

# Show the effective stop-word list on a single line, separated by spaces.
print(u" ".join(stop.getStopWords()))

i me my myself we our ours ourselves you your yours yourself yourselves he him his himself she her hers herself it its itself they them their theirs themselves what which who whom this that these those am is are was were be been being have has had having do does did doing a an the and but if or because as until while of at by for with about against between into through during before after above below to from up down in out on off over under again further then once here there when where why how all any both each few more most other some such no nor not only own same so than too very s t can will just don should now d ll m o re ve y ain aren couldn didn doesn hadn hasn haven isn ma mightn mustn needn shan shouldn wasn weren won wouldn 가 는 우리 나 너


In [16]:
# Remove stop words from the 'words' column; the result is added as 'nostops'.
stopDf = stop.transform(tokDf)
# Print the rows as a table; long cell values are truncated.
stopDf.show(n=20, truncate=True)

+--------------------+--------------------+--------------------+
|                sent|               words|             nostops|
+--------------------+--------------------+--------------------+
|When I find mysel...|[when, i, find, m...|[find, times, tro...|
|Mother Mary comes...|[mother, mary, co...|[mother, mary, co...|
|Speaking words of...|[speaking, words,...|[speaking, words,...|
|And in my hour of...|[and, in, my, hou...|    [hour, darkness]|
|She is standing r...|[she, is, standin...|[standing, right,...|
|Speaking words of...|[speaking, words,...|[speaking, words,...|
|        우리 Let it be|   [우리, let, it, be]|               [let]|
|         나 Let it be|    [나, let, it, be]|               [let]|
|         너 Let it be|    [너, let, it, be]|               [let]|
|           Let it be|       [let, it, be]|               [let]|
|Whisper words of ...|[whisper, words, ...|[whisper, words, ...|
+--------------------+--------------------+--------------------+



In [18]:
# CountVectorizer: turn each token list into a sparse vector of term counts.
from pyspark.ml.feature import CountVectorizer

# vocabSize: maximum number of terms kept in the vocabulary.
# minDF: minimum number of documents a term must appear in to be kept.
#        A value >= 1 is an absolute document count (1.0 here means "at least
#        one document"); a value in [0, 1) is a fraction of the documents.
cv = CountVectorizer(
    inputCol="nostops",
    outputCol="cv",
    vocabSize=30,
    minDF=1.0,
)

# CountVectorizer is an Estimator: fit() learns the vocabulary and returns a model.
cvModel = cv.fit(stopDf)

# Apply the fitted model; the count vectors are added as the 'cv' column.
cvDf = cvModel.transform(stopDf)
cvDf.show(n=3)

# Show only the columns relevant to the vectorization.
cvDf.select('sent', 'nostops', 'cv').show(n=20)

+--------------------+--------------------+--------------------+--------------------+
|                sent|               words|             nostops|                  cv|
+--------------------+--------------------+--------------------+--------------------+
|When I find mysel...|[when, i, find, m...|[find, times, tro...|(16,[5,6,8],[1.0,...|
|Mother Mary comes...|[mother, mary, co...|[mother, mary, co...|(16,[10,13,14],[1...|
|Speaking words of...|[speaking, words,...|[speaking, words,...|(16,[0,1,2,3],[1....|
+--------------------+--------------------+--------------------+--------------------+
only showing top 3 rows

+--------------------+--------------------+--------------------+
|                sent|             nostops|                  cv|
+--------------------+--------------------+--------------------+
|When I find mysel...|[find, times, tro...|(16,[5,6,8],[1.0,...|
|Mother Mary comes...|[mother, mary, co...|(16,[10,13,14],[1...|
|Speaking words of...|[speaking, words,...|(16,[

In [20]:
# TF-IDF, step 1: term frequencies via feature hashing.
from pyspark.ml.feature import HashingTF, IDF

# Each token is hashed into one of numFeatures buckets.
# NOTE(review): with only 50 buckets distinct tokens collide; in the printed
# rows index 24 shows up for sentences that share no tokens. Fine for a demo,
# but use a much larger numFeatures for real data.
hashTF = HashingTF(
    inputCol="nostops",
    outputCol="hash",
    numFeatures=50,
)

# Compute the hashed term-frequency vectors; they are added as the 'hash' column.
hashDf = hashTF.transform(stopDf)

# Show the hashed term frequencies next to their source tokens.
hashDf.select('sent', 'nostops', 'hash').show(n=20)

+--------------------+--------------------+--------------------+
|                sent|             nostops|                hash|
+--------------------+--------------------+--------------------+
|When I find mysel...|[find, times, tro...|(50,[10,24,43],[1...|
|Mother Mary comes...|[mother, mary, co...|(50,[1,21,24],[1....|
|Speaking words of...|[speaking, words,...|(50,[9,12,14,41],...|
|And in my hour of...|    [hour, darkness]|(50,[23,27],[1.0,...|
|She is standing r...|[standing, right,...|(50,[24,43,46],[1...|
|Speaking words of...|[speaking, words,...|(50,[9,12,14,41],...|
|        우리 Let it be|               [let]|     (50,[14],[1.0])|
|         나 Let it be|               [let]|     (50,[14],[1.0])|
|         너 Let it be|               [let]|     (50,[14],[1.0])|
|           Let it be|               [let]|     (50,[14],[1.0])|
|Whisper words of ...|[whisper, words, ...|(50,[9,14,15,41],...|
+--------------------+--------------------+--------------------+



In [26]:
# TF-IDF, step 2: rescale the hashed term frequencies by inverse document frequency.
idf = IDF(
    inputCol="hash",
    outputCol="idf",
)

# IDF is an Estimator: fit() on the term-frequency DataFrame returns a model,
# and transform() adds the rescaled vectors as the 'idf' column.
idfModel = idf.fit(hashDf)
idfDf = idfModel.transform(hashDf)

idfDf.show(n=20, truncate=True)

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                sent|               words|             nostops|                hash|                 idf|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|When I find mysel...|[when, i, find, m...|[find, times, tro...|(50,[10,24,43],[1...|(50,[10,24,43],[1...|
|Mother Mary comes...|[mother, mary, co...|[mother, mary, co...|(50,[1,21,24],[1....|(50,[1,21,24],[1....|
|Speaking words of...|[speaking, words,...|[speaking, words,...|(50,[9,12,14,41],...|(50,[9,12,14,41],...|
|And in my hour of...|[and, in, my, hou...|    [hour, darkness]|(50,[23,27],[1.0,...|(50,[23,27],[1.79...|
|She is standing r...|[she, is, standin...|[standing, right,...|(50,[24,43,46],[1...|(50,[24,43,46],[1...|
|Speaking words of...|[speaking, words,...|[speaking, words,...|(50,[9,12,14,41],...|(50,[9,12,14,41],...|
|        우리 Let it be|   [우리, let, it

In [27]:
# Print the first ten rows as Row objects to see the full (untruncated)
# token lists and their hashed term-frequency vectors.
_first_rows = idfDf.select("nostops", "hash").take(10)
for e in _first_rows:
    print(e)

Row(nostops=[u'find', u'times', u'trouble'], hash=SparseVector(50, {10: 1.0, 24: 1.0, 43: 1.0}))
Row(nostops=[u'mother', u'mary', u'comes'], hash=SparseVector(50, {1: 1.0, 21: 1.0, 24: 1.0}))
Row(nostops=[u'speaking', u'words', u'wisdom,', u'let'], hash=SparseVector(50, {9: 1.0, 12: 1.0, 14: 1.0, 41: 1.0}))
Row(nostops=[u'hour', u'darkness'], hash=SparseVector(50, {23: 1.0, 27: 1.0}))
Row(nostops=[u'standing', u'right', u'front'], hash=SparseVector(50, {24: 1.0, 43: 1.0, 46: 1.0}))
Row(nostops=[u'speaking', u'words', u'wisdom,', u'let'], hash=SparseVector(50, {9: 1.0, 12: 1.0, 14: 1.0, 41: 1.0}))
Row(nostops=[u'let'], hash=SparseVector(50, {14: 1.0}))
Row(nostops=[u'let'], hash=SparseVector(50, {14: 1.0}))
Row(nostops=[u'let'], hash=SparseVector(50, {14: 1.0}))
Row(nostops=[u'let'], hash=SparseVector(50, {14: 1.0}))
