In [425]:
import re
import string

import pyspark
from pecab import PeCab
from pyspark.sql import SparkSession

In [426]:
# Master URL: local[*] uses every available core, local[4] would use exactly four.
# SparkSession lives in pyspark.sql; a bare `import pyspark` does not bind that name.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("WordCount")
    .getOrCreate()
)

# SparkContext of the session, used for the RDD API (sc.textFile).
sc = spark.sparkContext

In [427]:
# Relative path of the article text file to analyse.
article_source = "./기사3.txt"
# Load the file as an RDD of strings (textFile yields one element per line).
article_rdd = sc.textFile(article_source)
# Debug peek at the loaded lines (disabled):
# article_rdd.take(100)

In [428]:
# Remove the spaces inside single-quoted spans
def rm_space_in_quote(x):
    """Return ``x`` with spaces removed inside every ``'...'`` span.

    Only the ASCII apostrophe (``'``) delimits a span and only the plain
    space character is removed; the quotes themselves and all text outside
    them are left untouched.

    Args:
        x: One line of text (``str``).

    Returns:
        The rewritten line, e.g. ``"a 'b c' d"`` becomes ``"a 'bc' d"``.
    """
    # ``x`` is already a str, so no encode/decode round trip is needed;
    # re.sub rewrites each quoted match in place via the callback.
    return re.sub(r"'[^']*'", lambda m: m.group().replace(" ", ""), x)

# Apply the quoted-span clean-up to every element of the RDD.
article_rdd = article_rdd.map(rm_space_in_quote)

In [429]:
# Build user-dictionary candidates
def make_user_dict(x):
    """Return the contents of every single-quoted span in ``x``.

    Args:
        x: One line of text (``str``).

    Returns:
        A list with the text found between each pair of ASCII apostrophes,
        in order of appearance, without the quotes. The list is empty when
        the line has no quoted span; an empty pair ``''`` yields ``""``.
    """
    # ``x`` is already a str, so it is searched directly; the non-greedy
    # group stops at the first closing quote.
    return re.findall(r"'(.*?)'", x)

# Seed entry that is always present, whether or not the article quotes it.
user_dict = ["뉴진스"]
# Per-line lists of quoted phrases.
user_words = article_rdd.map(make_user_dict)
non_empty_lists = user_words.filter(lambda x: len(x) > 0)
# One quoted phrase per element.
flattened_lists = non_empty_lists.flatMap(lambda x: x)
quoted_terms = flattened_lists.map(lambda x: x.replace(" ", "")).collect()
# Drop empty entries (produced by a bare '' in the text) and repeated phrases.
# dict.fromkeys keeps the first occurrence of each term, so the seed stays first.
user_dict = list(dict.fromkeys(term for term in user_dict + quoted_terms if term))

In [430]:
# Create the PeCab instance, passing the collected user-defined words as user_dict.
pecab = PeCab(user_dict=user_dict)

In [431]:
# 특수문자 제거
article_rdd = article_rdd.map(clean_str)

In [432]:
def clean_str(x):
    punc = string.punctuation
    for ch in punc:
        x = x.replace(ch, '')
    return x

In [433]:
# Split every line into words with pecab.nouns.
# flatMap returns a new RDD by first applying a function to all elements of this RDD,
# and then flattening the results, so each returned word becomes its own element.
article_rdd = article_rdd.flatMap(lambda article: pecab.nouns(article))

In [434]:
# Drop empty-string tokens (only '' is removed; whitespace-only strings would pass).
article_rdd = article_rdd.filter(lambda x: x!='')

In [435]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np

# NOTE(review): article_rdd was flattened with flatMap, so every "document"
# passed to the vectorizer below is a single extracted word — confirm that this
# is the intended document granularity for TF-IDF.
vect = CountVectorizer()
document_term_matrix = vect.fit_transform(list(article_rdd.toLocalIterator()))

# Term-frequency table: one row per document, one column per vocabulary term.
tf = pd.DataFrame(document_term_matrix.toarray(), columns=vect.get_feature_names_out())

D = len(tf)                                  # number of documents
df = tf.astype(bool).sum(axis=0)             # document frequency of each term
idf = np.log((D+1) / (df+1)) + 1             # smoothed IDF (Inverse Document Frequency)

# TF-IDF (Term Frequency-Inverse Document Frequency)
tfidf = tf * idf

# L2-normalise each row. A document with no counted term (CountVectorizer's
# default token pattern ignores one-character tokens) has norm 0; dividing by
# it would write NaN to the CSV, so such rows are divided by 1 and stay all-zero.
row_norms = np.linalg.norm(tfidf.to_numpy(), axis=1)
row_norms[row_norms == 0] = 1.0
tfidf = tfidf.div(row_norms, axis=0)

# utf-8-sig writes a BOM at the start of the file.
tfidf.to_csv('output.csv', encoding = 'utf-8-sig')

In [None]:
# # Drop empty-string tokens
# article_rdd = article_rdd.filter(lambda x: x!='')

# # Pair every word with a count of 1
# article_count = article_rdd.map(lambda word:(word,1))

# # Reduce By Key
# # Sum the counts for each key (word)
# article_count_RBK = article_count.reduceByKey(lambda x,y: (x+y)).sortByKey()

# # Swap (key, value) -> (value, key)
# article_count_RBK = article_count_RBK.map(lambda x:(x[1],x[0]))

# # Take the 10 entries with the highest counts (descending sort by key)
# article_count_RBK.sortByKey(False).take(10)