In [1]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
import os
class SparkSessionBases(object):
 
    SPARK_APP_NAME = None 
    SPARK_URL = 'yarn'
    SPARK_EXECUTOR_MEMORY = '2g' 
    SPARK_EXECUTOR_CORES = 2
    SPARK_EXECUTOR_INSTANCES = 2
    ENABLE_HIVE_SUPPORT = False

    def _create_spark_session(self):
        conf = SparkConf()
        config = (
            ('spark.app.name',self.SPARK_APP_NAME),
            ('spark.executor.memory',self.SPARK_EXECUTOR_MEMORY),
            ('spark.master',self.SPARK_URL),
            ('spark.executor.cores',self.SPARK_EXECUTOR_CORES),
            ('spark.executor.instances',self.SPARK_EXECUTOR_INSTANCES),
            )
        conf.setAll(config)
        if self.ENABLE_HIVE_SUPPORT:
            return SparkSession.builder.config(conf=conf).enableHiveSupport().getOrCreate()
        else:
            return SparkSession.builder.config(conf=conf).getOrCreate()

In [2]:
import os
import sys
from pyspark.ml.feature import CountVectorizer,IDF
from pyspark.ml.feature import CountVectorizerModel

BASE_DIR = os.path.dirname(os.getcwd())
sys.path.insert(0,os.path.join(BASE_DIR))
PYSPARK_PYTHON = '/miniconda2/envs/reco_sys/bin/python'

os.environ['PYSPARK_PYTHON'] = PYSPARK_PYTHON
os.environ['PYSPARK_DRIVER_PYTHON'] = PYSPARK_PYTHON

# from config import SparkSessionBases
class KeywordsToTfidf(SparkSessionBases):
    SPARK_APP_NAME = 'keywordByTFIDF'
    SPARK_URL = 'spark://master:7077'
    ENABLE_HIVE_SUPPORT = True
    SPARK_EXECUTOR_MEMORY = '6g'
    SPARK_EXECUTOR_CORES = 4
    SPARK_EXECUTOR_INSTANCES = 8

    def __init__(self):
        self.spark = self._create_spark_session()

ktt = KeywordsToTfidf()


In [3]:
ktt.spark.sql('use article')


def segmentation(partition):
    import os
    import re
    import jieba
    import jieba.analyse
    import jieba.posseg
    import jieba.posseg as pseg
    import codecs

    abspath = '/root/words'
    userDict_path = os.path.join(abspath,'ITKeywords.txt')
    jieba.load_userdict(userDict_path)

    stopwords_path = os.path.join(abspath,"stopwords.txt")
    def get_stopwords_list():
        stopwords_list = [i.strip() for i  in codecs.open(stopwords_path).readline()]
        return stopwords_list
    stopwords_list = get_stopwords_list()

    def cut_sentence(sentence):
        seg_list = pseg.lcut(sentence)
        seg_list = [i for i in seg_list if i.flag not in stopwords_list]
        filtered_word_list = []
        for seg in seg_list:
            if len(seg.word) <= 1:
                continue
            elif seg.flag == 'eng':
                if len(seg.word) <= 2:
                    continue
                else:
                    filtered_word_list.append(seg.word)
            elif seg.flag.startswith('n'):
                filtered_word_list.append(seg.word)
            elif seg.flag in ['x','eng']:
                filtered_word_list.append(seg.word)
        return filtered_word_list

    for row in partition:
        sentence = re.sub("<.*?>","",row.sentence)
        words = cut_sentence(sentence)
        yield row.article_id,row.channel_id,words

article_dataframe = ktt.spark.sql('select * from article_data')
words_df = article_dataframe.rdd.mapPartitions(segmentation).toDF(['article_id','channel_id','words'])
# print(words_df.collect())

In [4]:
words_df

DataFrame[article_id: bigint, channel_id: bigint, words: array<string>]

In [5]:
# words_df.collect()

In [6]:
cv_model = CountVectorizerModel.load("hdfs://master:9000/headlines/model/CV.model")
print(cv_model)

CountVectorizer_49d688d848df461899b4


In [7]:
cv_result = cv_model.transform(words_df)

In [8]:
print(cv_result)

DataFrame[article_id: bigint, channel_id: bigint, words: array<string>, countFeatures: vector]


In [9]:
# from pyspark.ml.feature import IDF
# idf = IDF(inputCol="countFeatures", outputCol="idfFeatures")
# idfModel = idf.fit(cv_result)
# idfModel.write().overwrite().save("hdfs://master:9000/headlines/models/IDF.model")

In [10]:
from pyspark.ml.feature import IDFModel
idf_model = IDFModel.load("hdfs://master:9000/headlines/model/IDF.model")
print(idf_model)

IDF_46ba8959a2dc62a6a6f0


In [11]:
# view of cv_model
# print(type(cv_model))
# help(cv_model)
# cv_model.vocabulary

In [12]:
idf_model.idf.toArray()

array([ 1.41793052,  0.66523945,  0.80716005, ..., 11.14709445,
       11.14709445, 11.14709445])

In [13]:
keywords_list_with_idf = list(zip(cv_model.vocabulary, idf_model.idf.toArray()))

In [14]:
keywords_list_with_idf_1 = keywords_list_with_idf

In [15]:
print(type(keywords_list_with_idf_1))

<class 'list'>


In [16]:
# import spark
def func(data):
    for index in range(len(data)):
        data[index] = list(data[index])
        data[index].append(index)
        data[index][1] = float(data[index][1])
func(keywords_list_with_idf)
sc = ktt.spark.sparkContext
rdd = sc.parallelize(keywords_list_with_idf)
df = rdd.toDF(["keywords", "idf", "index"])

df.write.insertInto('idf_keywords_values')

In [17]:
tfidf_result = idf_model.transform(cv_result)
tfidf_result

DataFrame[article_id: bigint, channel_id: bigint, words: array<string>, countFeatures: vector, idfFeatures: vector]

In [18]:
cv_result.show(1)

+----------+----------+--------------------+--------------------+
|article_id|channel_id|               words|       countFeatures|
+----------+----------+--------------------+--------------------+
|         1|        17|[Vue, props, 用法, ...|(1234576,[1,2,3,5...|
+----------+----------+--------------------+--------------------+
only showing top 1 row



In [19]:
tfidf_result.show(1)

+----------+----------+--------------------+--------------------+--------------------+
|article_id|channel_id|               words|       countFeatures|         idfFeatures|
+----------+----------+--------------------+--------------------+--------------------+
|         1|        17|[Vue, props, 用法, ...|(1234576,[1,2,3,5...|(1234576,[1,2,3,5...|
+----------+----------+--------------------+--------------------+--------------------+
only showing top 1 row



In [20]:
# tfidf_result.describe(['channel_id']).show()

In [21]:
# tfidf_result.head(1)

In [22]:
tfidf_result.first()

Row(article_id=1, channel_id=17, words=['Vue', 'props', '用法', '小结', 'Vue', 'props', '用法', '组件', '选项', 'props', 'Vue', '选项', '父子', '组件', '关系', '总结', 'props', 'down', 'events', '组件', 'props', '传递数据', '组件', '组件', 'events', '组件', '发送消息', '父子', '组件', '组件', 'pa', 'rent', 'child', '组件', '环境', '书写', '组件', '可维护性', '定义', '父子', '组件', 'Vue', '对象', 'var', 'childNode', 'template', 'div', 'childNode', 'div', 'var', 'pa', 'rentNode', 'template', 'div', 'child', 'child', 'child', 'child', 'div', 'components', 'child', 'childNode', '全栈', '交流', 'Ian', '人员', '技术', '瓶颈', '思维能力', 'new', 'Vue', 'example', 'components', 'pa', 'rent', 'pa', 'rentNode', 'div', 'example', 'pa', 'rent', 'pa', 'rent', 'div', 'childNode', '定义', 'template', 'div', '内容', 'childNode', '字符串', 'pa', 'rentNode', 'template', '定义', 'div', 'class', 'pa', 'rent', 'child', '组件', '静态', 'props', '组件', '实例', '作用域', '组件', '模板', '饮用', '组件', '数据', '组件', '组件', '数据', '组件', 'props', '选项', '组件', '向子', '组件', '传递数据', '方式', '动态', '静态', '静态', '方式', '组件', '

In [23]:
tfidf_result.select('idfFeatures').show(1)

+--------------------+
|         idfFeatures|
+--------------------+
|(1234576,[1,2,3,5...|
+--------------------+
only showing top 1 row



In [31]:
def func(partition):
    TOPK=20
    for row in partition:
        _ = list(zip(row.idfFeatures.indices,row.idfFeatures.values))
        _ = sorted(_,key=lambda x:x[1],reverse=True)
        result = _[:TOPK]
        for word_index, tfidf in result:
            yield row.article_id,row.channel_id ,int(word_index),round(float(tfidf),4)
_keywordsByTFIDF = tfidf_result.rdd.mapPartitions(func).toDF(['article_id','channel_id','index','tfidf'])
# _keywordsByTFIDF = tfidf_result.rdd.mapPartitions(func).toDF(['article_id','channel_id','index','tfidf'])

In [28]:
tfidf_result.limit(1).printSchema()

root
 |-- article_id: long (nullable = true)
 |-- channel_id: long (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- countFeatures: vector (nullable = true)
 |-- idfFeatures: vector (nullable = true)



In [32]:
_keywordsByTFIDF.show(10)

+----------+----------+------+--------+
|article_id|channel_id| index|   tfidf|
+----------+----------+------+--------+
|         1|        17| 98468|256.3832|
|         1|        17|118420|200.6477|
|         1|        17|   515|174.1587|
|         1|        17| 20162|141.6591|
|         1|        17|   591|126.9927|
|         1|        17|    62|125.4204|
|         1|        17|    45|101.2493|
|         1|        17|   391| 85.5751|
|         1|        17|   492| 56.6672|
|         1|        17|  5647| 51.6391|
+----------+----------+------+--------+
only showing top 10 rows



In [26]:
# from pyspark.sql.types import StructField,StringType,FloatType,StructType
# import time
# start = time.time()
# tfidf_result.limit(100).rdd.map(lambda x:(x[0],x[1],x[2])).toDF(['a','b','c']).show()
# end = time.time()
# print(end-start)

+---+---+--------------------+
|  a|  b|                   c|
+---+---+--------------------+
|  1| 17|[Vue, props, 用法, ...|
|  2| 17|[vue, 响应式, 原理, mo...|
|  3| 17|[JavaScript, 浅拷贝,...|
|  4| 17|[vue2, vuex, elem...|
|  5| 17|[immutability, Re...|
|  6| 17|[node, npm, cnpm,...|
|  7| 17|[Web, 工程师, 以太坊, 入...|
|  8| 17|[Web, pa, api, we...|
|  9| 17|[vue, 中用, 数据驱动, 视...|
| 10| 17|[程序, WebSocket, 长...|
| 11| 17|[flux, 架构, flux, ...|
| 12| 17|[合格, TypeScript, ...|
| 13| 17|[专属, 插件, Easy, Sl...|
| 14| 17|[前后端分离, vue, 网站前台...|
| 15| 17|[ajax, 页面, 重复提交, ...|
| 17| 17|[JSsearch, 购物网站, ...|
| 18| 17|[web, pa, react, ...|
| 19| 17|[合格, 事顶, 项目, 自我介绍...|
| 20| 17|[总结, jQuery, 用法, ...|
| 21| 17|[Bootstrap, Modal...|
+---+---+--------------------+
only showing top 20 rows

164.63382411003113


In [38]:
keywordsIndex = ktt.spark.sql('select keyword,index idx from idf_keywords_values')
keywordsByTFIDF = _keywordsByTFIDF.join(keywordsIndex,keywordsIndex.idx ==_keywordsByTFIDF.index).select(['article_id','channel_id','keyword','tfidf'])

In [39]:
keywordsByTFIDF.show(1)

+----------+----------+-------+-------+
|article_id|channel_id|keyword|  tfidf|
+----------+----------+-------+-------+
|         3|        17|    var|22.4825|
+----------+----------+-------+-------+
only showing top 1 row



In [40]:
keywordsByTFIDF.write.insertInto('tfidf_keywords_values')

In [52]:
def textrank(partition):
    import os
    import jieba
    import jieba.analyse
    import jieba.posseg as pesg
    import codecs
    abspath = '/root/words'
    
    userDict_path = os.path.join(abspath,'ITKeywords.txt')
    jieba.load_userdict(userDict_path)
    
    stopwords_path = os.path.join(abspath,'stopwords.txt')
    
    def get_stopwords_list():
        stopwords_list = [i.strip() for i in codecs.open(stopwords_path).readlines()]
        return stopwords_list
    stopwords_list = get_stopwords_list()
    
    class TextRank(jieba.analyse.TextRank):
        def __init__(self,window=20,word_min_len=2):
            super(TextRank,self).__init__()
            self.span = window
            self.word_min_len = word_min_len
            self.pos_filt = frozenset(('n', 'x', 'eng', 'f', 's', 't', 'nr', 'ns', 'nt', "nw", "nz", "PER", "LOC", "ORG"))
        def pairfilter(self,wp):
            if wp.flag == 'eng':
                if len(wp.word) <= 2:
                    return False
            if wp.flag in self.pos_filt and len(wp.word.strip()) >= self.word_min_len and wp.word.lower() not in stopwords_list:
                return True
    textrank_model = TextRank(window=5,word_min_len=2)
    allowPOS = ('n', "x", 'eng', 'nr', 'ns', 'nt', "nw", "nz", "c")
    
    for row in partition:
        tags = textrank_model.textrank(row.sentence,topK=20,withWeight=True,allowPOS=allowPOS,withFlag=False)
        for tag in tags:
            yield row.article_id,row.channel_id,tag[0],tag[1]

In [53]:
textrank_keywords_df = article_dataframe.rdd.mapPartitions(textrank).toDF(['article_id','channel_id','keyword','textrank'])

In [54]:
textrank_keywords_df.write.insertInto('textrank_keywords_values')

In [55]:
idf = ktt.spark.sql('select * from idf_keywords_values')
idf = idf.withColumnRenamed('keyword','keyword1')
result = textrank_keywords_df.join(idf,textrank_keywords_df.keyword == idf.keyword1)
keywords_res = result.withColumn('weights',result.textrank*result.idf).select(['article_id','channel_id','keyword','weights'])


In [64]:
keywords_res.registerTempTable('temptable')
merge_keywords = ktt.spark.sql('select article_id,min(channel_id) channel_id,collect_list(keyword) keywords,collect_list(weights) weights from temptable group by article_id')


In [58]:
def _func(row):
    return row.article_id,row.channel_id,dict(zip(row.keywords,row.weights))
keywords_info = merge_keywords.rdd.map(_func).toDF(['article_id','channel_id','keywords'])


In [62]:
topic_sql = """select t.article_id article_id2,collect_set(t.keywords) topics from tfidf_keywords_values t inner join textrank_keywords_values r where t.keywords=r.keyword group by article_id2"""
article_topics = ktt.spark.sql(topic_sql)



In [66]:
article_profile = keywords_info.join(article_topics,keywords_info.article_id==article_topics.article_id2).select(['article_id','channel_id','keywords','topics'])


In [None]:
articleprofile.write.insertInto('article_profile')