In [1]:
import glob
import os
import sys

# Point PySpark at the cluster's Python interpreter and Spark installation,
# and pass the executor count through the submit arguments.
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"] = '--num-executors 3 pyspark-shell'

spark_home = os.environ["SPARK_HOME"]
spark_python_dir = os.path.join(spark_home, 'python')

# Discover the py4j archive shipped with this Spark installation instead of
# hard-coding its version, so the notebook keeps working after a Spark upgrade.
# Fall back to the previously pinned archive name when nothing is found.
py4j_candidates = sorted(glob.glob(os.path.join(spark_python_dir, 'lib', 'py4j-*-src.zip')))
if py4j_candidates:
    py4j_archive = py4j_candidates[-1]
else:
    py4j_archive = os.path.join(spark_python_dir, 'lib', 'py4j-0.10.7-src.zip')

# Same precedence as before: py4j archive first, then Spark's python directory.
sys.path.insert(0, spark_python_dir)
sys.path.insert(0, py4j_archive)

In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, pandas_udf, split, lower, udf
from pyspark.sql import functions as F
from pyspark.sql.types import LongType, StructType, StructField, IntegerType, StringType, DoubleType

In [3]:
# Start from an empty SparkConf; no settings are overridden here.
conf = SparkConf()

# Build the session for this notebook (or reuse one that already exists).
spark = (
    SparkSession.builder
    .config(conf=conf)
    .appName("Spark cossim text books")
    .getOrCreate()
)

In [4]:
# Display the SparkSession object as the cell output.
spark

## Постановка задачи
Имея текстовые описания (статьи), найти 10 наиболее похожих по содержанию для каждого id из списка:
courses_list = [16871, 12234, 9029, 18030, 8170, 1200]


## Грузим датасеты

In [5]:
# Our dataset in HDFS is 66.3 MB
!hdfs dfs -ls -h /labs/lab07data/DO_record_per_line.json

-rw-r--r--   2 hdfs hdfs     66.3 M 2020-09-30 12:22 /labs/lab07data/DO_record_per_line.json


In [6]:
# Peek at the first line of the file to see what a single record looks like.
!hdfs dfs -cat /labs/lab07data/DO_record_per_line.json | head -n1

{"lang": "en", "name": "Accounting Cycle: The Foundation of Business Measurement and Reporting", "cat": "3/business_management|6/economics_finance", "provider": "Canvas Network", "id": 4, "desc": "This course introduces the basic financial statements used by most businesses, as well as the essential tools used to prepare them. This course will serve as a resource to help business students succeed in their upcoming university-level accounting classes, and as a refresher for upper division accounting students who are struggling to recall elementary concepts essential to more advanced accounting topics. Business owners will also benefit from this class by gaining essential skills necessary to organize and manage information pertinent to operating their business. At the conclusion of the class, students will understand the balance sheet, income statement, and cash flow statement. They will be able to differentiate between cash basis and accrual basis techniques, and know when each is appro

In [7]:
# Explicit schema for the course records: (column name, Spark type) pairs,
# in the same order as the fields appear in the JSON records.
_schema_columns = [
    ("lang", StringType()),
    ("name", StringType()),
    ("cat", StringType()),
    ("provider", StringType()),
    ("id", IntegerType()),
    ("desc", StringType()),
]
df_schema = StructType([
    StructField(column_name, column_type)
    for column_name, column_type in _schema_columns
])

In [8]:
# HDFS path of the dataset (one JSON record per line)
df_dir = '/labs/lab07data/DO_record_per_line.json'

#  our targets: ids of the courses we need to find similar courses for
courses_list = [16871, 12234, 9029, 18030, 8170, 1200]


# "Similarity" in this case is a synonym for "correlation" of interests and can be computed in many
# ways (besides Pearson correlation there is also cosine distance,
# Jaccard distance, Hamming distance, etc.)

# We will use cosine distance (cosine similarity)

In [9]:
# Read the line-delimited JSON with the explicit schema and cache the result,
# since the frame is reused by several stages below.
df = spark.read.schema(df_schema).json(df_dir).cache()
df.printSchema()

root
 |-- lang: string (nullable = true)
 |-- name: string (nullable = true)
 |-- cat: string (nullable = true)
 |-- provider: string (nullable = true)
 |-- id: integer (nullable = true)
 |-- desc: string (nullable = true)



In [10]:
# df.show(2, vertical=True, truncate=False)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

## Попробуем для content-based similarity  HashingTF, CountVectorizer

In [11]:
from pyspark.ml.feature import CountVectorizer, HashingTF, IDF, StopWordsRemover
# from pyspark.ml.feature import StopWordsRemover

In [12]:
%%time
# самодельный токенайзер, сильно не мудрил, можно было использовать RegexTokenizer - было бы проще
corpus_regexp = r'[\t|\n|\$|\:|\*|\!|\?|\#|\:|\/|\\|,|\.|\(|\)|\[|\]|\{|\}|\"|\'|\$|\-|\+|\”|\“|\%|\¡|\¿|\&|\;|\s]'
df = df.withColumn('desc_tmp', lower(F.col('desc')))
df = df.withColumn('desc_tmp', F.split(F.col('desc_tmp'), corpus_regexp))
remover = StopWordsRemover(inputCol="desc_tmp", outputCol="corpus", stopWords=[""])
df = remover.transform(df)
# df = df['id', 'name', 'corpus']
df = df['id', 'name', 'lang', 'desc', 'corpus']
# df.show(3, vertical=True, truncate=False)

CPU times: user 46.3 ms, sys: 13.2 ms, total: 59.5 ms
Wall time: 623 ms


In [13]:
%%time
# векторайзер. Так как объем данных не большой и ресурсы позволяют, берем обычный CountVectorizer
# но можно и hashingTF использовать, он быстрее.

# hashingTF = HashingTF(inputCol="corpus", outputCol="tf", numFeatures=10000)
# tf = hashingTF.transform(df)

countTF = CountVectorizer(inputCol="corpus", outputCol="tf").fit(df)
tf = countTF.transform(df)

CPU times: user 25.3 ms, sys: 9 µs, total: 25.3 ms
Wall time: 19.5 s


In [14]:
# tf.show(3, vertical=True, truncate=False)

In [15]:
%%time
# idf
idf = IDF(inputCol="tf", outputCol="raw_feature").fit(tf)
tfidf = idf.transform(tf)

CPU times: user 11.1 ms, sys: 214 µs, total: 11.3 ms
Wall time: 14.6 s


In [16]:
# tfidf.show(3, vertical=True, truncate=False)

In [17]:
%%time
# нормализуем
from pyspark.ml.feature import Normalizer
normalizer = Normalizer(inputCol="raw_feature", outputCol="feature")
data = normalizer.transform(tfidf)

CPU times: user 5.84 ms, sys: 956 µs, total: 6.8 ms
Wall time: 21.8 ms


In [18]:
%%time
# The resulting features: show the first three (id, feature) rows in full.
data.select('id', 'feature').show(3, vertical=True, truncate=False)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

## Считаем косинусную близость текстов по разреженным (sparse) векторам. В обычной жизни для прода это слишком долго, но для холодного старта может подойти.

In [20]:
%%time
# задали udf и матрицу cos_sim, лениво вычисления будут ниже...
@udf
def sim_cos(v1,v2):
    try:
        p = 2
        return float(v1.dot(v2))/float(v1.norm(p)*v2.norm(p))
    except:
        return 0

# строим лениво квадратную матрицу таргет * на все id
# очень дорогая операция если делать за раз... Но т.к. размер малый то можно :)
cossimmatrix = data.alias("i").join(data.alias("j"), F.col("i.id") != F.col("j.id"))\
    .select(
        F.col("i.id").alias("id"),
        F.col("i.name").alias("i_name"),
        F.col("i.lang").alias("i_lang"),
        F.col("i.desc").alias("i_desc"),
        F.col("i.id").alias("i"), 
        F.col("j.id").alias("j"),
        sim_cos("i.feature", "j.feature").cast("float").alias("sim_cosine"),
        F.col("j.name").alias("j_name"),
        F.col("j.lang").alias("j_lang"),
        F.col("j.desc").alias("j_desc"))\
    .filter(F.col('id').isin(courses_list))\
    .sort("i", "j")


CPU times: user 18.7 ms, sys: 1.07 ms, total: 19.8 ms
Wall time: 418 ms


In [21]:
%%time
# тут спарк считает близость, самый тяжелый процесс
cossimmatrix.show(3, vertical=True, truncate=False)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [22]:
# Print the column names and types of the similarity matrix.
cossimmatrix.printSchema()

root
 |-- id: integer (nullable = true)
 |-- i_name: string (nullable = true)
 |-- i_lang: string (nullable = true)
 |-- i_desc: string (nullable = true)
 |-- i: integer (nullable = true)
 |-- j: integer (nullable = true)
 |-- sim_cosine: float (nullable = true)
 |-- j_name: string (nullable = true)
 |-- j_lang: string (nullable = true)
 |-- j_desc: string (nullable = true)



In [25]:
# The similarity scores we obtained: the 20 highest-scoring (i, j) pairs.
top_pairs = cossimmatrix.select('i', 'j', 'sim_cosine').orderBy(F.desc('sim_cosine'))
top_pairs.show(20)

+-----+-----+----------+
|    i|    j|sim_cosine|
+-----+-----+----------+
|18030| 3660|0.63669133|
|18030| 8098|0.63669133|
|18030|26336| 0.6249196|
|18030|26670|0.62393713|
|18030|20763|0.61926955|
|18030|17838| 0.6167861|
|18030| 7944|0.61545837|
|18030|21053|0.61187696|
|18030|  387|0.60660166|
|18030| 4096| 0.6064375|
|18030| 6864| 0.6050712|
|18030|17200| 0.5963444|
|18030|21337| 0.5952187|
|18030|13275| 0.5937647|
|18030|22680| 0.5921054|
|18030|22284| 0.5873646|
|18030|12413| 0.5872974|
|18030|10035| 0.5859689|
|18030|16924|0.58521444|
|18030|13102|0.58461344|
+-----+-----+----------+
only showing top 20 rows



In [26]:
# Pick the top 10 similarity scores for each target course.
#
# courses_list holds plain integer ids, so each element is used directly as
# the course id (indexing it as elem[0] / elem[1] raises TypeError on an int).
# Candidates are restricted to the target's own language by comparing j_lang
# with i_lang from the same row.
# NOTE(review): presumably this is the language an earlier (id, lang) form of
# courses_list supplied as elem[1]; confirm.
output = {}
for course_id in courses_list:
    top_rows = (
        cossimmatrix
        .where(F.col('id') == course_id)
        .where(F.col('j_lang') == F.col('i_lang'))
        # Ties on the score are broken by name and then id, so the result
        # is deterministic.
        .orderBy(F.desc('sim_cosine'),
                 F.asc('j_name'),
                 F.asc('j'))
        .limit(10)
        .select('j')
        .collect()
    )
    output[course_id] = [row['j'] for row in top_rows]
output
# {16871: [20182, 19809, 12363, 12952, 20534, 13127, 20183, 19810, 13125, 7397],
#  12234: [2164, 2162, 23256, 2161, 8101, 3745, 164, 3146, 12384, 15925],
#  9029: [23114, 6864, 3660, 8098, 22680, 21400, 26336, 26670, 4096, 4743],
#  18030: [3660, 8098, 26336, 26670, 20763, 17838, 7944, 21053, 4096, 387],
#  8170: [1311, 8169, 1310, 20352, 1305, 1325, 13685, 8007, 17127, 867],
#  1200: [1208, 8212, 19419, 1209, 1187, 1204, 923, 1004, 20347, 1343]}

{16871: [20182, 19809, 12363, 12952, 20534, 13127, 20183, 19810, 13125, 7397],
 12234: [2164, 2162, 23256, 2161, 8101, 3745, 164, 3146, 12384, 2160],
 9029: [23114, 6864, 3660, 8098, 22680, 21400, 26336, 26670, 4096, 23629],
 18030: [3660, 8098, 26336, 26670, 20763, 17838, 7944, 21053, 387, 4096],
 8170: [1311, 8169, 1310, 20352, 1305, 1325, 13685, 8007, 17127, 867],
 1200: [1208, 8212, 1204, 1209, 1187, 19419, 1004, 20347, 923, 1343]}

## Имеем чекер на стороне по скрытой выборке

[[1200, 0.6], [8170, 0.7], [9029, 0.9], [12234, 0.7], [16871, 0.9], [18030, 0.7]] 
Достаточно точно

In [27]:
import json

In [28]:
# Save the result to a file; it has to be put on the server and handed to the checker.
serialized_output = json.dumps(output) + '\n'
with open('to_chk.json', 'wt') as out_file:
    out_file.write(serialized_output)

In [29]:
# Stop the Spark session now that the results have been written.
spark.stop()