In [1]:
import os
import sys
# Environment read by PySpark at start-up: the Python interpreter to use,
# the Spark installation directory, and the arguments handed to spark-submit.
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"] = '--num-executors 2 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Make the PySpark sources and the py4j archive under SPARK_HOME importable.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Run PySpark's interactive bootstrap script in this namespace.
# The script is read inside a `with` block so the file handle is closed
# deterministically, and compiled with its real path so any traceback raised
# during start-up points at shell.py instead of "<string>".
shell_path = os.path.join(spark_home, 'python/pyspark/shell.py')
with open(shell_path) as shell_file:
    shell_source = shell_file.read()
exec(compile(shell_source, shell_path, 'exec'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark import Row
import json

# Spark configuration object; no options are set on it here.
conf = SparkConf()

# Build (or fetch the already-running) session under the app name "test".
session_builder = SparkSession.builder.config(conf=conf).appName("test")
spark = session_builder.getOrCreate()

# Expose the session's SparkContext as `sc`.
sc = spark.sparkContext

In [3]:
# Target courses to build recommendations for, one entry per course:
#   [0] course id   - matched against the 'id' column later in the notebook
#   [1] language    - matched against the 'lang' column later in the notebook
#   [2] presumably the course title; it is only printed, never used for matching
my_var = [
    [23126, u'en', u'Compass - powerful SASS library that makes your life easier'], 
    [21617, u'en', u'Preparing for the AP* Computer Science A Exam \u2014 Part 2'], 
    [16627, u'es', u'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'], 
    [11556, u'es', u'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'], 
    [16704, u'ru', u'\u041f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u0435 \u043d\u0430 Lazarus'], 
    [13702, u'ru', u'\u041c\u0430\u0442\u0435\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u0430']]

In [4]:
import pyspark.sql.functions as F

# Read the JSON course catalogue and keep only rows whose 'lang' is one of
# the three languages listed below.
courses_path = '/labs/slaba02/DO_record_per_line.json'
kept_langs = ['en', 'es', 'ru']
courses = (
    spark.read
    .format('json')
    .load(courses_path)
    .filter(F.col('lang').isin(kept_langs))
)

In [5]:
from pyspark.sql.types import ArrayType, StringType, IntegerType
import re

# Compiled once at definition time instead of on every UDF call.
# Raw string: '\w' / '\d' are regex escapes, not Python string escapes.
_TOKEN_PATTERN = re.compile(r'[\w\d]{2,}', re.U)


def my_findall(s):
    """Tokenize text into lower-cased words of at least two characters.

    Args:
        s: Input text, or ``None`` (e.g. a null value in the source column).

    Returns:
        list of str: every non-overlapping run of two or more Unicode word
        characters (letters, digits, underscore) found in ``s.lower()``,
        in order of appearance. ``None`` input yields an empty list rather
        than raising, so a single null row cannot fail the whole job.
    """
    if s is None:
        return []
    return _TOKEN_PATTERN.findall(s.lower())

# Wrap the tokenizer as a Spark UDF whose declared result type is array<string>.
token_array_type = ArrayType(StringType())
udf_my_findall = F.udf(my_findall, token_array_type)

# Add the token list of each 'desc' value as the 'desc_regex' column.
courses = courses.withColumn('desc_regex', udf_my_findall(F.col('desc')))

In [6]:
from pyspark.ml.feature import HashingTF, IDF

# Hash each token list into a 10000-slot term-frequency vector ('tf').
hashingTF = HashingTF(inputCol='desc_regex', outputCol='tf', numFeatures=10000)

# Cached because the same frame is used both to fit the IDF model and to
# be transformed by it.
tf = hashingTF.transform(courses).cache()

# Fit IDF on the term frequencies, then produce the 'tfidf' column.
idf_estimator = IDF(inputCol='tf', outputCol='tfidf')
idf = idf_estimator.fit(tf)
tfidf = idf.transform(tf)

In [7]:
def my_dot(array):
    """Return the dot product of the first two items of ``array`` as a float.

    Args:
        array: Indexable holding at least two objects; the first must expose
            a ``.dot(other)`` method that accepts the second.

    Returns:
        float: ``array[0].dot(array[1])`` converted with ``float()``.
    """
    lhs = array[0]
    rhs = array[1]
    product = lhs.dot(rhs)
    return float(product)

# Spark UDF wrapper around my_dot; the result column is declared as double.
dot_udf = F.udf(my_dot, DoubleType())

# Maps each target course id to the ids of its (up to) 10 best-ranked courses.
dict_to_json = {}

for item in my_var:
    print(item)
    item_id, item_lang = item[0], item[1]

    # TF-IDF vector of the target course, renamed to 'tfidf_cur' so it can sit
    # next to each candidate's own 'tfidf' column after the join.
    target_vec = (
        tfidf
        .filter(F.col('id') == item_id)
        .select(F.col('tfidf').alias('tfidf_cur'))
    )

    # Candidates: every other course in the same language, each paired with
    # the target vector through a cross join.
    same_lang_not_self = (F.col('lang') == item_lang) & (F.col('id') != item_id)
    candidates = (
        tfidf
        .filter(same_lang_not_self)
        .select('id', 'name', 'tfidf')
        .crossJoin(target_vec)
    )

    # Pieces of the cosine similarity: the dot product of the two vectors and
    # the Euclidean norm of each (square root of a vector dotted with itself).
    scored = candidates.select(
        'id',
        'name',
        dot_udf(F.array('tfidf', 'tfidf_cur')).alias('dot'),
        F.sqrt(dot_udf(F.array('tfidf', 'tfidf'))).alias('norm1'),
        F.sqrt(dot_udf(F.array('tfidf_cur', 'tfidf_cur'))).alias('norm2'),
    )

    # Rank by similarity (highest first); ties are broken by name, then id.
    ranked = (
        scored
        .selectExpr('id', 'name', 'dot / norm1 / norm2 as cos_sim')
        .orderBy(F.col('cos_sim').desc(), F.col('name').asc(), F.col('id').asc())
    )

    top_rows = ranked.select('id').limit(10).collect()
    dict_to_json[item_id] = [row[0] for row in top_rows]

[23126, 'en', 'Compass - powerful SASS library that makes your life easier']
[21617, 'en', 'Preparing for the AP* Computer Science A Exam — Part 2']
[16627, 'es', 'Aprende Excel: Nivel Intermedio by Alfonso Rinsche']
[11556, 'es', 'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo']
[16704, 'ru', 'Программирование на Lazarus']
[13702, 'ru', 'Математическая экономика']


In [8]:
# Show id, name and full description for a hand-picked list of course ids,
# one field per line and without truncation.
inspect_ids = [864, 21079, 8313, 1041, 28074, 8300, 1033, 13057, 21025, 1111]
(
    courses
    .filter(F.col('id').isin(inspect_ids))
    .select('id', 'name', 'desc')
    .show(10, vertical=True, truncate=False)
)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [9]:
# Display the collected mapping (target course id -> list of course ids) as the cell output.
dict_to_json

{23126: [14760, 13665, 13782, 20638, 24419, 15909, 2724, 25782, 17499, 13348],
 21617: [21609, 21616, 21608, 22298, 21630, 21628, 21623, 21081, 19417, 21508],
 16627: [11431, 11575, 12247, 17964, 5687, 17961, 16694, 12660, 25010, 5558],
 11556: [16488, 468, 13461, 23357, 19330, 7833, 9289, 10447, 22710, 10384],
 16704: [1236, 1247, 1365, 1273, 20288, 1164, 8186, 1233, 8203, 18331],
 13702: [864, 21079, 8313, 1041, 28074, 8300, 1033, 13057, 21025, 1111]}

In [10]:
import json
# Write the mapping to lab02.json with 4-space indentation.
# Note: JSON object keys are always strings, so the integer ids used as
# dictionary keys are written as strings.
with open('lab02.json', 'w') as out_file:
    out_file.write(json.dumps(dict_to_json, indent=4))

In [11]:
# Shut down the SparkContext; Spark operations on `spark` / `sc` cannot be run after this call.
sc.stop()