In [1]:
from IPython.display import IFrame, Image

In [2]:
import os
import sys
# Environment for PySpark: worker interpreter, Spark installation and the
# arguments handed to spark-submit.
os.environ.update({
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
    "PYSPARK_SUBMIT_ARGS": '--num-executors 3 pyspark-shell',
})

spark_home = os.environ.get('SPARK_HOME', None)

# Prepend the Python sources under SPARK_HOME and the py4j archive to the
# import path (py4j ends up first, the `python` directory right after it).
sys.path[:0] = [
    os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'),
    os.path.join(spark_home, 'python'),
]

In [3]:
from pyspark import SparkContext, SparkConf

# Build the configuration (only the application name is set explicitly)
# and start the Spark context with it.
conf = SparkConf().set("spark.app.name", "valeria.lupanova Spark RDD app")

sc = SparkContext(conf=conf)

In [4]:
# Display every (key, value) pair of the running context's configuration.
sc.getConf().getAll()

[('spark.history.kerberos.keytab', 'none'),
 ('spark.eventLog.enabled', 'true'),
 ('spark.submit.pyFiles',
  '/data/home/valeria.lupanova/.ivy2/jars/org.apache.spark_spark-sql-kafka-0-10_2.11-2.4.5.jar,/data/home/valeria.lupanova/.ivy2/jars/graphframes_graphframes-0.7.0-spark2.4-s_2.11.jar,/data/home/valeria.lupanova/.ivy2/jars/databricks_spark-sklearn-0.2.3.jar,/data/home/valeria.lupanova/.ivy2/jars/org.apache.kafka_kafka-clients-2.0.0.jar,/data/home/valeria.lupanova/.ivy2/jars/org.spark-project.spark_unused-1.0.0.jar,/data/home/valeria.lupanova/.ivy2/jars/org.lz4_lz4-java-1.4.0.jar,/data/home/valeria.lupanova/.ivy2/jars/org.xerial.snappy_snappy-java-1.1.7.3.jar,/data/home/valeria.lupanova/.ivy2/jars/org.slf4j_slf4j-api-1.7.16.jar'),
 ('spark.history.ui.port', '18081'),
 ('spark.driver.extraLibraryPath',
  '/usr/hdp/current/hadoop-client/lib/native:/usr/hdp/current/hadoop-client/lib/native/Linux-amd64-64'),
 ('spark.driver.appUIAddress', 'http://bd-master.newprolab.com:4051'),
 ('spar

In [5]:
from pyspark.sql import SparkSession

# Wrap the existing SparkContext in a SparkSession; `spark.read` is used below
# to load the input data.
spark = SparkSession(sc)

In [6]:
# List the input file in HDFS (IPython shell escape).
!hdfs dfs -ls /labs/lab07data/DO_record_per_line.json

-rw-r--r--   2 hdfs hdfs   69519728 2020-09-30 12:22 /labs/lab07data/DO_record_per_line.json


In [7]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType

# Explicit schema for the course records: every field is nullable, `id` is an
# integer and all remaining fields are strings.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ('cat', StringType()),
        ('desc', StringType()),
        ('id', IntegerType()),
        ('lang', StringType()),
        ('name', StringType()),
        ('provider', StringType()),
    ]
])

In [8]:
# Load the course catalogue with the explicit schema; multi-line JSON parsing
# is switched off explicitly.
target_ = spark.read.json(
    "/labs/lab07data/DO_record_per_line.json",
    schema=schema,
    multiLine=False,
)

In [9]:
# Preview the first rows of the loaded frame.
target_.show()

+--------------------+--------------------+---+----+--------------------+--------------+
|                 cat|                desc| id|lang|                name|      provider|
+--------------------+--------------------+---+----+--------------------+--------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|
|              11/law|This online cours...|  5|  en|American Counter ...|Canvas Network|
|5/computer_scienc...|This course is ta...|  6|  fr|Arithmétique: en ...|Canvas Network|
|  14/social_sciences|We live in a digi...|  7|  en|Becoming a Dynami...|Canvas Network|
|2/biology_life_sc...|This self-paced c...|  8|  en|           Bioethics|Canvas Network|
|9/humanities|15/m...|This game-based c...|  9|  en|College Foundatio...|Canvas Network|
|  14/social_sciences|What’s in your di...| 10|  en|Digital Literacies I|Canvas Network|
|  14/social_sciences|The goal of the D...| 11|  en|Digital Literacie...|Canvas Network|
|  14/social_sciences

###### Удалим из target_ курсы с ненужными языками

In [10]:
import re 
from pyspark.sql import functions as fsql

# Keep the four columns used downstream and only the courses whose language
# is English, Spanish or Russian.
target_ = (
    target_
    .select("id", "lang", "name", "desc")
    .filter("lang in ('en', 'es', 'ru')")
)
target_.show()

+---+----+--------------------+--------------------+
| id|lang|                name|                desc|
+---+----+--------------------+--------------------+
|  4|  en|Accounting Cycle:...|This course intro...|
|  5|  en|American Counter ...|This online cours...|
|  7|  en|Becoming a Dynami...|We live in a digi...|
|  8|  en|           Bioethics|This self-paced c...|
|  9|  en|College Foundatio...|This game-based c...|
| 10|  en|Digital Literacies I|What’s in your di...|
| 11|  en|Digital Literacie...|The goal of the D...|
| 12|  en|Digital Tools for...|Ready to explore ...|
| 13|  en|Discover Your Val...|This self-paced c...|
| 14|  en|Enhancing Patient...|What is “interpro...|
| 15|  en|Ethics and Values...|This course prese...|
| 16|  en| Exploring Chemistry|Chemistry is an i...|
| 17|  en|Exploring Enginee...|Are you consideri...|
| 18|  en|Fairy Tales: Orig...|Princess stories ...|
| 19|  en|First Peoples to ...|This first instal...|
| 20|  en| Forums for a Future|This course exa

In [11]:
from pyspark.sql.types import ArrayType

# Lower-case the descriptions before tokenising them.
target_ = target_.withColumn("desc", fsql.lower(fsql.col("desc")))


@fsql.pandas_udf(ArrayType(StringType()))
def split_and_filter(x):
    """Tokenise a pandas Series of strings.

    Each element is replaced by the list of all runs of at least four word
    characters found in it (via ``Series.str.findall``).
    """
    # Raw string literal: the same pattern as before, but without relying on
    # Python passing the invalid escape sequences `\w` / `\d` through unchanged
    # (which raises a DeprecationWarning / SyntaxWarning).
    return x.str.findall(r'[\w\d]{4,}')

In [12]:
# Replace every description with its list of tokens and preview the result.
target_ = target_.withColumn(
    "desc",
    split_and_filter(target_["desc"]),
)
target_.show()

+---+----+--------------------+--------------------+
| id|lang|                name|                desc|
+---+----+--------------------+--------------------+
|  4|  en|Accounting Cycle:...|[this, course, in...|
|  5|  en|American Counter ...|[this, online, co...|
|  7|  en|Becoming a Dynami...|[live, digitally,...|
|  8|  en|           Bioethics|[this, self, pace...|
|  9|  en|College Foundatio...|[this, game, base...|
| 10|  en|Digital Literacies I|[what, your, digi...|
| 11|  en|Digital Literacie...|[goal, digital, l...|
| 12|  en|Digital Tools for...|[ready, explore, ...|
| 13|  en|Discover Your Val...|[this, self, pace...|
| 14|  en|Enhancing Patient...|[what, interprofe...|
| 15|  en|Ethics and Values...|[this, course, pr...|
| 16|  en| Exploring Chemistry|[chemistry, integ...|
| 17|  en|Exploring Enginee...|[considering, car...|
| 18|  en|Fairy Tales: Orig...|[princess, storie...|
| 19|  en|First Peoples to ...|[this, first, ins...|
| 20|  en| Forums for a Future|[this, course, 

In [13]:
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.feature import IDF as MLIDF
from pyspark.sql.types import DoubleType

In [14]:
# Hash every token list into a term-frequency vector with 10000 features.
htf = HashingTF(numFeatures=10000, inputCol="desc", outputCol="tf")
tf = htf.transform(target_)
tf.show()  # truncate=True is the default

+---+----+--------------------+--------------------+--------------------+
| id|lang|                name|                desc|                  tf|
+---+----+--------------------+--------------------+--------------------+
|  4|  en|Accounting Cycle:...|[this, course, in...|(10000,[36,63,138...|
|  5|  en|American Counter ...|[this, online, co...|(10000,[32,222,36...|
|  7|  en|Becoming a Dynami...|[live, digitally,...|(10000,[493,721,8...|
|  8|  en|           Bioethics|[this, self, pace...|(10000,[32,115,13...|
|  9|  en|College Foundatio...|[this, game, base...|(10000,[56,300,30...|
| 10|  en|Digital Literacies I|[what, your, digi...|(10000,[1045,1263...|
| 11|  en|Digital Literacie...|[goal, digital, l...|(10000,[87,157,15...|
| 12|  en|Digital Tools for...|[ready, explore, ...|(10000,[233,461,8...|
| 13|  en|Discover Your Val...|[this, self, pace...|(10000,[26,696,10...|
| 14|  en|Enhancing Patient...|[what, interprofe...|(10000,[145,234,3...|
| 15|  en|Ethics and Values...|[this, 

In [15]:
# Fit the IDF estimator on the term-frequency vectors, then apply the fitted
# model to the same frame; the result is stored in the `idf` column.
idf = MLIDF(inputCol="tf", outputCol="idf")
idf_model = idf.fit(tf)
tfidf = idf_model.transform(tf)
tfidf.show()  # truncate=True is the default

+---+----+--------------------+--------------------+--------------------+--------------------+
| id|lang|                name|                desc|                  tf|                 idf|
+---+----+--------------------+--------------------+--------------------+--------------------+
|  4|  en|Accounting Cycle:...|[this, course, in...|(10000,[36,63,138...|(10000,[36,63,138...|
|  5|  en|American Counter ...|[this, online, co...|(10000,[32,222,36...|(10000,[32,222,36...|
|  7|  en|Becoming a Dynami...|[live, digitally,...|(10000,[493,721,8...|(10000,[493,721,8...|
|  8|  en|           Bioethics|[this, self, pace...|(10000,[32,115,13...|(10000,[32,115,13...|
|  9|  en|College Foundatio...|[this, game, base...|(10000,[56,300,30...|(10000,[56,300,30...|
| 10|  en|Digital Literacies I|[what, your, digi...|(10000,[1045,1263...|(10000,[1045,1263...|
| 11|  en|Digital Literacie...|[goal, digital, l...|(10000,[87,157,15...|(10000,[87,157,15...|
| 12|  en|Digital Tools for...|[ready, explore, ..

In [16]:
# The six courses that recommendations are built for. Every kept column gets a
# "2" suffix so it stays distinct from the columns of `r_` in the later join.
b_ = (
    tfidf
    .filter("id in (15516, 22777, 13131, 5660, 965, 877)")
    .select(*[
        fsql.col(column).alias(column + "2")
        for column in ("id", "lang", "name", "tf", "idf")
    ])
)
b_.show()

+-----+-----+--------------------+--------------------+--------------------+
|  id2|lang2|               name2|                 tf2|                idf2|
+-----+-----+--------------------+--------------------+--------------------+
|  877|   ru|Нейрокомпьютинг и...|(10000,[97,381,71...|(10000,[97,381,71...|
|  965|   ru|Основы работы с G...|(10000,[381,467,1...|(10000,[381,467,1...|
| 5660|   es|Escritura Emocion...|(10000,[21,33,120...|(10000,[21,33,120...|
|13131|   es|Instala Google An...|(10000,[21,125,17...|(10000,[21,125,17...|
|15516|   en|Optimizing Perfor...|(10000,[869,1263,...|(10000,[869,1263,...|
|22777|   en|Attract High Payi...|(10000,[26,32,56,...|(10000,[26,32,56,...|
+-----+-----+--------------------+--------------------+--------------------+



In [17]:
# All remaining courses: the candidates that can be recommended.
r_ = (
    tfidf
    .filter("id not in (15516, 22777, 13131, 5660, 965, 877)")
    .select("id", "lang", "name", "tf", "idf")
)
r_.show()

+---+----+--------------------+--------------------+--------------------+
| id|lang|                name|                  tf|                 idf|
+---+----+--------------------+--------------------+--------------------+
|  4|  en|Accounting Cycle:...|(10000,[36,63,138...|(10000,[36,63,138...|
|  5|  en|American Counter ...|(10000,[32,222,36...|(10000,[32,222,36...|
|  7|  en|Becoming a Dynami...|(10000,[493,721,8...|(10000,[493,721,8...|
|  8|  en|           Bioethics|(10000,[32,115,13...|(10000,[32,115,13...|
|  9|  en|College Foundatio...|(10000,[56,300,30...|(10000,[56,300,30...|
| 10|  en|Digital Literacies I|(10000,[1045,1263...|(10000,[1045,1263...|
| 11|  en|Digital Literacie...|(10000,[87,157,15...|(10000,[87,157,15...|
| 12|  en|Digital Tools for...|(10000,[233,461,8...|(10000,[233,461,8...|
| 13|  en|Discover Your Val...|(10000,[26,696,10...|(10000,[26,696,10...|
| 14|  en|Enhancing Patient...|(10000,[145,234,3...|(10000,[145,234,3...|
| 15|  en|Ethics and Values...|(10000,

In [18]:
# Pair every candidate course with each target course of the same language.
same_language = r_["lang"] == b_["lang2"]
result_ = r_.join(b_, on=same_language, how="inner")

result_.show()  # truncate=True is the default

+---+----+--------------------+--------------------+--------------------+-----+-----+--------------------+--------------------+--------------------+
| id|lang|                name|                  tf|                 idf|  id2|lang2|               name2|                 tf2|                idf2|
+---+----+--------------------+--------------------+--------------------+-----+-----+--------------------+--------------------+--------------------+
|  4|  en|Accounting Cycle:...|(10000,[36,63,138...|(10000,[36,63,138...|15516|   en|Optimizing Perfor...|(10000,[869,1263,...|(10000,[869,1263,...|
|  4|  en|Accounting Cycle:...|(10000,[36,63,138...|(10000,[36,63,138...|22777|   en|Attract High Payi...|(10000,[26,32,56,...|(10000,[26,32,56,...|
|  5|  en|American Counter ...|(10000,[32,222,36...|(10000,[32,222,36...|15516|   en|Optimizing Perfor...|(10000,[869,1263,...|(10000,[869,1263,...|
|  5|  en|American Counter ...|(10000,[32,222,36...|(10000,[32,222,36...|22777|   en|Attract High Payi...|

In [19]:
# UDF computing the cosine of the angle between two vectors (cosine similarity)
import pyspark.sql.functions as f
@f.udf(FloatType())
def cos_sim(a,b):
    """Return the cosine similarity of vectors ``a`` and ``b`` as a float.

    The value is ``a.dot(b)`` divided by the product of the two Euclidean
    norms (``norm(2)``). When that product is zero the ratio is undefined;
    0.0 is returned instead of dividing zero by zero, which is also the value
    this notebook substitutes for NaN similarities.
    """
    denominator = a.norm(2) * b.norm(2)
    if denominator == 0:
        # At least one vector has no non-zero components.
        return 0.0
    return float(a.dot(b) / denominator)

In [20]:
# Score every (candidate, target) pair by the similarity of their `idf` vectors.
output_ = result_.withColumn(
    "cosine_similarity",
    cos_sim(result_["idf"], result_["idf2"]),
)

In [21]:
from pyspark.sql.window import Window

# Replace NaN similarities with 0 and leave every other value untouched.
similarity = fsql.col("cosine_similarity")
output_ = output_.withColumn(
    "cosine_similarity",
    fsql.when(fsql.isnan(similarity), 0).otherwise(similarity),
)

In [22]:
# Rank the candidates of every target course: highest similarity first, ties
# broken by the candidate's name in ascending order.
ranking_window = (
    Window
    .partitionBy(fsql.col("id2"))
    .orderBy(fsql.col("cosine_similarity").desc(), fsql.col("name").asc())
)

df_ = output_.select(
    fsql.col("id2").alias("course"),
    fsql.col("name2").alias("c_name"),
    fsql.col("id").alias("recommendation"),
    fsql.col("name").alias("r_name"),
    fsql.col("cosine_similarity"),
    fsql.row_number().over(ranking_window).alias("rn"),
)

In [23]:
# Top-10 recommendations per course, sorted by course id. The select / filter /
# orderBy sequence is kept as is: `rn` is filtered on after the projection.
df0_ = (
    df_
    .select('course', 'c_name', 'recommendation', 'r_name', 'cosine_similarity')
    .filter('rn <= 10')
    .orderBy('course', ascending=True)
)

df0_.show(60, truncate=False)

+------+----------------------------------------------------------+--------------+------------------------------------------------------------------------------+-----------------+
|course|c_name                                                    |recommendation|r_name                                                                        |cosine_similarity|
+------+----------------------------------------------------------+--------------+------------------------------------------------------------------------------+-----------------+
|877   |Нейрокомпьютинг и его применения в экономике и бизнесе    |8728          |Лекции по истории философии                                                   |0.18717605       |
|877   |Нейрокомпьютинг и его применения в экономике и бизнесе    |8751          |Введение в аналитику больших массивов данных                                  |0.1769678        |
|877   |Нейрокомпьютинг и его применения в экономике и бизнесе    |8179          |Основы поисковой о

In [24]:
def _top_recommendations(course_id):
    """Return the `recommendation` column of `df0_` for one course, ordered by `rn`."""
    return (
        df0_
        .select('recommendation')
        .filter('course = {}'.format(course_id))
        .orderBy('rn', ascending=True)
    )


# One frame per target course.
df1_ = _top_recommendations(15516)
df2_ = _top_recommendations(22777)
df3_ = _top_recommendations(13131)
df4_ = _top_recommendations(5660)
df5_ = _top_recommendations(965)
df6_ = _top_recommendations(877)

In [25]:
def _collect_ids(frame):
    """Collect the `recommendation` column of `frame` into a flat Python list."""
    return frame.select("recommendation").rdd.flatMap(lambda row: row).collect()


# Note: the names are rebound from DataFrames to plain lists here.
df1_, df2_, df3_, df4_, df5_, df6_ = [
    _collect_ids(frame) for frame in (df1_, df2_, df3_, df4_, df5_, df6_)
]

In [26]:
# Map each target course id (as a string) to its list of recommended ids.
result_dict_ = {
    "15516": df1_,
    "22777": df2_,
    "13131": df3_,
    "5660": df4_,
    "965": df5_,
    "877": df6_,
}

print(result_dict_)

{'15516': [15748, 18523, 20285, 4604, 15832, 6390, 26947, 20963, 15805, 23652], '22777': [18010, 4378, 16589, 19678, 6191, 25314, 9171, 9845, 5949, 7831], '13131': [12694, 17902, 12601, 23111, 6194, 22680, 4095, 6935, 4093, 10144], '5660': [21055, 11574, 19279, 17731, 10992, 13439, 21053, 9289, 3878, 23475], '965': [1010, 1017, 1091, 20331, 961, 1103, 966, 929, 8179, 1337], '877': [8728, 8751, 8179, 1164, 56, 17238, 795, 7173, 12917, 834]}


In [None]:
import json

# Persist the recommendations as JSON in the working directory.
with open('lab07.json', 'w') as out_file:
    json.dump(result_dict_, out_file)

In [27]:
# Display the ten best recommendations per target course together with
# their rank.
(
    df_
    .select('course', 'c_name', 'recommendation', 'r_name', 'cosine_similarity', 'rn')
    .filter('course in (22777, 13131, 877, 5660, 15516, 965) and rn <= 10')
    .orderBy('course', 'rn', ascending=True)
    .show()
)

+------+--------------------+--------------+--------------------+-----------------+---+
|course|              c_name|recommendation|              r_name|cosine_similarity| rn|
+------+--------------------+--------------+--------------------+-----------------+---+
|   877|Нейрокомпьютинг и...|          8728|Лекции по истории...|       0.18717605|  1|
|   877|Нейрокомпьютинг и...|          8751|Введение в аналит...|        0.1769678|  2|
|   877|Нейрокомпьютинг и...|          8179|Основы поисковой ...|       0.16368954|  3|
|   877|Нейрокомпьютинг и...|          1164|Основные принципы...|       0.14576572|  4|
|   877|Нейрокомпьютинг и...|            56|Математическая ст...|       0.14070135|  5|
|   877|Нейрокомпьютинг и...|         17238|Информационные те...|       0.13679849|  6|
|   877|Нейрокомпьютинг и...|           795|Введение в вычисл...|       0.13662468|  7|
|   877|Нейрокомпьютинг и...|          7173|Python, структуры...|       0.12783079|  8|
|   877|Нейрокомпьютинг и...|   

In [28]:
# Shut down the Spark context started at the top of the notebook.
sc.stop()