In [1]:
from pyspark.sql import Window
from pyspark.sql.functions import row_number, sum, col, abs, count

In [2]:
from pyspark.sql import SparkSession
# Obtain (or reuse) a Hive-enabled Spark session bound to the "local" master.
sparkSession = (
    SparkSession.builder
    .enableHiveSupport()
    .master("local")
    .getOrCreate()
)

In [3]:
# Read both Parquet inputs; `data` feeds the pair-building cells further down.
data, meta = (
    sparkSession.read.parquet(path)
    for path in ("/data/sample264", "/data/meta")
)

In [4]:
def norm(df, key1, key2, field, n):
    """Keep the top ``n`` rows per ``key1`` and normalize ``field`` within each group.

    Rows are ranked inside every ``key1`` partition by ``field`` in descending
    order and only the first ``n`` rows of each partition are kept. ``field``
    is then divided by its per-``key1`` total computed over the kept rows.

    Args:
        df: Input DataFrame containing at least the ``key1`` and ``field`` columns.
        key1: Name of the column that defines the groups.
        key2: Name of a secondary column. When it is present in ``df.columns``
            it is used as an ascending tie-breaker in the ranking, so rows with
            equal ``field`` values are selected reproducibly; otherwise it is
            ignored.
        field: Name of the column to rank by and to normalize.
        n: Maximum number of rows to keep for each ``key1`` value.

    Returns:
        A DataFrame marked for caching that holds the kept rows plus two extra
        columns: ``"sum_" + field`` (the per-group total) and
        ``"norm_" + field`` (``field`` divided by that total).
    """
    # Ordering by `field` alone leaves ties unordered, so which tied rows make
    # the top-n cut could differ between runs. `key2` breaks those ties.
    ordering = [col(field).desc()]
    if key2 in df.columns:
        ordering.append(col(key2).asc())
    window = Window.partitionBy(key1).orderBy(*ordering)

    topsDF = (
        df.withColumn("row_number", row_number().over(window))
        .filter(col("row_number") <= n)
        .drop("row_number")
    )

    # groupBy already emits the grouping column, so only the aggregate is
    # requested here. `sum` is the Spark SQL function imported at the top of
    # the notebook, not the Python builtin.
    tmpDF = topsDF.groupBy(key1).agg(sum(col(field)).alias("sum_" + field))

    normalizedDF = (
        topsDF.join(tmpDF, key1, "inner")
        .withColumn("norm_" + field, col(field) / col("sum_" + field))
        .cache()
    )

    return normalizedDF

In [5]:
# Two projections of the same three columns of `data`. The track and timestamp
# columns get a numeric suffix so both sides can be joined on `userId`
# without name clashes.
data1 = data.select('userId', 'trackId', 'timestamp').toDF(
    'userId', 'trackId1', 'timestamp1'
)

data2 = data.select('userId', 'trackId', 'timestamp').toDF(
    'userId', 'trackId2', 'timestamp2'
)

In [6]:
# Largest allowed |timestamp1 - timestamp2| for a pair of rows to be counted.
# Expressed in the units of the `timestamp` column -- presumably seconds
# (i.e. 7 minutes); TODO confirm against the data description.
MAX_TIMESTAMP_GAP = 420

# Pair up rows of `data1` and `data2` that share a userId, keep the ordered
# pairs of different tracks whose timestamps are close enough, and count how
# often each (trackId1, trackId2) pair occurs.
#
# Only the final aggregate is cached. The intermediate join and filter results
# are not reused anywhere, and caching them would materialize the full
# per-user self-join with no handle left to unpersist it.
# `abs` is the Spark SQL function imported at the top of the notebook.
similarity_count = (
    data1.join(data2, "userId")
    .filter(col('trackId1') != col('trackId2'))
    .filter(abs(col('timestamp1') - col('timestamp2')) <= MAX_TIMESTAMP_GAP)
    .groupBy(col('trackId1'), col('trackId2'))
    .count()  # the resulting column is already named "count"
    .cache()
)

In [7]:
# Per trackId1: keep the 40 highest pair counts and turn them into shares.
normalized = norm(
    similarity_count,
    key1="trackId1",
    key2="trackId2",
    field="count",
    n=40,
)

In [8]:
# Highest normalized counts first; track ids (ascending) break ties so the
# first 40 rows are well defined.
results = normalized.orderBy(
    "norm_count", "trackId1", "trackId2",
    ascending=[False, True, True],
).limit(40)

In [9]:
# Keep only the two track id columns for the final report.
results = results.select("trackId1", "trackId2")

In [10]:
# Bring the rows to the driver and print one tab-separated pair per line.
for first_track, second_track in results.collect():
    print("{}\t{}".format(first_track, second_track))

798256	923706	1.0
798319	837992	1.0
798322	876562	1.0
798331	827364	1.0
798335	840741	1.0
798374	816874	1.0
798375	810685	1.0
798379	812055	1.0
798380	840113	1.0
798396	817687	1.0
798398	926302	1.0
798405	867217	1.0
798443	905923	1.0
798457	918918	1.0
798460	891840	1.0
798461	940379	1.0
798470	840814	1.0
798474	963162	1.0
798477	883244	1.0
798485	955521	1.0
798505	905671	1.0
798545	949238	1.0
798550	936295	1.0
798626	845438	1.0
798691	818279	1.0
798692	898823	1.0
798702	811440	1.0
798704	937570	1.0
798725	933147	1.0
798738	894170	1.0
798745	799665	1.0
798782	956938	1.0
798801	950802	1.0
798820	890393	1.0
798833	916319	1.0
798865	962662	1.0
798931	893574	1.0
798946	946408	1.0
799012	809997	1.0
799024	935246	1.0
