In [1]:
import os
import sys
import pyspark
import pandas as pd
from functools import reduce
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import max as _max, min as _min, col
from pyspark.sql.functions import *
import numpy as np
from IPython.core.display import HTML
from sklearn.metrics.pairwise import cosine_similarity
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.feature import Normalizer
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix

In [2]:
conf = pyspark.SparkConf().setAll([('spark.executor.memory', '10g'),  # ~45g
                                   ('spark.dynamicAllocation.enabled','true'),
                                   ('spark.shuffle.service.enabled', 'true'),
                                   ('spark.authenticate', 'true'),
                                   ('spark.dynamicAllocation.initialExecutors', '20'),
                                   ('spark.dynamicAllocation.minExecutors', '4'),
                                   ('spark.dynamicAllocation.maxExecutors', '20'),
                                   ('spark.dynamicAllocation.executorIdleTimeout', '2000'),
                                   ('spark.dynamicAllocation.cachedExecutorIdleTimeout', '21600'),
                                   ('spark.driver.cores','2'),   # ~4
                                   ('spark.executor.cores', '4'),  # ~40 - max
                                   ("spark.driver.maxResultSize", "4g"),
                                   ('spark.yarn.queue', 'araadh_q1.uhc_oa_pi_dev_sq1'),
                                   ('spark.driver.memory','10g'),
                                   ("spark.sql.execution.arrow.pyspark.enabled", "true")])
spark = SparkSession.builder.appName("Assignment").config(conf=conf).enableHiveSupport().getOrCreate().newSession()
spark

In [3]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [4]:
df = spark.read.format("csv").option("header", "true").option('inferSchema', True).load("/home/parora41/Masters/train.csv")
df.show(2)

+----------+--------------------+------------+----------------+----------------+----------+-----------+--------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+------+--------------+-----------+
|Unnamed: 0|            track_id|     artists|      album_name|      track_name|popularity|duration_ms|explicit|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence| tempo|time_signature|track_genre|
+----------+--------------------+------------+----------------+----------------+----------+-----------+--------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+------+--------------+-----------+
|         0|5SuOikwiRyPMVoIQD...| Gen Hoshino|          Comedy|          Comedy|        73|     230666|   False|       0.676| 0.461|  1|  -6.746|   0|      0.143|      0.0322|         1.01E-6|   0.358|  0.715|87.917|           4.0|   acoustic|
|         1|4qPNDBW1i3p1

In [5]:
df.printSchema()

root
 |-- Unnamed: 0: integer (nullable = true)
 |-- track_id: string (nullable = true)
 |-- artists: string (nullable = true)
 |-- album_name: string (nullable = true)
 |-- track_name: string (nullable = true)
 |-- popularity: string (nullable = true)
 |-- duration_ms: string (nullable = true)
 |-- explicit: string (nullable = true)
 |-- danceability: string (nullable = true)
 |-- energy: string (nullable = true)
 |-- key: string (nullable = true)
 |-- loudness: string (nullable = true)
 |-- mode: string (nullable = true)
 |-- speechiness: string (nullable = true)
 |-- acousticness: string (nullable = true)
 |-- instrumentalness: double (nullable = true)
 |-- liveness: string (nullable = true)
 |-- valence: string (nullable = true)
 |-- tempo: double (nullable = true)
 |-- time_signature: double (nullable = true)
 |-- track_genre: string (nullable = true)



In [6]:
# Shape of DataFrame
print("Rows : {}\nColumns : {}".format(df.count(), len(df.columns)))

Rows : 114000
Columns : 21


In [7]:
## checking NULLS
null_df = df.select([count(when(col(c).contains('None') |
                               col(c).contains("null") |
                               (col(c) == "NaN") |
                                col(c).isNull() |
                               isnan(c), c)).alias(c) for c in df.columns])
null_df.show()

+----------+--------+-------+----------+----------+----------+-----------+--------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-----+--------------+-----------+
|Unnamed: 0|track_id|artists|album_name|track_name|popularity|duration_ms|explicit|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|tempo|time_signature|track_genre|
+----------+--------+-------+----------+----------+----------+-----------+--------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-----+--------------+-----------+
|         0|       0|      4|        10|        11|         0|          0|       0|           0|     0|  0|       0|   0|          0|           0|               0|       0|      0|    0|             0|          0|
+----------+--------+-------+----------+----------+----------+-----------+--------+------------+------+---+--------+----+-----------+-----------

In [8]:
## Removing nulls from the df
df1 = df.filter((df['artists'] != 'null')&(df['album_name'] != 'null')&(df['track_name'] != 'null'))

In [9]:
df1.createOrReplaceTempView("df1")

### Top 5 genre which are most popluar among users

In [10]:
spark.sql(""" select track_genre, sum(popularity) `popularity` from df1 group by track_genre order by popularity desc """).show(5)

+-----------+----------+
|track_genre|popularity|
+-----------+----------+
|   pop-film|   59169.0|
|      k-pop|   56896.0|
|      chill|   53651.0|
|        sad|   52379.0|
|     grunge|   49594.0|
+-----------+----------+
only showing top 5 rows



### Summary of the top 5 tracks

In [11]:
# casting the numerical columns for proper feature creation
sdf1 = spark.sql(""" select distinct track_id,
cast(popularity as int) `popularity`,
cast(duration_ms as int) `duration_ms`,
cast(danceability as float) `danceability`,
cast(energy as float) `energy`,
cast(loudness as float) `loudness`,
cast(mode as int) `mode`,
cast(speechiness as float) `speechiness`,
cast(acousticness as float) `acousticness`,
cast(instrumentalness as float) `instrumentalness`,
cast(liveness as float) `liveness`,
cast(valence as float) `valence`,
cast(tempo as float) `tempo`,
cast(time_signature as int) `time_signature`
from df1 """)
sdf1.createOrReplaceTempView("sdf1")

In [12]:
sdf1.summary().show()

+-------+--------------------+------------------+------------------+------------------+-----------------+------------------+-----------------+------------------+------------------+------------------+------------------+-------------------+------------------+------------------+
|summary|            track_id|        popularity|       duration_ms|      danceability|           energy|          loudness|             mode|       speechiness|      acousticness|  instrumentalness|          liveness|            valence|             tempo|    time_signature|
+-------+--------------------+------------------+------------------+------------------+-----------------+------------------+-----------------+------------------+------------------+------------------+------------------+-------------------+------------------+------------------+
|  count|               90460|             90340|             90425|             90366|            90428|             90455|            90454|             90456|        

In [13]:
spark.sql(""" select track_name, max(popularity) `popularity`, avg(danceability) `danceability`,
avg(energy) `energy`,
avg(loudness) `loudness`,
avg(mode) `mode`,
avg(speechiness) `speechiness`,
avg(acousticness) `acousticness`,
avg(instrumentalness) `instrumentalness`,
avg(liveness) `liveness`,
avg(valence) `valence`,
avg(tempo) `tempo`
from df1 
where trim(popularity) between 0 and 100
group by track_name order by popularity desc """).show(5, truncate=False)

+-------------------------------------+----------+------------+------------------+--------+----+-----------+------------+----------------+--------+-------------------+-------+
|track_name                           |popularity|danceability|energy            |loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence            |tempo  |
+-------------------------------------+----------+------------+------------------+--------+----+-----------+------------+----------------+--------+-------------------+-------+
|Quevedo: Bzrp Music Sessions, Vol. 52|99        |0.621       |0.782             |-5.548  |1.0 |0.044      |0.0125      |0.033           |0.23    |0.55               |128.033|
|I'm Good (Blue)                      |98        |0.561       |0.9650000000000001|-3.673  |0.0 |0.0343     |0.00383     |7.07E-6         |0.371   |0.30400000000000005|128.04 |
|La Bachata                           |98        |0.835       |0.679             |-5.329  |0.0 |0.0364     |0.583       

### Getting most similar tracks for recommendation using features similarity

In [14]:
# Creating features out of appropriate columns and normalizing them
vectorAssembler = VectorAssembler(inputCols = ['popularity', 'danceability', 'energy', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo'],
                                  outputCol = 'features')
df2 = vectorAssembler.setHandleInvalid("skip").transform(sdf1)
normalizer = Normalizer(inputCol="features", outputCol="norm")
df3 = normalizer.transform(df2)

In [15]:
df3.show(2)

+--------------------+----------+-----------+------------+------+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+--------------------+--------------------+
|            track_id|popularity|duration_ms|danceability|energy|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|time_signature|            features|                norm|
+--------------------+----------+-----------+------------+------+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+--------------------+--------------------+
|0ltIx8PLjtWoS9zn9...|        51|     322146|       0.592| 0.769|  -5.268|   1|     0.0311|       0.319|         4.66E-6|   0.137|  0.385|102.033|             4|[51.0,0.592000007...|[0.44658398202370...|
|68HbD7rEFTlnJy4UW...|        28|     217466|       0.578| 0.636|  -5.506|   1|     0.0342|      0.0874|             0.0|   0.124|  0.556|  124.9|             4|[28.0,0.578000009...|[0

In [16]:
# crearting "ID" column for processing
df3.createOrReplaceTempView("df3")
df4 = spark.sql(""" select *, row_number() OVER(order by track_id) `ID` from df3 """)
df4.createOrReplaceTempView("df4")

In [17]:
df5 = spark.sql(""" select * from df4 limit 500 """) # test sample
mat = IndexedRowMatrix(
    df5.select("ID", "norm")\
        .rdd.map(lambda row: IndexedRow(row.ID, row.norm.toArray()))).toBlockMatrix()
dot = mat.multiply(mat.transpose())
array_values = dot.toLocalMatrix().toArray()

In [18]:
# sample similarity matrix
pdf = pd.DataFrame(array_values)
pdf.tail(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299,300,301,302,303,304,305,306,307,308,309,310,311,312,313,314,315,316,317,318,319,320,321,322,323,324,325,326,327,328,329,330,331,332,333,334,335,336,337,338,339,340,341,342,343,344,345,346,347,348,349,350,351,352,353,354,355,356,357,358,359,360,361,362,363,364,365,366,367,368,369,370,371,372,373,374,375,376,377,378,379,380,381,382,383,384,385,386,387,388,389,390,391,392,393,394,395,396,397,398,399,400,401,402,403,404,405,406,407,408,409,410,411,412,413,414,415,416,417,418,419,420,421,422,423,424,425,426,427,428,429,430,431,432,433,434,435,436,437,438,439,440,441,442,443,444,445,446,447,448,449,450,451,452,453,454,455,456,457,458,459,460,461,462,463,464,465,466,467,468,469,470,471,472,473,474,475,476,477,478,479,480,481,482,483,484,485,486,487,488,489,490,491,492,493,494,495,496,497,498,499,500
499,0.0,0.948463,0.895373,0.881371,0.923605,0.864495,0.849305,0.776416,0.870619,0.933894,0.83616,0.827627,0.825612,0.956285,0.944794,0.947146,0.844373,0.878069,0.856424,0.857589,0.94904,0.879883,0.917004,0.819931,0.923003,0.926821,0.961061,0.926408,0.953195,0.885807,0.944969,0.94309,0.801398,0.803968,0.828877,0.890163,0.955487,0.844879,0.868807,0.853443,0.791349,0.943923,0.891422,0.897101,0.912741,0.953371,0.933881,0.936089,0.902876,0.9489,0.890379,0.784033,0.922708,0.972001,0.935034,0.820528,0.96459,0.889193,0.770136,0.849771,0.771129,0.941955,0.857909,0.899548,0.897778,0.974283,0.963642,0.809646,0.92017,0.921023,0.937432,0.87905,0.898684,0.783996,0.8511,0.998294,0.903768,0.782581,0.959343,0.87368,0.924701,0.872105,0.924585,0.786315,0.831115,0.972136,0.993889,0.939061,0.877684,0.934837,0.865325,0.842222,0.914939,0.868559,0.858171,0.912628,0.834916,0.913329,0.910108,0.780438,0.86605,0.944329,0.851518,0.78856,0.876064,0.875757,0.966233,0.816794,0.874891,0.877382,0.912661,0.914586,0.777643,0.781168,0.949855,0.84185,0.912164,0.840947,0.777189,0.866214,0.92714,0.818931,0.89439,0.88958,0.848658,0.93505,0.77379,0.775975,0.915045,0.90165,0.892884,0.947711,0.844849,0.954649,0.867379,0.830611,0.969786,0.903144,0.894425,0.851119,0.87864,0.96974,0.873832,0.82097,0.781254,0.865824,0.958968,0.830956,0.880588,0.943794,0.92776,0.855633,0.832673,0.863205,0.877747,0.912868,0.907812,0.77521,0.835298,0.975767,0.77636,0.805029,0.959134,0.782774,0.934627,0.999004,0.910652,0.89747,0.868097,0.944034,0.900293,0.849033,0.780751,0.928345,0.924951,0.950711,0.862501,0.947448,0.964793,0.936067,0.952352,0.825062,0.778953,0.98405,0.932951,0.93234,0.962982,0.792711,0.884671,0.968309,0.801802,0.785114,0.807724,0.777021,0.884246,0.939638,0.892213,0.95298,0.780967,0.818243,0.834004,0.960544,0.87461,0.93715,0.898543,0.844346,0.849226,0.95176,0.897196,0.854118,0.945835,0.881137,0.77773,0.867884,0.792027,0.838607,0.792863,0.959838,0.949299,0.906027,0.816591,0.95904,0.920931,0.936963,0.866927,0.812331,0.871431,0.812425,0.798463,0.908388,0.966366,0.953285,0.834314,0.820649,0.904358,0.859891,0.792172,0.935337,0.845984,0.950753,0.928265,0.913763,0.634835,0.968721,0.837186,0.838955,0.905399,0.919717,0.934643,0.944536,0.77709,0.854798,0.937176,0.954095,0.981882,0.854824,0.925185,0.898598,0.913909,0.772009,0.988622,0.845772,0.839793,0.982248,0.964364,0.930392,0.835499,0.906254,0.946773,0.969821,0.952819,0.968005,0.857097,0.862202,0.940456,0.873452,0.935376,0.7755,0.949376,0.918993,0.924618,0.898604,0.782341,0.967108,0.919539,0.778774,0.914621,0.868614,0.85261,0.916744,0.854742,0.866217,0.882689,0.89758,0.951614,0.939384,0.813059,0.844279,0.913972,0.943661,0.942674,0.921633,0.963589,0.824245,0.880149,0.926607,0.810068,0.963992,0.785583,0.890407,0.906031,0.888364,0.960469,0.884175,0.806473,0.864711,0.945634,0.936965,0.924333,0.90292,0.923735,0.863443,0.865464,0.892436,0.852945,0.907402,0.947136,0.775266,0.905151,0.905552,0.949782,0.853935,0.833285,0.894712,0.881362,0.854803,0.776168,0.776138,0.829135,0.819316,0.888155,0.888303,0.847245,0.980604,0.842879,0.909416,0.845183,0.907328,0.893512,0.804002,0.773303,0.774633,0.793409,0.927579,0.777261,0.903437,0.809087,0.773227,0.83596,0.987886,0.929183,0.841702,0.885926,0.7758,0.840694,0.769022,0.936751,0.923783,0.801966,0.823541,0.918082,0.838542,0.829053,0.886479,0.952858,0.961155,0.873471,0.889769,0.949609,0.93921,0.90013,0.985775,0.8071,0.912572,0.926776,0.921117,0.77444,0.926059,0.860816,0.852578,0.786493,0.92755,0.90429,0.953368,0.857083,0.874788,0.885175,0.826784,0.904621,0.936959,0.842652,0.826048,0.872453,0.820894,0.772282,0.779169,0.97725,0.886896,0.812343,0.929437,0.906311,0.947763,0.885977,0.914819,0.910917,0.876042,0.904134,0.925165,0.775679,0.90325,0.909049,0.776998,0.769692,0.898632,0.859421,0.86549,0.913701,0.944696,0.908531,0.892987,0.886108,0.775142,0.793266,0.880567,0.953708,0.785583,0.774034,0.906162,0.921577,0.886082,0.900109,0.900438,0.94666,0.897466,0.883449,0.880879,0.898938,0.954581,0.946025,0.896556,0.841001,0.97212,0.959351,0.774642,0.92821,0.938577,0.965277,0.86805,0.90357,0.928716,0.856996,0.873633,0.767564,0.792035,0.844941,0.965144,0.891977,0.920391,0.965187,0.835772,0.850365,0.957174,0.95141,0.935319,0.877455,0.949885,0.884257,0.89444,0.925248,0.982574,0.906003,0.790873,0.909208,0.78503,0.885144,0.867087,0.968841,0.887887,0.779333,0.890581,0.879344,0.823293,0.86633,0.897938,0.895086,0.940804,0.941671,0.862256,0.956922,1.0,0.922835
500,0.0,0.98506,0.989822,0.99443,0.980853,0.990821,0.984779,0.958803,0.98622,0.996323,0.981547,0.97599,0.974495,0.952606,0.986677,0.986306,0.98501,0.99336,0.984791,0.98917,0.9738,0.993215,0.994149,0.974487,0.982223,0.990431,0.946462,0.99778,0.979656,0.986588,0.985824,0.977533,0.968188,0.968388,0.980012,0.985662,0.960705,0.982946,0.989072,0.98484,0.965523,0.997987,0.992967,0.99359,0.99431,0.957584,0.981492,0.988662,0.985771,0.989315,0.989101,0.962578,0.988238,0.939398,0.98905,0.977266,0.885685,0.996558,0.955431,0.987013,0.955951,0.974419,0.989263,0.990796,0.998063,0.964171,0.946182,0.968199,0.99155,0.987453,0.982956,0.987525,0.993277,0.962529,0.981566,0.904804,0.998637,0.961853,0.954297,0.987396,0.993464,0.990026,0.986642,0.963313,0.97855,0.955714,0.952419,0.970247,0.988657,0.990431,0.987143,0.981525,0.989327,0.991902,0.986038,0.993841,0.98219,0.984906,0.993216,0.960527,0.98911,0.989971,0.983703,0.964477,0.989148,0.989804,0.95828,0.971448,0.98775,0.991,0.990102,0.989792,0.959479,0.961226,0.985972,0.981075,0.999165,0.981427,0.959241,0.986075,0.990966,0.973977,0.996275,0.99093,0.983265,0.992817,0.957412,0.958608,0.997703,0.99484,0.989116,0.996462,0.985486,0.991531,0.987341,0.977661,0.952779,0.990522,0.99204,0.982102,0.98906,0.962387,0.991582,0.973971,0.961259,0.987085,0.953627,0.977853,0.993253,0.976289,0.982617,0.984933,0.981528,0.9858,0.992549,0.993127,0.998966,0.958172,0.978219,0.892332,0.958776,0.968537,0.976624,0.961956,0.990731,0.924126,0.98994,0.995661,0.989668,0.976198,0.991433,0.985556,0.961017,0.988712,0.98345,0.980648,0.990595,0.975882,0.961226,0.987643,0.965162,0.977126,0.960133,0.964902,0.995651,0.97668,0.960608,0.965989,0.994392,0.953459,0.969946,0.962978,0.968403,0.959112,0.991331,0.982194,0.986865,0.977121,0.961114,0.972807,0.978214,0.92839,0.985346,0.980014,0.99068,0.982446,0.983463,0.969655,0.996815,0.985709,0.974292,0.990637,0.959488,0.988096,0.965794,0.981397,0.966052,0.949208,0.9774,0.991397,0.964169,0.946448,0.983106,0.991876,0.984818,0.973324,0.990996,0.96735,0.967858,0.993528,0.965098,0.975052,0.980069,0.975583,0.994142,0.98909,0.965764,0.980687,0.98218,0.987946,0.982899,0.991942,0.307421,0.956538,0.981517,0.980652,0.994918,0.987847,0.985302,0.979832,0.959152,0.986164,0.983276,0.97389,0.881161,0.987873,0.997345,0.992429,0.983828,0.956481,0.957385,0.983506,0.983202,0.914676,0.957877,0.978219,0.980439,0.98932,0.9741,0.951436,0.975679,0.881247,0.987697,0.988,0.980352,0.98937,0.97901,0.957926,0.960312,0.984774,0.981876,0.990274,0.961771,0.931882,0.984362,0.960048,0.989937,0.992267,0.986287,0.994934,0.986417,0.985846,0.993141,0.988991,0.992383,0.981124,0.972502,0.981501,0.995481,0.974525,0.976756,0.985558,0.98693,0.97564,0.992107,0.991256,0.968191,0.958639,0.963215,0.992682,0.986807,0.990499,0.972965,0.99055,0.970407,0.987736,0.967734,0.986586,0.986355,0.991226,0.988624,0.988425,0.988595,0.995303,0.985142,0.988599,0.973605,0.958235,0.997245,0.990976,0.992805,0.987053,0.980502,0.991899,0.987093,0.986032,0.958698,0.958716,0.9777,0.974171,0.989799,0.995597,0.986437,0.940039,0.984815,0.99555,0.981877,0.989134,0.995894,0.970463,0.957182,0.957897,0.966199,0.976373,0.959269,0.990828,0.96831,0.957143,0.981231,0.855301,0.981401,0.983188,0.991706,0.958514,0.98368,0.954809,0.976821,0.994518,0.968209,0.978092,0.992927,0.980119,0.979828,0.9908,0.967339,0.959601,0.9933,0.989264,0.984209,0.983423,0.993558,0.969718,0.970914,0.987946,0.999903,0.988641,0.957793,0.984724,0.987794,0.985599,0.963566,0.98482,0.996457,0.986349,0.987838,0.987216,0.988926,0.97818,0.992665,0.985919,0.982226,0.977327,0.992326,0.973767,0.956633,0.960182,0.934334,0.990662,0.974254,0.982266,0.987742,0.982319,0.99026,0.999537,0.992464,0.993843,0.988894,0.98823,0.958072,0.996323,0.990015,0.959138,0.955194,0.990299,0.988759,0.989434,0.998792,0.977569,0.991783,0.995642,0.98907,0.958184,0.96612,0.985555,0.83586,0.963215,0.957597,0.990705,0.98355,0.991955,0.992775,0.989913,0.967735,0.986733,0.992275,0.995061,0.998007,0.994991,0.991836,0.985285,0.980681,0.977275,0.957609,0.957904,0.982039,0.983484,0.988049,0.98597,0.99123,0.98548,0.985158,0.990308,0.953922,0.964635,0.981685,0.93941,0.993662,0.988989,0.94193,0.979211,0.983027,0.974902,0.973752,0.987204,0.988996,0.964602,0.98931,0.997292,0.996129,0.975297,0.995212,0.965325,0.985985,0.962929,0.994502,0.991796,0.989266,0.996494,0.960268,0.987864,0.994737,0.977475,0.990146,0.989535,0.988878,0.968915,0.968022,0.984283,0.988276,0.922835,1.0


In [19]:
# df.dropna(inplace=True)
# df.drop_duplicates(subset=['track_name'],inplace=True,keep='first',ignore_index=True)
# df1 = df[['popularity', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']]
# df2 = pd.concat([df['track_id'], df1], axis=1)
# df2.head(2)
# df3 = pd.DataFrame(cosine_similarity(df1.head(500)))
# df3.columns = df2['track_id'].head(500)
# df4 = df3.round(3)

# df4 = df4.replace(1.000, 0.0)
# df4['Max'] = df4.idxmax(axis=1)
# df6 = pd.concat([df2[['track_id']].head(500), df4], axis=1)
# df6[['track_id', 'Max']].to_csv('/home/parora41/bds_sample.csv')

In [20]:
spark.stop()