**Building recommender system using content based filtering approach**

The following code:

* Builds weighted one-hot encoded item embeddings in the feature space
* Projects customers into the same embedding space
* Performs dimensionality reduction using PCA, keeping the first 100 principal components
* Finds the N most similar items using approximate nearest-neighbour search (`BucketedRandomProjectionLSH.approxNearestNeighbors`) from Spark MLlib

Cold start approach: recommend most frequent items.

Input data is limited to 10,000 transactions due to memory constraints.



In [2]:
!pip install pyspark



In [3]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import col, lit, lower
from pyspark.ml.feature import BucketedRandomProjectionLSH

# Article metadata columns selected from articles.csv.
features = [
    'article_id', 'prod_name', 'product_type_name',
    'product_group_name',
    'graphical_appearance_name', 'colour_group_name',
    'perceived_colour_value_name',
    'perceived_colour_master_name',
    'department_name', 'index_name',
    'index_group_name', 'section_name',
    'garment_group_name', 'detail_desc',
]

# Categorical article columns that are lower-cased (to_lower) and pivoted into
# one-hot columns (ohe).
pivot_cols = [
    'product_group_name',
    'graphical_appearance_name', 'colour_group_name',
    'perceived_colour_value_name',
    'perceived_colour_master_name',
    'department_name', 'index_name',
    'index_group_name', 'section_name',
    'garment_group_name',
]

spark = SparkSession.builder.appName('Recommendations').getOrCreate()

# Purchase log without the channel and price columns, capped at 10000 rows.
transactions = (
    spark.read.options(header=True)
    .csv("../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv")
    .drop('sales_channel_id', 'price')
    .limit(10000)
)

# Article metadata restricted to the columns listed in `features`.
items = (
    spark.read.options(header=True)
    .csv("../input/h-and-m-personalized-fashion-recommendations/articles.csv")
    .select(features)
)

# Customer ids taken from the sample submission file.
rcmnds = (
    spark.read.options(header=True)
    .csv('../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv')
    .select('customer_id')
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/02/22 05:27:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
transactions

DataFrame[t_dat: string, customer_id: string, article_id: string]

In [5]:
items

DataFrame[article_id: string, prod_name: string, product_type_name: string, product_group_name: string, graphical_appearance_name: string, colour_group_name: string, perceived_colour_value_name: string, perceived_colour_master_name: string, department_name: string, index_name: string, index_group_name: string, section_name: string, garment_group_name: string, detail_desc: string]

In [6]:
rcmnds

DataFrame[customer_id: string]

In [7]:
def to_lower(items):
    """Return `items` with every column named in `pivot_cols` lower-cased.

    Each listed column is replaced in turn by the lower-cased version of
    itself; all other columns are left as they are.
    """
    lowered = items
    for name in pivot_cols:
        lowered = lowered.withColumn(name, lower(col(name)))
    return lowered

In [8]:
def ohe(items):
    """One-hot encode every column in `pivot_cols`, keyed by ``article_id``.

    For each pivot column the rows are grouped by ``article_id`` and pivoted
    on that column with a row count; missing cells are filled with 0 and the
    value columns are renamed ``e_<pivot_col>_<i>``, where ``i`` is the
    column's position in the pivot output. The per-column frames are then
    inner-joined on ``article_id`` into a single frame, which is returned.
    """
    keys = ['article_id']
    n_keys = len(keys)

    encoded = []
    for pivot_col in pivot_cols:
        pivoted = items.groupBy(keys).pivot(pivot_col).count()
        n_values = len(pivoted.columns) - n_keys
        renamed = pivoted.columns[:n_keys] + [
            "e_{0}_{1}".format(pivot_col, idx) for idx in range(n_values)
        ]
        encoded.append(pivoted.toDF(*renamed).fillna(0))

    # Fold from the right so the joins nest as first.join(second.join(...)).
    item_feature = encoded[-1]
    for frame in reversed(encoded[:-1]):
        item_feature = frame.join(item_feature, on=keys, how='inner')

    return item_feature

In [9]:
# Lower-case the categorical columns listed in pivot_cols before they are
# pivoted, so values that differ only by letter case share one pivot value.
items = to_lower(items)

In [10]:
# One row per article_id with the one-hot (e_*) columns for every pivot column.
item_feature = ohe(items)

                                                                                

In [11]:
# Attach the one-hot article columns to every transaction row; the left join
# keeps transactions whose article_id has no match in item_feature.
# NOTE(review): features[1:] names the raw article columns, which item_feature
# does not appear to carry, so the drop() looks like a no-op — confirm.
# NOTE(review): this rebinds `transactions`, so re-running the cell joins the
# feature columns a second time.
transactions = transactions.join(item_feature, on='article_id', how='left').sort('t_dat').drop(*features[1:])

In [12]:
# Pick the one-hot columns by the 'e_' prefix that ohe() gives them instead of
# by position, so the selection does not depend on the column order produced
# by the join above.
dummy_features = [name for name in transactions.columns if name.startswith('e_')]

In [13]:
# One row per customer: each one-hot column is summed over all of that
# customer's transactions.
user_feature = transactions.groupBy('customer_id').sum(*dummy_features)

In [None]:
# from pyspark.ml.feature import VectorAssembler, StandardScaler, PCA

# def get_pca(df, col):
    
    
#     assembler = VectorAssembler(inputCols=df.columns[1:], outputCol="sparse_features")
    
#     feature_vectors = assembler.transform(df).select(*(col, "sparse_features"))


#     scaler = StandardScaler(inputCol="sparse_features", outputCol="scaled_features")
#     scalerModel = scaler.fit(feature_vectors)

#     scaled_feature_vectors = scalerModel.transform(feature_vectors).select(*(col, "scaled_features"))

#     pca = PCA(k=100, inputCol="scaled_features", outputCol="pca")
#     pcaModel = pca.fit(scaled_feature_vectors)
#     x = pcaModel.transform(scaled_feature_vectors).select(*(col, "pca"))
    
#     return x


# user_feature_pca = get_pca(weighted_user_feature, 'customer_id')
# item_feature_pca = get_pca(item_feature, 'article_id')

In [14]:
from pyspark.ml.feature import VectorAssembler, StandardScaler, PCA
from pyspark.ml import Pipeline

def scale(df, col):
    """Assemble the feature columns of `df` into one vector and scale it.

    Every column of `df` except the first is fed to a VectorAssembler; a
    StandardScaler is then fitted on the assembled vectors and applied to them.

    Parameters
    ----------
    df : DataFrame
        Input frame; the first column is skipped when collecting features
        (presumably it is the id column — callers pass it as `col`).
    col : str
        Name of the column to carry through to the output. Inside this
        function the name shadows pyspark's `col` helper.

    Returns
    -------
    DataFrame
        Two columns: `col` and "scaled_features".
    """
    feature_cols = df.columns[1:]
    assembled = (
        VectorAssembler(inputCols=feature_cols, outputCol="sparse_features")
        .transform(df)
        .select(col, "sparse_features")
    )

    fitted_scaler = StandardScaler(
        inputCol="sparse_features", outputCol="scaled_features"
    ).fit(assembled)

    return fitted_scaler.transform(assembled).select(col, "scaled_features")


def get_pca(df, col, k=100):
    """Fit a PCA model on the "scaled_features" column of `df`.

    Parameters
    ----------
    df : DataFrame
        Frame containing a "scaled_features" vector column.
    col : str
        Unused; kept so existing calls that pass the id column still work.
    k : int, optional
        Number of principal components to keep (default 100, the value that
        was previously hard-coded).

    Returns
    -------
    PCAModel
        The fitted model; its transform() writes the projection to "pca".
    """
    pca = PCA(k=k, inputCol="scaled_features", outputCol="pca")
    return pca.fit(df)

In [15]:
# Customers and articles are scaled independently: each scale() call fits its
# own StandardScaler on the frame it is given.
scaled_user_feature = scale(user_feature, 'customer_id')
scaled_item_feature = scale(item_feature, 'article_id')

23/02/22 05:27:56 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


[Stage 115:>                                                        (0 + 1) / 1]

23/02/22 05:28:12 WARN DAGScheduler: Broadcasting large task binary with size 1091.4 KiB


                                                                                

23/02/22 05:28:41 WARN DAGScheduler: Broadcasting large task binary with size 1092.0 KiB




23/02/22 05:28:56 WARN DAGScheduler: Broadcasting large task binary with size 1242.3 KiB


[Stage 185:>                                                        (0 + 1) / 1]

23/02/22 05:29:00 WARN DAGScheduler: Broadcasting large task binary with size 1790.8 KiB


[Stage 210:>                                                        (0 + 1) / 1]

23/02/22 05:29:03 WARN DAGScheduler: Broadcasting large task binary with size 1904.5 KiB


[Stage 262:>                (0 + 1) / 1][Stage 266:>                (0 + 1) / 1]

23/02/22 05:29:16 WARN DAGScheduler: Broadcasting large task binary with size 1160.9 KiB




23/02/22 05:29:47 WARN DAGScheduler: Broadcasting large task binary with size 1196.6 KiB


                                                                                

In [16]:
# The PCA model is fitted on the customer vectors only.
pca_model = get_pca(scaled_user_feature, 'customer_id')



23/02/22 05:29:59 WARN DAGScheduler: Broadcasting large task binary with size 1088.3 KiB


                                                                                

23/02/22 05:30:25 WARN DAGScheduler: Broadcasting large task binary with size 1089.0 KiB


[Stage 388:>                                                        (0 + 4) / 4]

23/02/22 05:30:40 WARN DAGScheduler: Broadcasting large task binary with size 1239.3 KiB


[Stage 412:>                                                        (0 + 1) / 1]

23/02/22 05:30:43 WARN DAGScheduler: Broadcasting large task binary with size 1668.2 KiB


                                                                                

23/02/22 05:30:44 WARN DAGScheduler: Broadcasting large task binary with size 1668.2 KiB


                                                                                

23/02/22 05:30:45 WARN DAGScheduler: Broadcasting large task binary with size 1670.2 KiB


                                                                                

23/02/22 05:30:46 WARN DAGScheduler: Broadcasting large task binary with size 1668.7 KiB


                                                                                

23/02/22 05:30:48 WARN DAGScheduler: Broadcasting large task binary with size 1669.4 KiB


                                                                                

23/02/22 05:30:50 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeSystemLAPACK
23/02/22 05:30:50 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeRefLAPACK


In [17]:
# Project customers and articles with the same PCA model (fitted on customers).
# NOTE(review): the article vectors were scaled by a separately fitted
# StandardScaler — confirm the two projections are comparable.
user_feature_pca = pca_model.transform(scaled_user_feature)
item_feature_pca = pca_model.transform(scaled_item_feature)

In [18]:
user_feature_pca

DataFrame[customer_id: string, scaled_features: vector, pca: vector]

In [19]:
item_feature_pca

DataFrame[article_id: string, scaled_features: vector, pca: vector]

In [None]:
# user_feature_pca.write.parquet("./user_feature_pca.parquet")
# item_feature_pca.write.parquet("./item_feature_pca.parquet")

In [20]:
from pyspark.ml.feature import BucketedRandomProjectionLSH
from pyspark.sql.functions import col, udf
import pyspark.sql.functions as F

def get_rcmnds(customer, k=12):
    """Return the `k` approximate nearest articles for one customer.

    Parameters
    ----------
    customer : Row
        Must expose a `pca` attribute holding the customer's PCA vector.
    k : int, optional
        Number of neighbours to request (default 12).

    Returns
    -------
    list of Row
        Collected rows with a single `article_id` field, taken from
        `item_feature_pca`.

    Notes
    -----
    The LSH model is fitted on the module-level `user_feature_pca` with fixed
    parameters and seed. It used to be re-fitted on every call, i.e. once per
    customer in the driver loop; the fitted model is now kept on the function
    object and reused for as long as `user_feature_pca` is the same object.
    """
    cached = getattr(get_rcmnds, "_fitted", None)
    if cached is None or cached[0] is not user_feature_pca:
        brp = BucketedRandomProjectionLSH(inputCol="pca", outputCol="hashes", seed=12345, bucketLength=1.0)
        # Keep the frame next to the model so a re-created frame triggers a refit.
        cached = (user_feature_pca, brp.fit(user_feature_pca))
        get_rcmnds._fitted = cached

    model = cached[1]
    neighbours = model.approxNearestNeighbors(item_feature_pca, customer.pca, k)
    return neighbours.select('article_id').collect()

In [21]:
# Left-join the target customers onto the customers that have a PCA vector.
# Matched rows carry flag=True; unmatched rows get a null flag, which
# fillna(False) turns into False.
flagged = (
    rcmnds
    .join(user_feature_pca.withColumn('flag', F.lit(True)), on='customer_id', how='left')
    .fillna(False)
)

# Split on the flag, then discard it.
with_history = flagged.filter('flag').drop('flag')
cold_start = flagged.filter('!flag').drop('flag')

In [22]:
# Bring every customer-with-history row (id plus its vector columns) to the
# driver so it can be iterated over in plain Python.
rows = with_history.collect()

[Stage 571:>                                                        (0 + 1) / 1]

23/02/22 05:31:07 WARN DAGScheduler: Broadcasting large task binary with size 1091.4 KiB


                                                                                

23/02/22 05:31:41 WARN DAGScheduler: Broadcasting large task binary with size 1092.0 KiB




23/02/22 05:31:55 WARN DAGScheduler: Broadcasting large task binary with size 1242.4 KiB


[Stage 641:>                                                        (0 + 1) / 1]

23/02/22 05:31:59 WARN DAGScheduler: Broadcasting large task binary with size 2.1 MiB


                                                                                

In [23]:
rows[0]

Row(customer_id='000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318', scaled_features=SparseVector(467, {16: 2.3241, 44: 0.9758, 50: 0.6153, 73: 2.609, 100: 0.4532, 101: 1.0403, 108: 0.6199, 119: 1.8555, 169: 3.4127, 192: 1.578, 382: 1.9109, 387: 0.8273, 436: 2.5315, 464: 2.1834}), pca=DenseVector([-1.5512, 0.7372, 1.9728, -2.3521, -2.3177, 0.2438, 1.5743, -1.258, -2.4537, -0.3919, 1.0904, -0.2851, -0.2111, 0.0758, 0.4033, 0.6975, 0.0137, 0.1488, 0.0697, -0.3862, -0.081, -0.149, -0.1779, -0.0541, -0.0647, 0.1924, 0.0005, 0.8418, 0.5, 0.0659, -0.0915, 0.1752, 0.2201, -0.2583, -0.3694, -0.4289, 0.2696, 0.2568, -0.212, -0.1437, 0.0077, 0.2443, 0.0297, 0.0317, 0.1871, -0.3317, -0.3604, -0.3497, -0.5827, 0.0597, 0.1464, 0.0459, -0.4952, 0.1354, 0.2018, 0.1793, 0.0304, -0.2281, -0.0038, 0.4217, -0.2414, 0.0497, -0.3532, -0.6692, -0.1722, -0.0783, 0.326, -0.689, -0.1874, -0.3543, 0.1921, -0.3941, 0.1569, 0.114, -0.5421, 0.0324, -0.6049, -0.9496, 0.1455, -0.5577, 0.406, -0.0216, 

In [None]:
# Query the nearest articles one customer at a time on the driver.
# `customers` and `items` are filled in step, so they stay aligned (and keep
# the rows processed so far) if the loop is interrupted.
# NOTE: `items` is rebound to a list here; until now it named the articles
# DataFrame, so earlier cells that read `items` need a fresh kernel to re-run.
customers = []
items = []
for row in rows:
    temp = get_rcmnds(row)
    customers.append(row['customer_id'])
    # Space-separated article ids for this customer.
    items.append(' '.join(hit['article_id'] for hit in temp))

[Stage 689:=> (1 + 1) / 2][Stage 695:=> (1 + 1) / 2][Stage 697:>  (0 + 1) / 1]4]

23/02/22 05:32:24 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB




23/02/22 05:33:07 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB


[Stage 795:>  (0 + 1) / 1][Stage 797:=> (1 + 1) / 2][Stage 799:>  (0 + 1) / 1]  

23/02/22 05:33:45 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB


[Stage 842:=> (1 + 1) / 2][Stage 848:=> (1 + 1) / 2][Stage 850:>  (0 + 1) / 1]  

23/02/22 05:34:19 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB


[Stage 901:>                                                        (0 + 1) / 1]

23/02/22 05:34:46 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB




23/02/22 05:35:10 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB


[Stage 1003:>                                                       (0 + 1) / 1]

23/02/22 05:35:35 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB




23/02/22 05:36:00 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB




23/02/22 05:36:24 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB


[Stage 1156:>                                                       (0 + 1) / 1]

23/02/22 05:36:48 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB


[Stage 1207:>                                                       (0 + 1) / 1]

23/02/22 05:37:13 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB




23/02/22 05:37:37 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB


[Stage 1309:>                                                       (0 + 1) / 1]

23/02/22 05:38:01 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB




23/02/22 05:38:26 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB




23/02/22 05:38:50 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB


[Stage 1462:>                                                       (0 + 1) / 1]

23/02/22 05:39:14 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB


[Stage 1505:=>(1 + 1) / 2][Stage 1511:=>(1 + 1) / 2][Stage 1513:> (0 + 1) / 1]  

23/02/22 05:39:38 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB


[Stage 1564:>                                                       (0 + 1) / 1]

23/02/22 05:40:03 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB


[Stage 1615:>                                                       (0 + 1) / 1]

23/02/22 05:40:27 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB


[Stage 1666:>                                                       (0 + 1) / 1]

23/02/22 05:40:53 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB


[Stage 1717:>                                                       (0 + 1) / 1]

23/02/22 05:41:18 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB


[Stage 1768:>                                                       (0 + 1) / 1]

23/02/22 05:41:42 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB


[Stage 1819:>                                                       (0 + 1) / 1]

23/02/22 05:42:07 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB


[Stage 1862:=>(1 + 1) / 2][Stage 1868:=>(1 + 1) / 2][Stage 1870:> (0 + 1) / 1]4]

23/02/22 05:42:31 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB


[Stage 1921:>                                                       (0 + 1) / 1]

23/02/22 05:42:55 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB




23/02/22 05:43:19 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB




23/02/22 05:43:44 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB




23/02/22 05:44:08 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB




23/02/22 05:44:33 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB


[Stage 2176:>                                                       (0 + 1) / 1]

23/02/22 05:44:58 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB


[Stage 2219:=>(1 + 1) / 2][Stage 2225:=>(1 + 1) / 2][Stage 2227:> (0 + 1) / 1]4]

23/02/22 05:45:23 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB




23/02/22 05:45:48 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB




23/02/22 05:46:12 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB


[Stage 2380:>                                                       (0 + 1) / 1]

23/02/22 05:46:36 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB


[Stage 2431:>                                                       (0 + 1) / 1]

23/02/22 05:46:59 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB




23/02/22 05:47:24 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB


[Stage 2531:>               (0 + 2) / 2][Stage 2533:>               (0 + 1) / 1]

23/02/22 05:47:48 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB


[Stage 2584:>                                                       (0 + 1) / 1]

23/02/22 05:48:13 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB




23/02/22 05:48:38 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB


[Stage 2686:>                                                       (0 + 1) / 1]

23/02/22 05:49:02 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB


[Stage 2735:>               (0 + 2) / 2][Stage 2737:>               (0 + 1) / 1]

23/02/22 05:49:27 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB


[Stage 2788:>                                                       (0 + 1) / 1]

23/02/22 05:49:52 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB




23/02/22 05:50:16 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB


[Stage 2882:=>(1 + 1) / 2][Stage 2888:> (0 + 2) / 2][Stage 2890:> (0 + 1) / 1]4]

23/02/22 05:50:41 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB


[Stage 2941:>                                                       (0 + 1) / 1]

23/02/22 05:51:06 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB




23/02/22 05:51:31 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB




23/02/22 05:51:56 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB


[Stage 3086:=>(1 + 1) / 2][Stage 3092:=>(1 + 1) / 2][Stage 3094:> (0 + 1) / 1]  

23/02/22 05:52:21 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB




23/02/22 05:52:46 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB




23/02/22 05:53:12 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB


[Stage 3247:>                                                       (0 + 1) / 1]

23/02/22 05:53:37 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB




23/02/22 05:54:02 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB




23/02/22 05:54:28 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB


[Stage 3400:>                                                       (0 + 1) / 1]

23/02/22 05:54:52 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB


[Stage 3451:>                                                       (0 + 1) / 1]

23/02/22 05:55:17 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB




23/02/22 05:55:41 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB




23/02/22 05:56:05 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB




23/02/22 05:56:30 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB


[Stage 3655:>                                                       (0 + 1) / 1]

23/02/22 05:56:54 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB




23/02/22 05:57:19 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB




23/02/22 05:57:44 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB




23/02/22 05:58:10 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB


[Stage 3855:> (0 + 1) / 1][Stage 3857:=>(1 + 1) / 2][Stage 3859:> (0 + 1) / 1]  

23/02/22 05:58:34 WARN DAGScheduler: Broadcasting large task binary with size 1489.5 KiB


                                                                                

In [None]:
import pandas as pd

# Recommendations gathered by the driver loop, one row per customer.
with_history_df = pd.DataFrame({'customer_id': customers, 'items': items})

# Cold-start fallback: the 12 article_ids with the highest row count in
# `transactions`.
most_freq = (
    transactions
    .groupBy('article_id')
    .count()
    .orderBy(col('count').desc())
    .limit(12)
    .collect()
)
default = [row[0] for row in most_freq]

# Every customer without history gets the same space-separated fallback list.
# NOTE: `cold_start` changes from a Spark DataFrame to a pandas DataFrame here,
# so this cell cannot be run twice in a row.
cold_start = (
    cold_start
    .withColumn('items', lit(' '.join(default)))
    .select('customer_id', 'items')
    .toPandas()
)

cold_start.head()

In [None]:
# Stack the cold-start rows on top of the history-based rows.
# DataFrame.append was removed in pandas 2.0; pd.concat builds the same frame
# and also works on older pandas versions.
all_rcmnds = pd.concat([cold_start, with_history_df])

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

# NOTE(review): `all_rcmnds` is a pandas DataFrame holding `customer_id` and
# `items`; it has neither a "count" nor a "prediction" column, and
# RegressionEvaluator presumably expects a Spark DataFrame. This cell looks
# like it cannot run as written — TODO replace it with an evaluation that has
# real labels, or remove it.
evaluator=RegressionEvaluator(metricName="rmse",labelCol="count",predictionCol="prediction")
rmse=evaluator.evaluate(all_rcmnds)
print(rmse)