### https://medium.com/@patelneha1495/recommendation-system-in-python-using-als-algorithm-and-apache-spark-27aca08eaab3

In [1]:
spark

In [2]:
sc

In [3]:
df = spark.read.csv("hdfs://devenv/user/spark/recommendation_system/data/ratings.csv",header=True,
                   schema="COMMODITY_DESC string, household_key string, QUANTITY Integer,\
                   BASKET_ID string, rating float, rating_label Integer") 

In [4]:
df_new = df.select(df['COMMODITY_DESC'],df['household_key'],df['rating_label'])

In [5]:
df_new.show()

+--------------------+-------------+------------+
|      COMMODITY_DESC|household_key|rating_label|
+--------------------+-------------+------------+
|              YOGURT|         2500|           5|
|WATER - CARBONATE...|         2500|           5|
|    WAREHOUSE SNACKS|         2500|           5|
|            VITAMINS|         2500|           5|
|    VEGETABLES SALAD|         2500|           3|
|VEGETABLES - SHEL...|         2500|           5|
|VEGETABLES - ALL ...|         2500|           5|
|   VALUE ADDED FRUIT|         2500|           4|
|           VALENTINE|         2500|           5|
|      TROPICAL FRUIT|         2500|           5|
|            TOMATOES|         2500|           4|
|             TICKETS|         2500|           4|
|                TEAS|         2500|           5|
|     SYRUPS/TOPPINGS|         2500|           3|
|    SUGARS/SWEETNERS|         2500|           5|
|         STONE FRUIT|         2500|           5|
|STATIONERY & SCHO...|         2500|           5|


### Importing important modules

In [8]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

### Converting String to index 
- Before making an ALS model it needs to be clear that ALS only accepts integer value as parameters. Hence we need to convert asin and reviewerID column in index form.
- StringIndexer 
 - MLlib method
 - A label indexer that maps a string column of labels to an ML column of label indices. If the input column is numeric, we cast it to string and index the string values.

In [9]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

# encoding
indexer = [StringIndexer(inputCol=column, outputCol=column+"_index") for column in list(set(df_new.columns)-set(['rating_label']))]

# encoding 簡單一點的寫法
#indexer = [StringIndexer(inputCol=column, outputCol=column+"_index") for column in ['COMMODITY_DESC','household_key']]


# make pipeline
pipeline = Pipeline(stages=indexer)

# fit and transform
transformed = pipeline.fit(df_new).transform(df_new)

transformed.show()

+--------------------+-------------+------------+--------------------+-------------------+
|      COMMODITY_DESC|household_key|rating_label|COMMODITY_DESC_index|household_key_index|
+--------------------+-------------+------------+--------------------+-------------------+
|              YOGURT|         2500|           5|                51.0|              234.0|
|WATER - CARBONATE...|         2500|           5|                35.0|              234.0|
|    WAREHOUSE SNACKS|         2500|           5|                81.0|              234.0|
|            VITAMINS|         2500|           5|               162.0|              234.0|
|    VEGETABLES SALAD|         2500|           3|                65.0|              234.0|
|VEGETABLES - SHEL...|         2500|           5|                16.0|              234.0|
|VEGETABLES - ALL ...|         2500|           5|                36.0|              234.0|
|   VALUE ADDED FRUIT|         2500|           4|               104.0|              234.0|

### Creating training and test data

In [10]:
(training,test)=transformed.randomSplit([0.8, 0.2])

### Creating ALS model and fitting data
- rating_label type must be numeric

### https://www.twblogs.net/a/5c2917d6bd9eee01606d2f58
### https://codertw.com/%E7%A8%8B%E5%BC%8F%E8%AA%9E%E8%A8%80/563826/
### https://medium.com/@patelneha1495/recommendation-system-in-python-using-als-algorithm-and-apache-spark-27aca08eaab3

In [11]:
als=ALS(maxIter=5, # 算法迭代次數 maximum number of iterations
        regParam=0.09, #正則項權重
        rank=25, #模型中隱藏因子數目
        userCol="household_key_index", #column name for user ids. Ids must be (or can be coerced into) integers.
        itemCol="COMMODITY_DESC_index", #column name for item ids. Ids must be (or can be coerced into) integers.
        ratingCol="rating_label", #column name for ratings
        coldStartStrategy="drop", #drop any rows in the DataFrame of predictions that contain NaN values.
        # the model for new_user/new_prod,you can't predictions recommendation,
        # so you set drop you will drop the new_user/new_prod
        nonnegative=True) #商品推薦分數是否是非負的


model=als.fit(training)

### Generate predictions and evaluate rmse

In [12]:
evaluator=RegressionEvaluator(metricName="rmse",labelCol="rating_label",predictionCol="prediction")

predictions=model.transform(test)

rmse=evaluator.evaluate(predictions) # Evaluates the output with optional parameters.
print("RMSE="+str(rmse))

RMSE=0.8321281598490806


In [13]:
predictions.toPandas()

Unnamed: 0,COMMODITY_DESC,household_key,rating_label,COMMODITY_DESC_index,household_key_index,prediction
0,PIES,2297,5,148.0,2122.0,4.367234
1,PIES,2075,2,148.0,540.0,3.215449
2,PIES,2247,4,148.0,897.0,4.308607
3,PIES,180,5,148.0,1025.0,4.209348
4,PIES,625,5,148.0,1339.0,4.414486
...,...,...,...,...,...,...
56770,SOAP - LIQUID & BAR,2044,5,89.0,1713.0,4.613431
56771,SOAP - LIQUID & BAR,859,5,89.0,180.0,4.251889
56772,SOAP - LIQUID & BAR,593,4,89.0,1656.0,4.116149
56773,SOAP - LIQUID & BAR,219,5,89.0,89.0,4.203546


### Providing Recommendations

In [14]:
# Generate top 20 recommendations for each user

user_recs=model.recommendForAllUsers(20).show(10)

+-------------------+--------------------+
|household_key_index|     recommendations|
+-------------------+--------------------+
|               1580|[[11, 4.768237], ...|
|                471|[[7, 4.688528], [...|
|               1591|[[1, 4.636035], [...|
|               1342|[[7, 4.7700953], ...|
|               2122|[[292, 4.8108144]...|
|               2142|[[7, 4.797579], [...|
|                463|[[292, 4.7037783]...|
|                833|[[7, 4.6923037], ...|
|               1645|[[7, 4.6384296], ...|
|                496|[[292, 4.6875854]...|
+-------------------+--------------------+
only showing top 10 rows



### Converting back to string form

In [15]:
import pandas as pd

# Generate top 10 recommendations for each user
recs=model.recommendForAllUsers(4).toPandas()


nrecs=recs.recommendations.apply(pd.Series) \
            .merge(recs, right_index = True, left_index = True) \
            .drop(["recommendations"], axis = 1) \
            .melt(id_vars = ['household_key_index'], value_name = "recommendation") \
            .drop("variable", axis = 1) \
            .dropna()
nrecs=nrecs.sort_values('household_key_index')

nrecs=pd.concat([nrecs['recommendation'].apply(pd.Series), nrecs['household_key_index']], axis = 1)
nrecs.columns = [
        
        'ProductID_index',
        'Rating',
        'UserID_index']

# raw data
md=transformed.select(transformed['household_key'],transformed['household_key_index'],transformed['COMMODITY_DESC'],transformed['COMMODITY_DESC_index'])
md=md.toPandas()


dict1 =dict(zip(md['household_key_index'],md['household_key']))
dict2=dict(zip(md['COMMODITY_DESC_index'],md['COMMODITY_DESC']))


nrecs['household_key']=nrecs['UserID_index'].map(dict1)
nrecs['ProductID']=nrecs['ProductID_index'].map(dict2)
nrecs=nrecs.sort_values('household_key')
nrecs.reset_index(drop=True, inplace=True)


new=nrecs[['household_key','ProductID','Rating']]
#new['recommendations'] = list(zip(new.ProductID, new.Rating))
new['recommendations'] = list(new.ProductID)


res=new[['household_key','recommendations']]  
res_new=res['recommendations'].groupby([res.household_key]).apply(list).reset_index()
print(res_new)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


     household_key                                    recommendations
0                1            [CHEESE, SOUP, SOFT DRINKS, BAG SNACKS]
1               10  [FLUID MILK PRODUCTS, SOFT DRINKS, SOUP, BAG S...
2              100  [SOUP, CHEESE, FLUID MILK PRODUCTS, RW FRESH P...
3             1000            [SOFT DRINKS, CHEESE, BAG SNACKS, SOUP]
4             1001  [HAIR CARE PRODUCTS, BAG SNACKS, SOUP, RW FRES...
...            ...                                                ...
2495           995  [RW FRESH PROCESSED MEAT, SOUP, CHEESE, SOFT D...
2496           996  [BAKED BREAD/BUNS/ROLLS, CHEESE, SOFT DRINKS, ...
2497           997  [HOUSEHOLD CLEANG NEEDS, FLUID MILK PRODUCTS, ...
2498           998  [CHEESE, SOFT DRINKS, BAKED BREAD/BUNS/ROLLS, ...
2499           999            [SOUP, SOFT DRINKS, BAG SNACKS, CHEESE]

[2500 rows x 2 columns]


In [27]:
res_new.to_csv('./output_data/res_new.csv',index=False)

In [28]:
def show_recommendation(household_key):
    if household_key not in res_new['household_key']:
        return('Customer not found.')
    else:
        recommendation_list = res_new[res_new['household_key'] == str(household_key)]['recommendations'].tolist()
        return [recommendation_list[0][i] for i in range(0,4)]
    

In [31]:
show_recommendation(1000)

['SOFT DRINKS', 'CHEESE', 'BAG SNACKS', 'SOUP']

In [32]:
import pymysql
import csv
import sys

host = '3.113.29.214'  # '3.113.29.214'
user = 'eric'  # 'eric'
passwd = '123456'  # '123456'
port = 3306
conninfo = {'host' : host ,'port' : port,'user' : user , 'passwd' : passwd, 'db' : 'recommendation_system','charset' : 'utf8mb4'}


def add_csv(path):
    try:
        conn = pymysql.connect(**conninfo)
        cursor = conn.cursor()
        csv_data = csv.reader(open(path))
        cursor.execute("""DROP TABLE IF EXISTS als_1""")
        cursor.execute("""CREATE TABLE als_1 (household_key longtext, recommendations longtext)""")
        
        for row in csv_data:
            if csv_data.line_num == 1:
                continue
            cursor.execute('INSERT INTO als_1 (household_key, recommendations)'\
                           'VALUES("%s", "%s")',row)
        # close the connection to the database.
        conn.commit()
        print("Done")
    except:
        print('異常')
        print(sys.exc_info()[0])
        print(sys.exc_info()[1])
    finally:
        cursor.close()
        conn.close()
        print("db close")

In [33]:
add_csv('./output_data/res_new.csv')

Done
db close
