In [None]:
#!pip install pyspark

In [1]:
%matplotlib inline

import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from pyspark import SparkContext, SparkConf, StorageLevel, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import functions as F
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Recommendation_system').getOrCreate()

In [3]:
df = spark.read.csv('reports11.csv', header=True)
df=df.orderBy('mostRelevantTags')
df.show()

+--------------------+--------------------+-----+
|                data|    mostRelevantTags|views|
+--------------------+--------------------+-----+
|boston consulting...|                null|    2|
|2020/2021 rpm bas...|                null|    0|
|Open Source Softw...|                null|    2|
|aon software inve...|      Aon , Software|    0|
|emc connectivity ...| EMC , Connectivi...|    0|
|emc software indi...| EMC , Software, ...|    0|
|usg government gl...| USG , Government...|    0|
|3m south africa b...|3M , South Africa...|    0|
|advertising apple...|Advertising, Appl...|    0|
|advertising intel...|Advertising, Inte...|    0|
|advertising new y...|Advertising, New ...|    0|
|agriculture asia ...|   Agriculture, Asia|    0|
|agriculture asia ...|   Agriculture, Asia|    0|
|agriculture coca-...|Agriculture, Coca...|    0|
|agriculture globa...|Agriculture, Glob...|    0|
|agriculture india...|Agriculture, Indi...|    0|
|agriculture india...|Agriculture, Indi...|    0|


In [4]:
df=df.fillna( { 'data':0, 'mostRelevantTags':0 , 'views':0} )

In [5]:
nd=df.select(df['mostRelevantTags'],df['data'],df['views'])
nd.show()

+--------------------+--------------------+-----+
|    mostRelevantTags|                data|views|
+--------------------+--------------------+-----+
|                   0|boston consulting...|    2|
|                   0|2020/2021 rpm bas...|    0|
|                   0|Open Source Softw...|    2|
|      Aon , Software|aon software inve...|    0|
| EMC , Connectivi...|emc connectivity ...|    0|
| EMC , Software, ...|emc software indi...|    0|
| USG , Government...|usg government gl...|    0|
|3M , South Africa...|3m south africa b...|    0|
|Advertising, Appl...|advertising apple...|    0|
|Advertising, Inte...|advertising intel...|    0|
|Advertising, New ...|advertising new y...|    0|
|   Agriculture, Asia|agriculture asia ...|    0|
|   Agriculture, Asia|agriculture asia ...|    0|
|Agriculture, Coca...|agriculture coca-...|    0|
|Agriculture, Glob...|agriculture globa...|    0|
|Agriculture, Indi...|agriculture india...|    0|
|Agriculture, Indi...|agriculture india...|    0|


In [6]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.sql.functions import col
indexer = [StringIndexer(inputCol=column, outputCol=column+"_index") for column in list(set(nd.columns)-set(['views'])) ]
pipeline = Pipeline(stages=indexer)
transformed = pipeline.fit(nd).transform(nd)
transformed = transformed.select(transformed.mostRelevantTags.cast("string"),transformed.data.cast("string"),transformed.mostRelevantTags_index.cast("integer"),transformed.data_index.cast("integer"),transformed.views.cast("double"))
transformed.show()

+--------------------+--------------------+----------------------+----------+-----+
|    mostRelevantTags|                data|mostRelevantTags_index|data_index|views|
+--------------------+--------------------+----------------------+----------+-----+
|                   0|Open Source Softw...|                    17|       331|  2.0|
|                   0|2020/2021 rpm bas...|                    17|       326|  0.0|
|                   0|boston consulting...|                    17|       455|  2.0|
|      Aon , Software|aon software inve...|                   365|       279|  0.0|
| EMC , Connectivi...|emc connectivity ...|                   101|       228|  0.0|
| EMC , Software, ...|emc software indi...|                   151|       463|  0.0|
| USG , Government...|usg government gl...|                   145|       449|  0.0|
|3M , South Africa...|3m south africa b...|                   278|       394|  0.0|
|Advertising, Appl...|advertising apple...|                   317|       217

In [7]:
(training,test)=transformed.randomSplit([0.8, 0.2])

In [8]:
als=ALS(maxIter=20,regParam=0.5,rank=40,userCol="mostRelevantTags_index",itemCol="data_index",ratingCol="views",coldStartStrategy="drop",nonnegative=True)
model=als.fit(training)

In [9]:
evaluator=RegressionEvaluator(metricName="rmse",labelCol="views",predictionCol="prediction")
predictions=model.transform(test)
rmse=evaluator.evaluate(predictions)
print("RMSE="+str(rmse))
predictions.show()

RMSE=0.27890215509810207
+--------------------+--------------------+----------------------+----------+-----+-------------+
|    mostRelevantTags|                data|mostRelevantTags_index|data_index|views|   prediction|
+--------------------+--------------------+----------------------+----------+-----+-------------+
|              Global|global report of ...|                    11|        13|  0.0|          0.0|
|India, Intel, Edu...|india intel educa...|                    15|         6|  0.0|          0.0|
|Germany, Andrew, ...|germany andrew fo...|                    19|         3|  0.0|          0.0|
|Technology, Food,...|technology food g...|                    12|         5|  0.0|          0.0|
|Technology, Food,...|technology food g...|                    12|         5|  0.0|          0.0|
|   Education, Global|education global ...|                     1|         9|  0.0|   0.88196605|
|India, Marketing,...|india marketing i...|                    24|         8|  0.0|          

In [10]:
user_recs=model.recommendForAllUsers(20).show(10)

+----------------------+--------------------+
|mostRelevantTags_index|     recommendations|
+----------------------+--------------------+
|                   392|[[0, 0.0], [10, 0...|
|                   243|[[0, 0.0], [10, 0...|
|                    31|[[421, 50.081444]...|
|                   251|[[0, 0.0], [10, 0...|
|                    53|[[0, 0.0], [10, 0...|
|                   255|[[421, 26.230612]...|
|                   296|[[0, 0.0], [10, 0...|
|                   322|[[0, 0.0], [10, 0...|
|                    78|[[0, 0.0], [10, 0...|
|                   321|[[0, 0.0], [10, 0...|
+----------------------+--------------------+
only showing top 10 rows



In [12]:
import pandas as pd
recs=model.recommendForAllUsers(10).toPandas()
nrecs=recs.recommendations.apply(pd.Series) \
            .merge(recs, right_index = True, left_index = True) \
            .drop(["recommendations"], axis = 1) \
            .melt(id_vars = ['mostRelevantTags_index'], value_name = "recommendation") \
            .drop("variable", axis = 1) \
            .dropna() 
nrecs=nrecs.sort_values('mostRelevantTags_index')
nrecs=pd.concat([nrecs['recommendation'].apply(pd.Series), nrecs['mostRelevantTags_index']], axis = 1)
nrecs.columns = [
        
        'data_index',
        'views',
        'mostRelevantTags_index'
       
     ]
md=transformed.select(transformed['mostRelevantTags'],transformed['mostRelevantTags_index'],transformed['data'],transformed['data_index'])
md=md.toPandas()
dict1 =dict(zip(md['mostRelevantTags_index'],md['mostRelevantTags']))
dict2=dict(zip(md['data_index'],md['data']))
nrecs['mostRelevantTags']=nrecs['mostRelevantTags_index'].map(dict1)
nrecs['data']=nrecs['data_index'].map(dict2)
nrecs=nrecs.sort_values('mostRelevantTags')
nrecs.reset_index(drop=True, inplace=True)
new=nrecs[['mostRelevantTags','data','views']]
new['recommendations'] = list(zip(new.data, new.views))
res=new[['mostRelevantTags','recommendations']]  
res_new=res['recommendations'].groupby([res.mostRelevantTags]).apply(list).reset_index()
print(res_new)

                         mostRelevantTags  \
0     EMC , Connectivity, India, Hardware   
1           EMC , Software, India, Global   
2                USG , Government, Global   
3                                       0   
4      3M , South Africa, Banking, Global   
..                                    ...   
321              United States, Insurance   
322                  Venezuela, Education   
323            cloud, Global, Engineering   
324              cloud, Insurance, Global   
325              eBay, Europe, Technology   

                                       recommendations  
0    [(kenya education empowering women kenya â€™ t...  
1    [(alcoa india insurance the effects iowa â€™ p...  
2    [(india manufacturing progressive nifty 2020, ...  
3    [(intel manufacturing canada jefferies pet ind...  
4    [(government global migrant worker management ...  
..                                                 ...  
321  [(kenya education empowering women kenya â€™ t...  
322

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
