In [1]:
import numpy as np
import pandas as pd
import scipy

In [2]:
import os
import sys
# Environment consumed by PySpark at start-up: the spark-submit arguments,
# the Python interpreter PySpark should use, and the Spark installation root.
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 3 pyspark-shell'
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Make the PySpark sources and the bundled py4j archive importable.
# The py4j file name is pinned to the version shipped with this Spark install.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Run PySpark's interactive-shell start-up script inside this kernel's
# namespace (the cell output reports that it makes `spark` available).
# The script is read inside a `with` block so the file handle is closed
# deterministically instead of being left to the garbage collector.
with open(os.path.join(spark_home, 'python/pyspark/shell.py')) as shell_script:
    exec(shell_script.read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.4
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [3]:
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
from pyspark.sql import Row
from IPython.display import clear_output

In [4]:
from pyspark.ml.recommendation import ALS

In [5]:
# Pairs to score: read the raw CSV lines, discard any line equal to the
# header, and keep the first two fields as integer userId / movieId.
df_test = (
    sc.textFile('/labs/lab09data/test.csv')
    .repartition(15)
    .filter(lambda line: line != 'userId,movieId,rating')
    .map(lambda line: line.split(','))
    .map(lambda fields: Row(userId=int(fields[0]), movieId=int(fields[1])))
    .toDF()
)

In [6]:
# Training ratings: read the raw CSV lines, discard any line equal to the
# header, and parse each line into (userId: int, movieId: int, rating: float).
df_ratings = (
    sc.textFile('/labs/lab09data/train.csv')
    .repartition(15)
    .filter(lambda line: line != 'userId,movieId,rating')
    .map(lambda line: line.split(','))
    .map(
        lambda fields: Row(
            userId=int(fields[0]),
            movieId=int(fields[1]),
            rating=float(fields[2]),
        )
    )
    .toDF()
)

In [7]:
#"MEMORY_AND_DISK"
# ALS estimator configuration, grouped by purpose.
als = ALS(
    # Which DataFrame columns hold the user id, item id and rating.
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    # Model and optimisation settings (explicit ratings, fixed seed).
    rank=10,
    maxIter=10,
    regParam=0.1,
    implicitPrefs=False,
    alpha=1.0,
    nonnegative=False,
    seed=23,
    # Partitioning, checkpointing and storage levels.
    numUserBlocks=10,
    numItemBlocks=10,
    checkpointInterval=10,
    intermediateStorageLevel="MEMORY_ONLY",
    finalStorageLevel="MEMORY_AND_DISK",
    # With "nan", predictions the model cannot produce come back as NaN
    # rather than being dropped; a later cell fills them with the mean.
    coldStartStrategy="nan",
)

In [8]:
# Fit the ALS estimator on the training ratings, then display the fitted
# model's rank as a quick sanity check.
model = als.fit(df_ratings)
model.rank

10

In [9]:
# Score every row of df_test and pull the whole result to the driver:
# `predicts` is a plain Python list of Row objects, so it must fit in
# driver memory.
predicts = model.transform(df_test).collect()

In [12]:
# Copy the training CSV from HDFS into the notebook's working directory.
!hadoop fs -get /labs/lab09data/train.csv

In [16]:
# Copy the test CSV from HDFS into the notebook's working directory.
!hadoop fs -get /labs/lab09data/test.csv

In [13]:
# Confirm the local copy of train.csv exists and check its size.
!ls -ls train.csv

165780 -rw-r--r-- 1 valery.baranov valery.baranov 169755196 Nov 21 19:48 train.csv


In [14]:
# Load the local copy of the training data into pandas.
# NOTE(review): df_train is not referenced by any later cell visible here;
# confirm it is still needed (the file is roughly 170 MB).
df_train = pd.read_csv('train.csv')

In [17]:
# Load the local copy of the test set into pandas; its row order is the
# order used for the final submission frame.
df_tests = pd.read_csv('test.csv')

In [18]:
# Remove the `rating` column from the test frame so that only the
# (userId, movieId) keys remain for the later merge with the predictions.
# Rebinding the name instead of dropping in place, together with
# errors='ignore', keeps this cell idempotent: re-running it after the
# column is already gone no longer raises KeyError.
df_tests = df_tests.drop('rating', axis=1, errors='ignore')

In [19]:
# Preview the first five test rows (head() defaults to 5).
df_tests.head()

Unnamed: 0,userId,movieId
0,1,1414
1,1,2346
2,1,5278
3,1,9303
4,1,11817


In [20]:
# Turn the collected list of Row objects into a pandas frame; the columns
# are labelled in the following step.
df_predicts = pd.DataFrame(predicts)

In [21]:
# Label the three columns by position; the model's output column is stored
# here under the name `rating`.
# NOTE(review): this relies on the collected rows being ordered
# (movieId, userId, prediction), presumably from Spark's Row field ordering.
# Confirm, or select the columns by name before collecting.
df_predicts.columns = ['movieId', 'userId', 'rating']

In [22]:
# Preview the first five predictions (head() defaults to 5).
df_predicts.head()

Unnamed: 0,movieId,userId,rating
0,148,178586,2.596558
1,148,94231,3.336398
2,148,155572,2.985319
3,148,3855,2.458445
4,463,31750,3.242199


In [23]:
# Mean predicted rating; NaN predictions are skipped (the default for
# mean()). It is used later as the fallback value for missing predictions.
avg = df_predicts['rating'].mean()
avg

3.401620592389515

In [24]:
# Attach the predictions to the test rows. The left join keeps every test
# row, in the test file's order, even when no prediction matches.
df_result = df_tests.merge(df_predicts, on=['userId', 'movieId'], how='left')

In [25]:
# Preview the first ten merged rows.
df_result.head(10)

Unnamed: 0,userId,movieId,rating
0,1,1414,3.948114
1,1,2346,3.858106
2,1,5278,3.130264
3,1,9303,3.914655
4,1,11817,4.533725
5,1,12262,3.988801
6,1,13428,3.941446
7,1,14518,3.629111
8,1,18243,3.866331
9,1,18560,3.962296


In [26]:
# Replace every missing value in the merged frame with the mean prediction.
df_result = df_result.fillna(avg)

In [27]:
# Cap predictions at 5.0; values at or below 5 are left unchanged.
df_result['rating'] = df_result['rating'].clip(upper=5.0)

In [28]:
# Write the final predictions (userId, movieId, rating) without the index.
df_result.to_csv('lab09.csv',index=False)

In [29]:
# Write the same frame again under a second file name; the arguments are
# identical to the lab09.csv export.
df_result.to_csv('lab09s.csv',index=False)

In [32]:
# Inspect the first lines of the exported file.
!head lab09.csv

userId,movieId,rating
1,1414,3.9481141567230225
1,2346,3.8581061363220215
1,5278,3.1302642822265625
1,9303,3.9146549701690674
1,11817,4.533724784851074
1,12262,3.9888010025024414
1,13428,3.941446304321289
1,14518,3.6291112899780273
1,18243,3.866330623626709


In [33]:
# Inspect the first lines of the second exported file.
!head lab09s.csv

userId,movieId,rating
1,1414,3.9481141567230225
1,2346,3.8581061363220215
1,5278,3.1302642822265625
1,9303,3.9146549701690674
1,11817,4.533724784851074
1,12262,3.9888010025024414
1,13428,3.941446304321289
1,14518,3.6291112899780273
1,18243,3.866330623626709
