# Recipe Recommender Assignment : Train and Test Split

In [None]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285387 sha256=8565aceb15a22f5b29fa3a69a83f6cc1a2422ef8019cf9968a488039da4bbc05
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


In [None]:
pip install findspark

Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1


In [None]:
import os
# Mount Google Drive at /content/drive; grant the requested permission when prompted.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Set the working directory used for accessing the dataset.

# A folder named Receipe_Recommendation_Assignment_ML is expected in 'My Drive' of the mounted Google Drive.
# That folder is the home directory for the rest of the project:
# data is read from it and saved to it.

os.chdir("/content/drive/MyDrive/Receipe_Recommendation_Assignment_ML/")
os.getcwd()

'/content/drive/MyDrive/Receipe_Recommendation_Assignment_ML'

In [None]:
from pyspark.sql import SparkSession

In [None]:
from pyspark.sql import SparkSession

# Start the Spark session used throughout this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Basics")
    .getOrCreate()
)

In [None]:
# Display the Spark session object created in the previous cell.
spark

In [None]:

from pyspark.sql import functions as F

# Import for typecasting columns
from pyspark.sql.types import IntegerType,BooleanType,DateType,FloatType,StringType
from pyspark.sql.types import ArrayType

In [None]:
# Load the ratings CSV: the first row supplies the column names and the column types are inferred.
raw_ratings = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv('/content/drive/MyDrive/Receipe_Recommendation_Assignment_ML/raw_ratings_small.csv')
)

In [None]:
# Preview the first five rating rows.
raw_ratings.show(n=5)

+-------+---------+------+--------------------+-----------+
|user_id|recipe_id|rating|              review|review_date|
+-------+---------+------+--------------------+-----------+
| 483827|   306785|     5|Being a  huge fan...| 2008-07-15|
|   6258|    20930|     5|Jan  what an inte...| 2002-07-09|
| 102602|    20930|     5|Jan  we love your...| 2003-10-26|
| 296027|   182985|     4|Very nice fresh s...| 2007-05-19|
|   9580|   208980|     5|I saw this on 30 ...| 2007-02-05|
+-------+---------+------+--------------------+-----------+
only showing top 5 rows



In [None]:
# Number of rows in the ratings dataframe.
raw_ratings.count()

93357

In [None]:
# Load the recipes CSV: the first row supplies the column names and the column types are inferred.
raw_recipes = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/content/drive/MyDrive/Receipe_Recommendation_Assignment_ML/raw_recipies_small.csv")
)

In [None]:
# Preview the first five recipe rows.
raw_recipes.show(n=5)

+--------------------+-----+-------+--------------+----------+--------------------+--------------------+-------+--------------------+--------------------+--------------------+-------------+--------------+
|                name|   id|minutes|contributor_id| submitted|                tags|           nutrition|n_steps|               steps|         description|         ingredients|n_ingredients|year_of_review|
+--------------------+-----+-------+--------------+----------+--------------------+--------------------+-------+--------------------+--------------------+--------------------+-------------+--------------+
|beat this  banana...|75452|     70|         15892|2003-11-04|['weeknight', 'ti...|[2669.3, 160.0, 9...|     12|['preheat oven to...|  from ann hodgman's|['sugar', 'unsalt...|            9|          2003|
|      chinese  candy|23933|     15|         35268|2002-03-29|['15-minutes-or-l...|[232.7, 21.0, 77....|      4|['melt butterscot...|a little differen...|['butterscotch ch...|     

In [None]:
# Number of rows in the recipes dataframe.
raw_recipes.count()

20340

In [None]:
# Sanity-check the row and column counts of both raw dataframes before going further.
assert raw_recipes.count() == 20340 and len(raw_recipes.columns) == 13
assert raw_ratings.count() == 93357 and len(raw_ratings.columns) == 5

#### Decide a split date based on the ratings dataframe.

In [None]:
# Total number of data points (rows) in the interaction dataset,
# obtained with count(); the result is an integer and is the basis
# for sizing the test split.

num_review_int = raw_ratings.count()

In [None]:
# Display the total number of reviews.
num_review_int

93357

#### Task 01 - Train Test Split

Divide the data into train and test based on the 80 - 20 split using the approach discussed. You will have to save the data in a parquet file.

In [None]:
# Hold out 20% of all reviews for testing, rounded to a whole number of rows.
test_num_reviews_int = round(0.2 * num_review_int)
test_num_reviews_int

18671

In [None]:
# Order the interactions from the newest to the oldest review date and keep
# only the first ``test_num_reviews_int`` rows, i.e. the most recent reviews.

temp_ratings = raw_ratings.orderBy(F.col("review_date").desc()).limit(test_num_reviews_int)

In [None]:
# The hold-out candidate set must contain exactly the requested number of rows.
assert temp_ratings.count() == 18671

# The rows must be ordered newest-first, so a later row can never carry a more
# recent review_date than the row before it. The check inspects temp_ratings
# (the sorted dataframe), collected once.
sorted_rows = temp_ratings.collect()
assert sorted_rows[11]["review_date"] <= sorted_rows[10]["review_date"]

In [None]:
# Review date of the last row of the newest-first selection, i.e. the oldest
# date among the most recent reviews; this is used as the split date.
temp_ratings.collect()[-1]["review_date"]

datetime.date(2011, 7, 17)

Split the data into two parts before and after 2011-07-17.

- All reviews in the ratings data dated on or after 2011-07-17 will not exist in the training set.
- For all future predictions the date will be set at 2011-07-18.   

In [None]:
# Join raw_ratings with raw_recipes, matching ratings.recipe_id to recipes.id.
# The resulting dataframe must have all rows from the raw_ratings dataframe,
# so a left join is used: a rating with no matching recipe is still kept
# (with null recipe columns), whereas an inner join would drop it.

interaction_level_df = raw_ratings.join(
    raw_recipes,
    on=raw_ratings.recipe_id == raw_recipes.id,
    how="left",
)

In [None]:
# Preview the first five joined rating/recipe rows.
interaction_level_df.show(n=5)

+-------+---------+------+--------------------+-----------+--------------------+---+-------+--------------+----------+--------------------+--------------------+-------+--------------------+-----------+--------------------+-------------+--------------+
|user_id|recipe_id|rating|              review|review_date|                name| id|minutes|contributor_id| submitted|                tags|           nutrition|n_steps|               steps|description|         ingredients|n_ingredients|year_of_review|
+-------+---------+------+--------------------+-----------+--------------------+---+-------+--------------+----------+--------------------+--------------------+-------+--------------------+-----------+--------------------+-------------+--------------+
|  14000|      360|     5|A great dish  lov...| 2002-01-17|baked zucchini fr...|360|     67|          1587|1999-08-09|['weeknight', 'ti...|[200.2, 19.0, 19....|     11|['heat oven to 40...|       null|['zucchini', 'oni...|           11|        

In [None]:
# Training data: every interaction whose review_date is strictly BEFORE
# the split date '2011-07-17'.

train_interaction_level_df = interaction_level_df.where(
    F.col("review_date") < '2011-07-17'
)

In [None]:
# Preview the first eight training rows.
train_interaction_level_df.show(n=8)

+-------+---------+------+--------------------+-----------+--------------------+---+-------+--------------+----------+--------------------+--------------------+-------+--------------------+--------------------+--------------------+-------------+--------------+
|user_id|recipe_id|rating|              review|review_date|                name| id|minutes|contributor_id| submitted|                tags|           nutrition|n_steps|               steps|         description|         ingredients|n_ingredients|year_of_review|
+-------+---------+------+--------------------+-----------+--------------------+---+-------+--------------+----------+--------------------+--------------------+-------+--------------------+--------------------+--------------------+-------------+--------------+
|  14000|      360|     5|A great dish  lov...| 2002-01-17|baked zucchini fr...|360|     67|          1587|1999-08-09|['weeknight', 'ti...|[200.2, 19.0, 19....|     11|['heat oven to 40...|                null|['zucch

In [None]:
# Test data: every interaction whose review_date is ON OR AFTER
# the split date '2011-07-17'.

test_interaction_level_all_recipies_df = interaction_level_df.where(
    F.col("review_date") >= '2011-07-17'
)

In [None]:
# Preview the first five test rows.
test_interaction_level_all_recipies_df.show(n=5)

+----------+---------+------+--------------------+-----------+--------------------+------+-------+--------------+----------+--------------------+--------------------+-------+--------------------+--------------------+--------------------+-------------+--------------+
|   user_id|recipe_id|rating|              review|review_date|                name|    id|minutes|contributor_id| submitted|                tags|           nutrition|n_steps|               steps|         description|         ingredients|n_ingredients|year_of_review|
+----------+---------+------+--------------------+-----------+--------------------+------+-------+--------------+----------+--------------------+--------------------+-------+--------------------+--------------------+--------------------+-------------+--------------+
|   1911882|   205270|     4|I make pizza in m...| 2013-05-03|bacon cheeseburge...|205270|     37|         89831|2007-01-13|['60-minutes-or-l...|[813.2, 93.0, 13....|      8|['pre-bake the pi...|the 

In [None]:
# Number of rows in the test split.
test_interaction_level_all_recipies_df.count()

18684

In [None]:
# Verify the row and column counts of the test split, then of the train split.
assert (
    test_interaction_level_all_recipies_df.count() == 18684
    and len(test_interaction_level_all_recipies_df.columns) == 18
)
assert (
    train_interaction_level_df.count() == 74673
    and len(train_interaction_level_df.columns) == 18
)

In [None]:
# create data files for modeling

# Persist both splits as parquet for the modelling step. Each dataframe is
# reduced to a single partition with coalesce(1) before writing, and any
# previous output at the same path is overwritten.
train_interaction_level_df.coalesce(1).write.parquet(
    '/content/drive/MyDrive/Receipe_Recommendation_Assignment_ML/train_interaction_level_df.parquet',
    mode='overwrite',
)

test_interaction_level_all_recipies_df.coalesce(1).write.parquet(
    '/content/drive/MyDrive/Receipe_Recommendation_Assignment_ML/test_interaction_level_df.parquet',
    mode='overwrite',
)

    
    ----------**THANK YOU**------------

