# BDV - Recipe analysis

## Imports

In [198]:
from pyspark.sql.functions import round, avg, count, mean, explode, split, col, when
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType

spark = SparkSession.builder \
    .master('local[*]') \
    .appName('recipe-analysis') \
    .getOrCreate()
    
spark

## Transform and Load Data

In [200]:
def convert_time(time_column):
    splitted = split(time_column, " ")
    return when(splitted.getItem(1) == "m", splitted.getItem(0))\
        .when((splitted.getItem(1) == "h") & (splitted.getItem(3) == "m"), splitted.getItem(0) * 60 + splitted.getItem(2))\
        .when(splitted.getItem(1) == "h", splitted.getItem(0) * 60)\
        .when((splitted.getItem(1) == "d") & (splitted.getItem(3) == "h") & (splitted.getItem(5) == "m"), splitted.getItem(0) * 24 * 60 + splitted.getItem(2) * 60 + splitted.getItem(4))\
        .when((splitted.getItem(1) == "d") & (splitted.getItem(3) == "h"), splitted.getItem(0) * 24 * 60 + splitted.getItem(2) * 60)\
        .when((splitted.getItem(1) == "d") & (splitted.getItem(3) == "m"), splitted.getItem(0) * 24 * 60 + splitted.getItem(2))\
        .when(splitted.getItem(1) == "d", splitted.getItem(0) * 24 * 60)\
        .otherwise(0)

recipes = spark\
    .read\
    .option("delimiter", ";")\
    .csv("./data/clean_recipes.csv", header=True, inferSchema=True)\
    .withColumnRenamed("Recipe Name", "RecipeName")\
    .withColumnRenamed("Review Count", "ReviewCount")\
    .withColumnRenamed("Recipe Photo", "RecipePhoto")\
    .withColumnRenamed("Prepare Time", "PrepareTime")\
    .withColumnRenamed("Cook Time", "CookTime")\
    .withColumnRenamed("Total Time", "TotalTime")\
    .withColumn("TotalTimeConverted", convert_time(col("TotalTime")).cast(IntegerType()))

reviews = spark\
    .read\
    .option("delimiter", ",")\
    .csv("./data/clean_reviews.csv", header=True, inferSchema=True)

In [201]:
recipes.limit(5).toPandas()

Unnamed: 0,RecipeName,ReviewCount,RecipePhoto,Author,PrepareTime,CookTime,TotalTime,Ingredients,Directions,RecipeID,TotalTimeConverted
0,Golden Crescent Rolls Recipe,304,https://images.media-allrecipes.com/userphotos...,Mike A.,25 m,15 m,3 h 10 m,"yeast,water,white sugar,salt,egg,butter,flour,...","Dissolve yeast in warm water.**Stir in sugar, ...",7000,190
1,Poppy Seed Bread with Glaze Recipe,137,https://images.media-allrecipes.com/userphotos...,Christina Jun,15 m,1 h,1 h 20 m,"flour,salt,baking powder,poppy,butter,vegetabl...",'Preheat oven to 350 degrees F (175 degrees C)...,7001,80
2,Applesauce Bread I Recipe,124,https://images.media-allrecipes.com/userphotos...,GAF55,10 m,1 h 20 m,1 h 30 m,"flour,egg,white sugar,vegetable oil,applesauce...",Preheat oven to 350 degrees F (175 degrees C)....,7003,90
3,Apple Raisin Bread Recipe,39,https://images.media-allrecipes.com/userphotos...,Helen Hanson,15 m,1 h,1 h 15 m,"flour,baking powder,baking soda,salt,cinnamon,...",Preheat oven to 350 degrees F (175 degrees C)....,7006,75
4,Buttermilk Oatmeal Bread Recipe,41,https://images.media-allrecipes.com/userphotos...,Helen Hanson,10 m,1 h,1 h 40 m,"oat,buttermilk,vegetable oil,egg,brown sugar,f...",Mix oats with buttermilk. Let stand for 1/2 h...,7007,100


In [57]:
reviews.limit(5).toPandas()

Unnamed: 0,RecipeID,profileID,Rate
0,7000,675719,5.0
1,7000,1478626,5.0
2,7000,608663,5.0
3,7000,2785736,5.0
4,7000,594474,5.0


## Hard Facts

In [58]:
ingredients = recipes.select(explode(split(col("Ingredients"), ",")).alias("Ingredient"), "RecipeID")
best_recipes = reviews\
    .groupBy("RecipeID")\
    .agg(mean("Rate"), count("Rate"))\
    .withColumnRenamed("avg(Rate)", "AvgRate")\
    .withColumnRenamed("count(Rate)", "CountRate")\
    .orderBy("avg(Rate)", ascending=[0])

print(f"recipes.count(): {recipes.count()}")
print(f"reviews.count(): {reviews.count()}")
print(f"ingredients.count(): {ingredients.count()}")
print(f"best_recipes.limit(10):"); best_recipes.limit(10).toPandas()

recipes.count(): 12351
reviews.count(): 1563566
ingredients.count(): 102517
best_recipes.limit(10):


Unnamed: 0,RecipeID,AvgRate,CountRate
0,24311,5.0,1
1,20441,5.0,30
2,9263,5.0,30
3,13603,4.966667,30
4,10819,4.965517,29
5,15297,4.964286,28
6,19207,4.933333,30
7,9153,4.933333,30
8,24221,4.933333,30
9,13481,4.933333,30


## Research Questions
### 1. Welche Zutat wird am häufigsten verwendet?

In [59]:
ingredients.limit(5).toPandas()

Unnamed: 0,Ingredient,RecipeID
0,yeast,7000
1,water,7000
2,white sugar,7000
3,salt,7000
4,egg,7000


In [60]:
ingredients\
    .groupBy("Ingredient")\
    .count()\
    .orderBy("count", ascending=[0])\
    .withColumnRenamed("count", "Count")\
    .limit(10)\
    .toPandas()

Unnamed: 0,Ingredient,Count
0,white sugar,4986
1,egg,4880
2,salt,4516
3,butter,4456
4,flour,4158
5,vanilla,3111
6,milk,2767
7,onion,2763
8,water,2606
9,garlic,2116


### 2. Welche Zutat wird am häufigsten in den 10 bestbewerteten Rezepten verwendet?

In [112]:
best_ten_recipes = best_recipes\
    .join(recipes, on="RecipeID", how="inner")\
    .orderBy("AvgRate", ascending=[0])\
    .limit(10)

best_ten_recipes.toPandas()

Unnamed: 0,RecipeID,AvgRate,CountRate,RecipeName,ReviewCount,RecipePhoto,Author,PrepareTime,CookTime,TotalTime,Ingredients,Directions
0,9263,5.0,30,Best Boiled Fruitcake Recipe,50,https://images.media-allrecipes.com/userphotos...,Mark Richards,X,X,X,"fruit,cherry,citrus,walnut,butter,spice,baking...",Preheat oven to 325 degrees F (160 degrees C)....
1,24311,5.0,1,Easy Cinnamon Fudge Recipe,57,https://images.media-allrecipes.com/userphotos...,HANBAN,10 m,10 m,1 h 20 m,"sugar,cocoa powder,cinnamon,butter,milk,vanill...",'Line an 8x8 inch baking pan with aluminum foi...
2,10819,4.965517,29,Ranger Cookies II Recipe,34,https://images.media-allrecipes.com/userphotos...,Rhonda,X,X,X,"shortening,white sugar,brown sugar,egg,vanilla...",Cream shortening with sugars. Beat in egg and...
3,15297,4.964286,28,Special Butterscotch Chip Cookies in a Jar Rec...,50,https://images.media-allrecipes.com/userphotos...,Sally Jo,X,X,X,"flour,baking soda,salt,baking powder,brown sug...",Layer the ingredients in the order given in a ...
4,14305,4.933333,30,Broccoli Salad II Recipe,51,https://images.media-allrecipes.com/userphotos...,Lori J. Sikes,10 m,15 m,25 m,"broccoli,bacon,raisin,mayonnaise,white sugar,w...","Place bacon in a large, deep skillet. Cook ove..."
5,24221,4.933333,30,Sex on the Beach III Recipe,48,https://images.media-allrecipes.com/userphotos...,ANG242424,1 m,X,1 m,"vodka,peach,orange juice,cranberry","Fill a highball glass with ice, pour in vodka ..."
6,19207,4.933333,30,Sandies Recipe,56,https://images.media-allrecipes.com/userphotos...,Jaimie,4 h 15 m,20 m,4 h 35 m,"butter,white sugar,water,vanilla,flour,pecan,s...","'In a medium bowl, cream together the butter a..."
7,7286,4.933333,30,Creme de Menthe Cake I Recipe,49,https://images.media-allrecipes.com/userphotos...,Bonnie,X,X,X,"cake,liqueur,chocolate,topping thawed,liqueur","Prepare 1 box white cake mix as directed, exce..."
8,13481,4.933333,30,Cranberry Salad IV Recipe,37,https://images.media-allrecipes.com/userphotos...,Lenna,5 m,X,3 h 5 m,"cranberry,white sugar,whipping cream,marshmall...","In a small bowl, combine the ground cranberrie..."
9,8422,4.933333,30,Apple Cake in a Jar Recipe,39,https://images.media-allrecipes.com/userphotos...,Julie Wayment,X,X,X,"shortening,white sugar,egg,cinnamon,nutmeg,sal...","""'Preheat oven to 325 degrees F (165 degrees C..."


In [120]:
best_ten_recipes\
    .join(ingredients, on="RecipeID", how="inner")\
    .groupBy("Ingredient")\
    .count()\
    .withColumnRenamed("count", "Count")\
    .orderBy("Count", ascending=[0])\
    .limit(1)\
    .toPandas()

Unnamed: 0,Ingredient,Count
0,white sugar,7


### 3. Wie lange ist die durchschnittliche Zubereitungszeit für ein Rezept?

In [202]:
average_time = recipes\
    .agg(mean("TotalTimeConverted"))\
    .withColumnRenamed("avg(TotalTimeConverted)", "AvgTotalTime")\
    .select("AvgTotalTime")\
    .first()[0]

print(f"average_time(format MMM): {int(average_time)}m")
print(f"average_time(format HH:MM): {'{:02d}h {:02d}m'.format(*divmod(int(average_time), 60))}")

average_time(format MMM): 100m
average_time(format HH:MM): 01h 40m


### 4. Wie gut sind die 10 Rezepte mit der längsten Zubereitungszeit bewertet?

In [218]:
ten_longest_recipes = recipes\
    .orderBy("TotalTimeConverted", ascending=[0])\
    .limit(10)

ten_longest_recipes.toPandas()

Unnamed: 0,RecipeName,ReviewCount,RecipePhoto,Author,PrepareTime,CookTime,TotalTime,Ingredients,Directions,RecipeID,TotalTimeConverted
0,Noel Fruitcake Recipe,2,https://images.media-allrecipes.com/userphotos...,JJOHN32,30 m,3 h 30 m,30 d 4 h,"flour,baking powder,salt,cinnamon,clove,raisin...",Preheat oven to 300 degrees F (150 degrees C)....,8470,43440
1,Thirty Day Friendship Cake Recipe,60,https://images.media-allrecipes.com/userphotos...,Yvonne,1 h,1 h,30 d 2 h,"fruit,peach,white sugar,pineapple,white sugar,...","'Day One: In a large glass jar or bowl, combin...",7900,43320
2,Pickled Garlic Recipe,28,https://images.media-allrecipes.com/userphotos...,Brenda Kraneveldt,45 m,15 m,21 d 1 h,"garlic,bell pepper,vinegar,white sugar,mustard...","Place garlic cloves in a medium bowl, first cu...",25544,30300
3,Homemade Vanilla Recipe,33,https://images.media-allrecipes.com/userphotos...,GINGER P,5 m,X,20 d 20 h,"vodka,vanilla",Submerge vanilla beans in vodka and store in a...,19810,30000
4,Rock Candy Recipe,40,https://images.media-allrecipes.com/userphotos...,Amanda Rader,20 m,X,10 d 10 h,"water,white sugar","""Prepare your candy growing area by thoroughly...",24056,15000
5,Pickled Eggs II Recipe,62,https://images.media-allrecipes.com/userphotos...,Rayna Jordan,15 m,15 m,7 d 23 h 40 m,"egg,vinegar,water,spice,garlic,bay leaf",Place eggs in a medium saucepan and cover with...,22699,11500
6,Tasso Ham Recipe,12,https://images.media-allrecipes.com/userphotos...,DJFoodie,30 m,12 h,7 d 12 h 30 m,"sugar,white sugar,pepper,onion,garlic,water,ro...","To Make the Brine: In a medium bowl, combine c...",20231,10830
7,Corned Beef and Cabbage Recipe,34,https://images.media-allrecipes.com/userphotos...,Ron Brobst,20 m,3 h 30 m,7 d 3 h 30 m,"kosher,water,brisket,bay leaf,black pepper,oni...","In a large stainless steel or cast iron pot, c...",17297,10290
8,Old Time Mincemeat Pie Recipe,11,https://images.media-allrecipes.com/userphotos...,ONEMINA,45 m,40 m,7 d 1 h 25 m,"steak,apple,apple,white sugar,currant,raisin,f...","In a Dutch oven, combine beef and apple cider....",22621,10165
9,Unbaked Fruit Cake Recipe,7,https://images.media-allrecipes.com/images/795...,Paula Seymour,30 m,X,7 d 30 m,"milk,raisin,almond,marshmallow,pineapple,cinna...",Grease one 9x13 inch pan.**Combine all the ing...,9283,10110


In [219]:
ten_longest_recipes\
    .join(best_recipes, on="RecipeID", how="left")\
    .select("RecipeID", "RecipeName", "AvgRate", "CountRate", "TotalTime", "TotalTimeConverted")\
    .orderBy("AvgRate", ascending=[0])\
    .toPandas()

Unnamed: 0,RecipeID,RecipeName,AvgRate,CountRate,TotalTime,TotalTimeConverted
0,7900,Thirty Day Friendship Cake Recipe,4.525424,59.0,30 d 2 h,43320
1,19810,Homemade Vanilla Recipe,4.3,30.0,20 d 20 h,30000
2,17297,Corned Beef and Cabbage Recipe,4.166667,30.0,7 d 3 h 30 m,10290
3,22699,Pickled Eggs II Recipe,4.133333,60.0,7 d 23 h 40 m,11500
4,24056,Rock Candy Recipe,2.833333,30.0,10 d 10 h,15000
5,22621,Old Time Mincemeat Pie Recipe,,,7 d 1 h 25 m,10165
6,20231,Tasso Ham Recipe,,,7 d 12 h 30 m,10830
7,8470,Noel Fruitcake Recipe,,,30 d 4 h,43440
8,7913,Unbaked Fruit Cake Recipe,,,7 d 30 m,10110
9,25544,Pickled Garlic Recipe,,,21 d 1 h,30300


### 5. Können wir anhand der Zutaten, der Zubereitungsschritte und der Zubereitungszeit vorhersagen wie gut die Bewertung ausfallen wird?