In [None]:
# BDV

In [None]:
## Imports

In [1]:
from pyspark.sql.functions import round, avg, count, mean, explode, split, col
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .master('local[*]') \
    .appName('RDD Exercise') \
    .getOrCreate()
    
spark

## Transform and Load Data

In [2]:
recipes = spark.read.option("delimiter", ";").csv("./data/clean_recipes.csv", header=True, inferSchema=True)
reviews = spark.read.option("delimiter", ",").csv("./data/clean_reviews.csv", header=True, inferSchema=True)

In [3]:
recipes.limit(5).toPandas()

Unnamed: 0,Recipe Name,Review Count,Recipe Photo,Author,Prepare Time,Cook Time,Total Time,Ingredients,Directions,RecipeID
0,Golden Crescent Rolls Recipe,304,https://images.media-allrecipes.com/userphotos...,Mike A.,25 m,15 m,3 h 10 m,"yeast,water,white sugar,salt,egg,butter,flour,...","Dissolve yeast in warm water.**Stir in sugar, ...",7000
1,Poppy Seed Bread with Glaze Recipe,137,https://images.media-allrecipes.com/userphotos...,Christina Jun,15 m,1 h,1 h 20 m,"flour,salt,baking powder,poppy,butter,vegetabl...",'Preheat oven to 350 degrees F (175 degrees C)...,7001
2,Applesauce Bread I Recipe,124,https://images.media-allrecipes.com/userphotos...,GAF55,10 m,1 h 20 m,1 h 30 m,"flour,egg,white sugar,vegetable oil,applesauce...",Preheat oven to 350 degrees F (175 degrees C)....,7003
3,Apple Raisin Bread Recipe,39,https://images.media-allrecipes.com/userphotos...,Helen Hanson,15 m,1 h,1 h 15 m,"flour,baking powder,baking soda,salt,cinnamon,...",Preheat oven to 350 degrees F (175 degrees C)....,7006
4,Buttermilk Oatmeal Bread Recipe,41,https://images.media-allrecipes.com/userphotos...,Helen Hanson,10 m,1 h,1 h 40 m,"oat,buttermilk,vegetable oil,egg,brown sugar,f...",Mix oats with buttermilk. Let stand for 1/2 h...,7007


In [4]:
reviews.limit(5).toPandas()

Unnamed: 0,RecipeID,profileID,Rate
0,7000,675719,5.0
1,7000,1478626,5.0
2,7000,608663,5.0
3,7000,2785736,5.0
4,7000,594474,5.0


## Hard Facts

In [5]:
ingredients = recipes.select(explode(split(col("Ingredients"), ",")).alias("Ingredient"), "RecipeID")
best_recipes = reviews\
    .groupBy("RecipeID")\
    .agg(mean("Rate"), count("Rate"))\
    .withColumnRenamed("avg(Rate)", "AvgRate")\
    .withColumnRenamed("count(Rate)", "CountRate")\
    .orderBy("avg(Rate)", ascending=[0])

print(f"recipes.count(): {recipes.count()}")
print(f"reviews.count(): {reviews.count()}")
print(f"ingredients.count(): {ingredients.count()}")
print(f"best_recipes.limit(10):"); best_recipes.limit(10).toPandas()

recipes.count(): 12351
reviews.count(): 1563566
ingredients.count(): 102517
best_recipes.limit(10):


Unnamed: 0,RecipeID,AvgRate,CountRate
0,24311,5.0,1
1,20441,5.0,30
2,9263,5.0,30
3,13603,4.966667,30
4,10819,4.965517,29
5,15297,4.964286,28
6,19207,4.933333,30
7,9153,4.933333,30
8,24221,4.933333,30
9,13481,4.933333,30


## Research Questions
### 1. Welche Zutat wird am häufigsten verwendet?

In [6]:
ingredients.limit(5).toPandas()

Unnamed: 0,Ingredient,RecipeID
0,yeast,7000
1,water,7000
2,white sugar,7000
3,salt,7000
4,egg,7000


In [7]:
ingredients\
    .groupBy("Ingredient")\
    .count()\
    .orderBy("count", ascending=[0])\
    .withColumnRenamed("count", "Count")\
    .limit(10)\
    .toPandas()

Unnamed: 0,Ingredient,Count
0,white sugar,4986
1,egg,4880
2,salt,4516
3,butter,4456
4,flour,4158
5,vanilla,3111
6,milk,2767
7,onion,2763
8,water,2606
9,garlic,2116


### 2. Welche Zutat wird am häufigsten in den 10 bestbewerteten Rezepten verwendet?

In [8]:
best_ten_recipes = best_recipes\
    .join(recipes, on="RecipeID", how="inner")\
    .orderBy("AvgRate", ascending=[0])\
    .limit(10)

best_ten_recipes.toPandas()

Unnamed: 0,RecipeID,AvgRate,CountRate,Recipe Name,Review Count,Recipe Photo,Author,Prepare Time,Cook Time,Total Time,Ingredients,Directions
0,9263,5.0,30,Best Boiled Fruitcake Recipe,50,https://images.media-allrecipes.com/userphotos...,Mark Richards,X,X,X,"fruit,cherry,citrus,walnut,butter,spice,baking...",Preheat oven to 325 degrees F (160 degrees C)....
1,24311,5.0,1,Easy Cinnamon Fudge Recipe,57,https://images.media-allrecipes.com/userphotos...,HANBAN,10 m,10 m,1 h 20 m,"sugar,cocoa powder,cinnamon,butter,milk,vanill...",'Line an 8x8 inch baking pan with aluminum foi...
2,10819,4.965517,29,Ranger Cookies II Recipe,34,https://images.media-allrecipes.com/userphotos...,Rhonda,X,X,X,"shortening,white sugar,brown sugar,egg,vanilla...",Cream shortening with sugars. Beat in egg and...
3,15297,4.964286,28,Special Butterscotch Chip Cookies in a Jar Rec...,50,https://images.media-allrecipes.com/userphotos...,Sally Jo,X,X,X,"flour,baking soda,salt,baking powder,brown sug...",Layer the ingredients in the order given in a ...
4,14305,4.933333,30,Broccoli Salad II Recipe,51,https://images.media-allrecipes.com/userphotos...,Lori J. Sikes,10 m,15 m,25 m,"broccoli,bacon,raisin,mayonnaise,white sugar,w...","Place bacon in a large, deep skillet. Cook ove..."
5,24221,4.933333,30,Sex on the Beach III Recipe,48,https://images.media-allrecipes.com/userphotos...,ANG242424,1 m,X,1 m,"vodka,peach,orange juice,cranberry","Fill a highball glass with ice, pour in vodka ..."
6,19207,4.933333,30,Sandies Recipe,56,https://images.media-allrecipes.com/userphotos...,Jaimie,4 h 15 m,20 m,4 h 35 m,"butter,white sugar,water,vanilla,flour,pecan,s...","'In a medium bowl, cream together the butter a..."
7,7286,4.933333,30,Creme de Menthe Cake I Recipe,49,https://images.media-allrecipes.com/userphotos...,Bonnie,X,X,X,"cake,liqueur,chocolate,topping thawed,liqueur","Prepare 1 box white cake mix as directed, exce..."
8,13481,4.933333,30,Cranberry Salad IV Recipe,37,https://images.media-allrecipes.com/userphotos...,Lenna,5 m,X,3 h 5 m,"cranberry,white sugar,whipping cream,marshmall...","In a small bowl, combine the ground cranberrie..."
9,8422,4.933333,30,Apple Cake in a Jar Recipe,39,https://images.media-allrecipes.com/userphotos...,Julie Wayment,X,X,X,"shortening,white sugar,egg,cinnamon,nutmeg,sal...","""'Preheat oven to 325 degrees F (165 degrees C..."


In [9]:
best_ten_recipes\
    .join(ingredients, on="RecipeID", how="inner")\
    .groupBy("Ingredient")\
    .count()\
    .orderBy("count", ascending=[0])\
    .withColumnRenamed("count", "Count")\
    .limit(5)\
    .toPandas()

Unnamed: 0,Ingredient,Count
0,white sugar,7
1,flour,5
2,butter,4
3,baking soda,4
4,vanilla,3


### 3. Wie lange ist die durchschnittliche Zubereitungszeit für ein Rezept?

### 4. Wie gut sind die Rezepte mit der längsten Zubereitungszeit bewertet?

### 5. Können wir anhand der Zutaten, der Zubereitungsschritte und der Zubereitungszeit vorhersagen wie gut die Bewertung ausfallen wird?