In [21]:

# Part 1
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json

# Define column names for the DataFrame
columns = ['title', 'total_time', 'image', 'ingredients', 'rating_val', 'rating_count', 'category', 
           'cuisine', 'diet', 'vegan', 'vegetarian', 'url']

def collect_page_data(url):
    try:
        # Fetch the webpage content
        page = requests.get(url)
        soup = BeautifulSoup(page.text, 'html.parser')

        row_data = []

        # Extract recipe title
        heading = soup.find('h1')

        # Extract image URL
        image = soup.find('img').get('src')

        # Extract ingredients list
        ingredients = soup.find('ul', class_="ssrcss-1ynsflq-UnorderedList e1q8fsc70").text

        # Extract preparation and cooking time
        first_div = soup.find('dl', class_="ssrcss-160xqny-Wrapper e85aajs0")
        first_div_split = list(first_div.stripped_strings)
        
        total_time = "Prep Time: "
        diet = ""
        vegan = False
        vegetarian = False

        for i in range(len(first_div_split)):
            if i == 1:
                total_time += first_div_split[i] + " | Cook Time: "
            if i == 3:
                total_time += first_div_split[i]
            if i > 7:
                diet += first_div_split[i] + (", " if i + 1 != len(first_div_split) else "")

            # Check for dietary tags
            if first_div_split[i] == "Vegan":
                vegan = True
            if first_div_split[i] == "Vegetarian":
                vegetarian = True

        # Extract rating value and count
        rating_val = soup.find('span', class_='').text.strip()
        ratings_count = soup.find('span', class_='').find_next().find_next().text.strip()

        # Extract category and cuisine from JSON data
        script_tag = soup.find("script", {"type": "application/ld+json"})
        json_data = json.loads(script_tag.string)
        
        category = next((item.get("recipeCategory") for item in json_data["@graph"] if item["@type"] == "Recipe"), None)
        cuisine = next((item.get("recipeCuisine") for item in json_data["@graph"] if item["@type"] == "Recipe"), None)

        # Store extracted data
        row_data.extend([heading.text, total_time, image, ingredients, rating_val, ratings_count,
                         category, cuisine, diet, vegan, vegetarian, url])

        # Save data to a CSV file
        df = pd.DataFrame(columns=columns)
        df.loc[0] = row_data
        df.to_csv('recipe.csv')

        # Print the extracted data
        print(df.to_string())

    except:
        print("Failed to fetch data for:", url)


# Define recipe URLs to scrape
url = "https://www.bbc.co.uk/food/recipes/avocado_pasta_with_peas_31700"
url2 = "https://www.bbc.co.uk/food/recipes/easiest_ever_banana_cake_42108"
url3 = "https://www.bbc.co.uk/food/recipes/whole_chicken_and_54954"

# Collect data from each URL
collect_page_data(url)
collect_page_data(url2)
collect_page_data(url3)




                               title                                                   total_time                                                                                           image                                                                                                                                                                                                                                           ingredients rating_val rating_count     category cuisine                                                                diet  vegan  vegetarian                                                               url
0  Avocado pasta with peas and mint   Prep Time: less than 30 mins | Cook Time: less than 10 mins  https://ichef.bbci.co.uk/food/ic/food_16x9_1600/recipes/avocado_pasta_with_peas_31700_16x9.jpg  375g/13oz pasta, such as penne or fusilli1 large avocado (or 2 small) 2 garlic cloves2 tbsp coconut oil, meltedÂ½ tsp salt 1 lemon, juice and zest 6 fresh mint leaves1

In [23]:
#Part 2(Q1-Q3)
import matplotlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df = pd.read_csv("recipes.csv")
df_cleaned = df.dropna()
print(df_cleaned.describe())

top_10 = df_cleaned.sort_values(by="rating_avg", ascending=False).head(10)
print(top_10[['title', 'rating_avg', 'rating_val']])

average_ratings = df.groupby('title')['rating_avg'].mean()
top_10_recipes = average_ratings.sort_values(ascending=False).head(10)
print(top_10_recipes)
matplotlib.use('TkAgg')
# TkAgg creates the graph outside python.
bootstraps_size = 1000
sample_size = 100
bootstrap_samples = []
for i in range(bootstraps_size):
    sample = df.sample(n=sample_size, replace=True)['rating_avg']
    bootstrap_samples.append(sample.mean())
bootstrap_means = [np.mean(bootstrap_sample) for bootstrap_sample in bootstrap_samples]
confidence_interval_bootstrap = np.percentile(bootstrap_means, [2.5, 97.5])
print(f"Bootstrap Means: {bootstrap_means}")
print(f"Confidence Interval (Bootstrap): {confidence_interval_bootstrap}")
plt.figure(figsize=(10, 6))
plt.scatter(df_cleaned['rating_val'], df_cleaned['rating_avg'], alpha=0.5)
plt.xlabel('Ratings Frequency')
plt.ylabel('Average Rating')
plt.title('Ratings vs. Ratings Frequency')
plt.show()

print(df['rating_val'].describe())
# Example threshold, as the average is 12, anything under 12 should be can be considered as not significant.


#Q4
#2.4a
features=['title','rating_avg','rating_val','total_time','category','cuisine', 'ingredients']

#Creating a daraframe from the CSV file
df = pd.read_csv('recipes.csv')

# Adding the combined_features column
df['combined_features'] = df[features].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)


#2.4b
# Creating a CountVectorizer object and computing the cosine similarity matrix
#initializing the CountVectorizer
CountVectorizer1 = CountVectorizer()

#Fitting and transforming the combined_features column to create a count matrix

count_matrix = CountVectorizer1.fit_transform(df['combined_features'])

#Computing the cosine similarity matrix from the count matrix
cosine_sim_matrix = cosine_similarity(count_matrix)

# Displaying the cosine similarity matrix
print("Cosine Similarity Matrix:")
print(cosine_sim_matrix)


#2.4c

# Recommendation function with URLs
def get_recommendations(title, cosine_sim_matrix, df):
    # Ensure title matching is case-insensitive and strip extra spaces
    title = title.lower().strip()

    # Check if the title exists in the DataFrame
    if title not in df["title"].str.lower().values:
        return ["Recipe not found. Please try a different title."]

    # Get the index of the recipe that matches the title
    idx = df[df['title'].str.lower() == title].index[0]

    # Get the pairwise similarity scores of all recipes with that recipe
    sim_scores = list(enumerate(cosine_sim_matrix[idx]))

    # Sort the recipes based on similarity scores in descending order
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the indices of the top 10 most similar recipes
    sim_scores = sim_scores[1:11]  # Exclude the input recipe itself
    
    # Get the recipe indices
    recipe_indices = [i[0] for i in sim_scores]

    # Get the top 10 most similar recipes and their URLs
    recommended_titles = df["title"].iloc[recipe_indices].tolist()
    recommended_urls = df["recipe_url"].iloc[recipe_indices].tolist()

    # Zip function is used to combine the 2 above lists to a list of tuples
    return list(zip(recommended_titles, recommended_urls))

#Test
recipe_title = "Chicken and Coconut Curry"
recommendations = get_recommendations(recipe_title, cosine_sim_matrix, df)

# Print the recommendations with URLs
print(f"\nRecommendations for '{recipe_title}':")
for i, (title, url) in enumerate(recommendations, start=1):
    print(f"{i}. {title} - {url}")




        Unnamed: 0           id   rating_avg   rating_val   total_time
count  3293.000000  3293.000000  3293.000000  3293.000000  3293.000000
mean   1646.000000  1647.000000     4.497432    12.038567   127.227452
std     950.751545   950.751545     0.611602    25.221305   156.363779
min       0.000000     1.000000     1.000000     1.000000    30.000000
25%     823.000000   824.000000     4.250000     3.000000    60.000000
50%    1646.000000  1647.000000     4.666667     6.000000    90.000000
75%    2469.000000  2470.000000     5.000000    13.000000   150.000000
max    3292.000000  3293.000000     5.000000   776.000000   840.000000
                                    title  rating_avg  rating_val
1646                 Ma's macadamia salad         5.0           1
2419                              Sangria         5.0           3
842          Cranberry and grape focaccia         5.0           1
840   Cranberry and chilli brioche wreath         5.0           2
1773               Middle Easte

In [37]:
from sklearn.preprocessing import StandardScaler
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
#Part 3
#3.1
def vec_space_method(recipe, df):
    #Processing categorical and Numerical features
    categorical_columns = ["category", "cuisine", "ingredients"]

    #Numerical columns 
    numerical_columns = ["rating_avg", "total_time"]

    #Vectorize categorical data(convert to text)
    vectorizer = CountVectorizer()


    #convert categorical columns into text
    categorical_features = df[categorical_columns].apply(lambda x: " ".join(x), axis=1)
    categorical_matrix = vectorizer.fit_transform(categorical_features)

    #Normalize the Numerical Features
    #Ensure numerical columns are in appropriate format
    numerical_data = df[numerical_columns].apply(pd.to_numeric, errors="coerce")
    numerical_data = numerical_data.fillna(0)  # Handle NaN values by filling with 0 or mean

    #Scale the numerical data to ensure uniformity in magnitude
    scalar = StandardScaler()
    numerical_matrix = scalar.fit_transform(numerical_data)

    #Combine categorical and numerical
    combined_matrix = np.hstack((categorical_matrix.toarray(), numerical_matrix))

    #Compute the cosine similarity
    cosine_sim_matrix = cosine_similarity(combined_matrix)


    
    #Find the index of the recipie
    recipe = recipe.lower().strip()

    if recipe not in df["title"].str.lower().values:
        return ["Recipe not found. Please try a different title."]

    # Get the index of the recipe that matches the title
    idx = df[df["title"].str.lower() == recipe].index[0]

    #Compute Similarity Scores
    sim_scores = list(enumerate(cosine_sim_matrix[idx]))

    # Sort recipes based on similarity scores (highest similarity first)
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the top 10 most similar recipes (excluding the input recipe itself)
    sim_scores = sim_scores[1:11]  # Exclude the input recipe itself
    recipe_indices = [i[0] for i in sim_scores]

    #Retrieve the Titles and URLs of the Most Similar Recipes
    recommended_titles = df["title"].iloc[recipe_indices].tolist()
    recommended_urls = df["recipe_url"].iloc[recipe_indices].tolist()

    # Return the top 10 most similar recipes with URLs
    return list(zip(recommended_titles, recommended_urls))


recipe_title = "Chicken and Coconut Curry"
recommendations = vec_space_method(recipe_title, df)

# Print the recommendations with URLs
print(f"Recommendations for '{recipe_title}':")
for i, (title, url) in enumerate(recommendations, start=1):
    print(f"{i}. {title} - {url}")



from sklearn.neighbors import NearestNeighbors

#.3.2
def knn_similarity(recipe,df):
    #Processing categorical and Numerical features
    categorical_columns = ["category", "cuisine", "ingredients"]

    #Numerical columns 
    numerical_columns = ["rating_avg", "total_time"]

    #Vectorize categorical data(convert to text)
    vectorizer = CountVectorizer()

    #converts categorical columns into text
    categorical_features = df[categorical_columns].apply(lambda x: " ".join(x), axis=1)
    categorical_matrix = vectorizer.fit_transform(categorical_features)

    #Normalize the Numerical Features
    #Ensure numerical columns are in appropriate format
    numerical_data = df[numerical_columns].apply(pd.to_numeric, errors="coerce")
    numerical_data = numerical_data.fillna(0)  # Handle NaN values by filling with 0 or mean


    #Scales the numerical data to ensure uniformity in magnitude
    scalar = StandardScaler()
    numerical_matrix = scalar.fit_transform(numerical_data)

    #Combines categorical and numerical
    #function is used to combine categorical and numerical features into a single matrix
    combined_matrix = np.hstack((categorical_matrix.toarray(), numerical_matrix))


    #Finds the index of the recipie
    recipe = recipe.lower().strip()

    if recipe not in df["title"].str.lower().values:
        return ["Recipe not found. Please try a different title."]
    
    # Finds the index of the given recipe
    recipe_index = df[df["title"].str.lower() == recipe].index[0]

    #KNN Model

    knn_model = NearestNeighbors(metric="cosine", algorithm="auto", n_neighbors=11, n_jobs=-1)
    knn_model.fit(combined_matrix)

    dist, ind =  knn_model.kneighbors(combined_matrix[recipe_index].reshape(1, -1), n_neighbors=11)
    
    # Gets the titles of the 10 most similar recipes (excluding the input recipe itself)
    similar_recipes = [
        (df.iloc[i]["title"], dist[0][j])
        for j, i in enumerate(ind[0])
        if i != recipe_index
    ]

    # Returns the top 10 most similar recipes
    return similar_recipes[:10]

#test
recipe_title = "Anchovy and sage crisps"
recommendations = knn_similarity(recipe_title, df)

# Prints the recommendations
print(f"\nRecommendations for '{recipe_title}':")
for i, (title, distance) in enumerate(recommendations, start=1):
    print(f"{i}. {title} (Similarity: {1 - distance:.2f})")

#3.3

u1_recipes = []
u2_recipes = []
u3_recipes = []
u4_recipes = []

#Test
recipe_title = "Chicken tikka masala"
recommendations = knn_similarity(recipe_title, df)
for i in recommendations:
    u1_recipes.append(i[0])

# Print the recommendations
print(f"\nKNN - Recommendations for '{recipe_title}':")
for i, (title) in enumerate(recommendations, start=1):
    print(f"{title[0]}")

#Test
recipe_title = "Albanian baked lamb with rice"
recommendations = knn_similarity(recipe_title, df)
for i in recommendations:
    u2_recipes.append(i[0])

# Print the recommendations
print(f"\nKNN - Recommendations for '{recipe_title}':")
for i, (title) in enumerate(recommendations, start=1):
    print(f"{title}")


#Test
recipe_title = "Baked salmon with chorizo rice"
recommendations = knn_similarity(recipe_title, df)
for i in recommendations:
    u3_recipes.append(i[0])

# Print the recommendations
print(f"\nKNN - Recommendations for '{recipe_title}':")
for i, (title) in enumerate(recommendations, start=1):
    print(f"{title[0]}")

#Test
recipe_title = "Almond lentil stew"
recommendations = knn_similarity(recipe_title, df)
for i in recommendations:
    u4_recipes.append(i[0])

# Print the recommendations
print(f"\nKNN - Recommendations for '{recipe_title}':")
for i, (title) in enumerate(recommendations, start=1):
    print(f"{title[0]}")
  
recipes = []
with open('recipes.csv', 'r') as read_obj: 
    csv_reader = csv.reader(read_obj) 
  
    # convert string to list 
    list_of_csv = list(csv_reader) 
  
    for i in list_of_csv:
        recipes.append(i[2])

recipes.pop(0)
print(recipes)

U1 = [1 if recipe in u1_recipes else 0 for recipe in recipes]
U2 = [1 if recipe in u2_recipes else 0 for recipe in recipes]
U3 = [1 if recipe in u3_recipes else 0 for recipe in recipes]
U4 = [1 if recipe in u4_recipes else 0 for recipe in recipes]
cosine_similarity_matrix = cosine_similarity([U1, U2, U3, U4])
print(cosine_similarity_matrix)

num_users = 4
total_similarity = 0
pairwise_count = 0

for i in range(num_users):
    for j in range(i + 1, num_users):
        total_similarity += cosine_similarity_matrix[i][j]
        pairwise_count += 1

# Compute average pairwise similarity
average_similarity = total_similarity / pairwise_count

print(f"Average Pairwise Similarity: {average_similarity:.4f}")

# Evaluation:
# KNN:
# Using the test set provided with about 500 recommendations for each test, only 1501 recommendations are provided as “Albanian baked lamb with rice” generated no recommendations. 1103 of the recommended items were unique leading to a coverage of – (1102/3293) * 100 = 33.5% to 3 significant figures. This low value indicates that there is low coverage meaning that many recipes are not recommended, suggesting that the model is narrow or biased towards certain recipes. 

# In terms of personalization of the knn algorithm, using the same test cases, an average pairwise similarity value of 0.153 (to 3 significant figures) was obtained. This extremely low average similarity value indicates that the algorithm provides quite a high personalization for each user. This was done using 500 recommendations per user. However, when generating 3000 recommendations per user, a higher average similarity value of 0.458 was obtained. This value is still relatively low, so the algorithm is still quite personalized. However, judging from these changes in values, it appears as though; the higher the number of recommendations, the less personalized the algorithm will become.

# Vector Space Method:
# Using the same test set provided, I end up getting the exact same coverage of – (1102/3293) * 100 = 33.5% to 3 significant figures. No recommendations were provided for “Albanian baked lamb with rice” in this system as well. Additionally, the same number of unique results were provided. Not only that, but the same number of recommendations (including duplicates) were produced. Following this result, I decided to increase the number of recommendations (750 per user) to get a broader range of results, and I ended up with (1399/3293) * 100 = 42.5%. This indicates an increase in coverage as the number of recommendations increases, meaning there is better diversity, and a larger portion of the recipes are recommended. 

# Once again, similarly to coverage, the same values as the knn algorithm were obtained for personalization. Using 500 recommendations per user, an average similarity value of 0.153 (to 3 s.f.) was obtained and 0.458 for 3000 recommendations per user. Same conclusion as knn algorithm. Vector space algorithm will most likely become less personalized as the number of recommendations increases. 


#3.4

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import requests as r
import pandas as pd
from pandas import DataFrame
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer


#Creating a dataframe from the CSV file
df = pd.read_csv('recipes.csv')

#Changing values of column
df["rating_avg"] = np.where(df['rating_avg'] > 4.2, 1, 0)

features=['title','rating_avg','rating_val','total_time','category','cuisine', 'ingredients']

x = df[features]
y = df["rating_avg"]

# Combine features
df["combined_features"] = df[["category", "cuisine", "ingredients"]].apply(lambda x: " ".join(x), axis=1)

#Vectorize data
vectorizer = CountVectorizer(max_features=5000)
x_cat = vectorizer.fit_transform(df['combined_features']).toarray()

# Scale numerical features
scaler = StandardScaler()
x_num = scaler.fit_transform(df[["rating_avg", "total_time"]])

# Combine numerical and categorical features
x_combined = np.hstack((x_num, x_cat))

# Splitting into training and test sets
x_train, x_test, y_train, y_test = train_test_split(x_combined, y, test_size=0.25, random_state=42, stratify=y)

# Creating and training prediction model
model = KNeighborsClassifier(n_neighbors=5)
model.fit(x_train, y_train)

# Form predictions
y_pred = model.predict(x_test)

# Model evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Evaluation (Accuracy):
# Using the knn algorithm and sklearn’s metric accuracy score, I ended up with an accuracy value of 0.993 to 3 significant figures. This value indicates an almost 100% correct prediction ratio which means that out all the recommendations, practically all of them are correct recommendations.




Recommendations for 'Chicken and Coconut Curry':
1. Vegan blackeye bean curry - https://www.bbc.co.uk/food/recipes/aromaticblackeyebean_73019
2. Indian fish curry - https://www.bbc.co.uk/food/recipes/greencoconutfishcurr_86736
3. Indian-spiced lamb shoulder with Bombay potatoes - https://www.bbc.co.uk/food/recipes/indian-spiced_lamb_48529
4. Lamb madras with bombay potatoes - https://www.bbc.co.uk/food/recipes/lamb_madras_with_bombay_09160
5. Jerk chicken thighs - https://www.bbc.co.uk/food/recipes/spicyjerkchickenthig_89120
6. Roast chicken thighs with lentils and mint yoghurt - https://www.bbc.co.uk/food/recipes/roast_spiced_chicken_75133
7. Softened sweet onion and crisp fried fish - https://www.bbc.co.uk/food/recipes/softened_sweet_onion_and_80481
8. Chicken jalfrezi - https://www.bbc.co.uk/food/recipes/chickenjalfrezi_91772
9. Dry curry of cabbage, carrot and coconut (Thoran)  - https://www.bbc.co.uk/food/recipes/dry_curry_of_cabbage_71527
10. Indian chicken stew - https://www.bbc

UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 5983: character maps to <undefined>