Reading dataset recipes and filtering out useless columns :

In [1]:
import pandas as pd

# Load the raw recipes table from the CSV file in the working directory.
data = pd.read_csv("recipes.csv")

# Keep only the identifier, timing, category, rating and nutrition columns
# that the later cells work with.
cols_filtered_data = data.loc[
    :,
    [
        "RecipeId", "Name", "AuthorId", "AuthorName",
        "CookTime", "PrepTime", "DatePublished", "RecipeCategory",
        "AggregatedRating", "ReviewCount",
        "Calories", "FatContent", "SaturatedFatContent",
        "CholesterolContent", "SodiumContent", "CarbohydrateContent",
        "FiberContent", "SugarContent", "ProteinContent",
        "RecipeServings",
    ],
]

Removing junk rows (index labels 1438 to 1487):

In [2]:
# Drop the rows whose index labels are 1438 through 1487
# (the end of `range` is exclusive, so 1488 itself is kept).
cols_filtered_data = cols_filtered_data.drop(labels=range(1438, 1488), axis=0)

Removing rows that have no rating assigned or that have 3 or fewer reviews:

In [3]:

# Keep only the rows that have an aggregated rating AND more than 3 reviews.
# A missing ReviewCount compares as False, so such rows are dropped as well.
cols_filtered_data = cols_filtered_data[
    cols_filtered_data["AggregatedRating"].notna()
    & (cols_filtered_data["ReviewCount"] > 3)
]

Choosing diverse rating dishes :

In [4]:
# Build a mixed-rating sample:
#   - the first 10,000 rows rated 4.5 or higher, and
#   - the first 33,000 rows rated below 4.5
#     (note: despite its name, `rating_below_4` uses the 4.5 threshold).
rating_above_45 = cols_filtered_data.loc[
    cols_filtered_data["AggregatedRating"] >= 4.5
].iloc[:10000]
rating_below_4 = cols_filtered_data.loc[
    cols_filtered_data["AggregatedRating"] < 4.5
].iloc[:33000]

# Stack the two groups row-wise, high-rated rows first.
cols_filtered_data = pd.concat([rating_above_45, rating_below_4])

Changing date-time to only have year :

In [5]:
# Keep only the first four characters of the publication date string
# (the year, assuming the value starts with YYYY).
cols_filtered_data["DatePublished"] = cols_filtered_data["DatePublished"].str.slice(0, 4)

Replacing blank (missing) entries with default values:

In [6]:


# Replace missing values with per-column defaults.
#   - Identifier / text columns (listed below) are left untouched.
#   - "RecipeServings" gets -1, a code meaning "servings not applicable"
#     (e.g. items like sauce).
#   - Every other column gets 0.
#
# The result is assigned back to the frame instead of calling
# `fillna(..., inplace=True)` on `cols_filtered_data[col]`: that form mutates
# a temporary Series (chained assignment), which warns on pandas 2.x and does
# nothing at all once Copy-on-Write is enabled.
for column_name in cols_filtered_data:
    if column_name == "RecipeServings":
        cols_filtered_data[column_name] = cols_filtered_data[column_name].fillna(-1)
    elif column_name not in ["RecipeId", "Name", "AuthorId", "AuthorName", "DatePublished", "RecipeCategory"]:
        cols_filtered_data[column_name] = cols_filtered_data[column_name].fillna(0)

Creating function for converting data like 2H2M to 122 mins :

In [7]:
def time_to_mins(strig):
    """Convert a duration such as ``"2H2M"`` into a whole number of minutes.

    The string is read as a sequence of ``<number><unit>`` tokens, where the
    unit is ``D`` (days), ``H`` (hours), ``M`` (minutes) or ``S`` (seconds).
    Numbers may have any number of digits, so ``"120M"`` and ``"100H"`` are
    handled correctly. The ISO-8601 marker letters ``P`` and ``T`` are skipped
    wherever they appear between tokens, so both ``"PT1H30M"`` and a
    prefix-stripped ``"1DT2H"`` are accepted.

    Parameters
    ----------
    strig : str or 0
        The duration text, or the number ``0`` used as the placeholder for a
        missing duration.

    Returns
    -------
    int
        Total minutes. Seconds contribute only their whole minutes
        (``"90S"`` -> 1, ``"30S"`` -> 0). An empty string yields 0.

    Raises
    ------
    TypeError
        If ``strig`` is neither a string nor equal to 0.
    ValueError
        If the string contains an unknown unit, a unit without a number, or
        a trailing number without a unit.
    """
    if not isinstance(strig, str):
        # 0 is the fill value used for missing durations.
        if strig == 0:
            return 0
        raise TypeError(f"expected a duration string or 0, got {strig!r}")

    minutes_per_unit = {"D": 24 * 60, "H": 60, "M": 1}
    total = 0
    digits = ""  # digits of the number currently being read

    for ch in strig.strip():
        if ch.isdigit():
            digits += ch
        elif ch in "PT" and not digits:
            # Period / time designators carry no value of their own.
            continue
        elif ch == "S" and digits:
            # Only whole minutes are reported.
            total += int(digits) // 60
            digits = ""
        elif ch in minutes_per_unit and digits:
            total += int(digits) * minutes_per_unit[ch]
            digits = ""
        else:
            raise ValueError(f"cannot parse duration {strig!r}")

    if digits:
        # A number with no unit after it is ambiguous; refuse to guess.
        raise ValueError(f"cannot parse duration {strig!r}")

    return total

Converting Prep time and Cook time to minutes

In [8]:
# Convert the PrepTime and CookTime columns from duration text to minutes.
#
# Each step's result is assigned back to the frame. Calling
# `fillna(..., inplace=True)` / `replace(..., inplace=True)` on
# `cols_filtered_data[col]` mutates a temporary Series (chained assignment),
# which warns on pandas 2.x and has no effect once Copy-on-Write is enabled.
for time_column in ("PrepTime", "CookTime"):
    cols_filtered_data[time_column] = (
        cols_filtered_data[time_column]
        # lstrip removes any leading run of the characters "P" and "T"
        # (e.g. "PT1H30M" -> "1H30M"); non-string entries become NaN.
        .str.lstrip("PT")
        # Object dtype lets the numeric placeholder 0 sit next to strings.
        .astype(object)
        .fillna(0)               # missing duration -> 0
        .replace("0S", 0)        # zero-seconds duration -> 0
        .apply(time_to_mins)
    )

Removing duplicate name entries :

In [9]:

# Keep only the first row for each recipe name; later rows with the same
# "Name" are discarded.
final_recipes_dataset = cols_filtered_data.drop_duplicates(subset="Name", keep="first")

Pasting processed recipe dataset into new file :

In [10]:
# cols_filtered_data.to_csv(r"C:\Users\nishi\Vs_Code_Projects\IIITD\SEM 4\SML\Project\Processed_Recipes_Dataset.csv",index=False)

Reading dataset reviews and filtering out useless columns :

In [11]:
# Load the raw reviews table (this rebinds `data`, which previously held the
# raw recipes table).
data = pd.read_csv("reviews.csv")

# Keep the review/recipe identifiers, the reviewer and the rating, and rename
# the author columns so they read as the reviewer rather than a recipe author.
# Note: despite its name, `recipes_dataset` holds review rows.
recipes_dataset = data.loc[
    :, ["ReviewId", "RecipeId", "AuthorId", "AuthorName", "Rating"]
].rename(columns={"AuthorId": "ReviewerID", "AuthorName": "ReviewerName"})

Merging datasets based on RecipeId :

In [12]:
# Inner-join the de-duplicated recipes with the reviews on "RecipeId":
# one output row per review of a recipe that is present in both tables.
final_processed_dataset = final_recipes_dataset.merge(recipes_dataset, on="RecipeId")

Pasting Final processed data into new file

In [13]:
# Write the merged table to a CSV file in the working directory, without the
# DataFrame index column.
final_processed_dataset.to_csv("Final_Processed_Dataset.csv", index=False)