Filter recipes list for those that contain an image

In [1]:
# Install libraries locally
!pip3 install pandas
!pip3 install -U scikit-learn scipy matplotlib
!pip3 install scikit-learn
!pip3 install apyori
!pip3 install requests
!pip3 install bs4

Collecting scikit-learn
  Using cached scikit_learn-1.4.1.post1-cp39-cp39-macosx_10_9_x86_64.whl (11.6 MB)
Collecting scipy
  Downloading scipy-1.12.0-cp39-cp39-macosx_10_9_x86_64.whl (38.9 MB)
[K     |████████████████████████████████| 38.9 MB 76 kB/s eta 0:00:0101
Collecting matplotlib
  Downloading matplotlib-3.8.3-cp39-cp39-macosx_10_12_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 24.0 MB/s eta 0:00:01
Installing collected packages: scipy, scikit-learn, matplotlib
  Attempting uninstall: scipy
    Found existing installation: scipy 1.11.4
    Uninstalling scipy-1.11.4:
      Successfully uninstalled scipy-1.11.4
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.4.0
    Uninstalling scikit-learn-1.4.0:
      Successfully uninstalled scikit-learn-1.4.0
  Attempting uninstall: matplotlib
    Found existing installation: matplotlib 3.8.2
    Uninstalling matplotlib-3.8.2:
      Successfully uninstalled matplotlib-3.8.2
Success

In [2]:
import pandas as pd
import pickle as pkl

In [3]:
# Load the raw recipe table. original_df is kept untouched; df is a working
# copy in which missing descriptions are replaced by empty strings.
original_df = pd.read_csv("./RAW_recipes.csv")
df = original_df.assign(description=original_df['description'].fillna(''))

In [4]:
# Summarize the working frame: column names, non-null counts, dtypes and memory use
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231637 entries, 0 to 231636
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   name            231636 non-null  object
 1   id              231637 non-null  int64 
 2   minutes         231637 non-null  int64 
 3   contributor_id  231637 non-null  int64 
 4   submitted       231637 non-null  object
 5   tags            231637 non-null  object
 6   nutrition       231637 non-null  object
 7   n_steps         231637 non-null  int64 
 8   steps           231637 non-null  object
 9   description     231637 non-null  object
 10  ingredients     231637 non-null  object
 11  n_ingredients   231637 non-null  int64 
dtypes: int64(5), object(7)
memory usage: 21.2+ MB


In [5]:
# Sort by submission date (newest first) and keep only the 10,000 most recent
# recipes for speed of model creation. 'submitted' holds YYYY-MM-DD strings, so
# sorting the text is the same as sorting chronologically. Rows are renumbered
# 0..9999 so that later cells can address them by position.
df = (
    df.sort_values('submitted', ascending=False)
    .iloc[:10000]
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,mini buffalo chicken cheesesteaks,537716,40,2001975627,2018-12-04,"['60-minutes-or-less', 'time-to-make', 'course...","[407.9, 34.0, 21.0, 49.0, 28.0, 64.0, 12.0]",12,['heat 1 tablespoon of olive oil in a skillet ...,these party-style chicken cheesesteaks are fla...,"['olive oil', 'green bell pepper', 'yellow oni...",13
1,nutcracker peppermint red velvet cake pops,537671,135,2002198506,2018-11-28,"['time-to-make', 'course', 'preparation', 'occ...","[207.9, 12.0, 93.0, 10.0, 6.0, 8.0, 10.0]",54,"['before you begin , you will need to gather t...",rich red velvet cake combines with cool pepper...,"[""devil's food cake mix"", 'eggs', 'buttermilk'...",12
2,moist gingerbread cake,537543,55,2001201872,2018-11-16,"['60-minutes-or-less', 'time-to-make', 'course...","[1617.0, 104.0, 213.0, 8.0, 40.0, 203.0, 80.0]",8,"['preheat the oven to 350&deg', 'f grease a lo...",a slightly sticky loaf cake flavoured with gin...,"['unsalted butter', 'applesauce', 'egg', 'unsu...",10
3,5 ingredient salted caramel crumble bars,537485,45,2000378667,2018-11-12,"['60-minutes-or-less', 'time-to-make', 'course...","[52.8, 3.0, 0.0, 4.0, 1.0, 1.0, 2.0]",21,"['1', 'heat oven to 350f spray 8-inch square p...",delicious,"['pillsbury sugar cookie dough', 'caramel topp...",5
4,bailey s chocotini,537459,10,400708,2018-11-10,"['15-minutes-or-less', 'time-to-make', 'course...","[220.7, 15.0, 49.0, 2.0, 3.0, 30.0, 4.0]",5,"['to layer: add chocolate liqueur to glass', '...",a recipe that recipe complements the cocoa in ...,"['baileys irish cream', 'chocolate liqueur', '...",3


Make sure to filter out images with 'recipe-default-images' in URL (means no image was uploaded)

In [15]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from requests.exceptions import Timeout
from PIL import Image as PILImage
from io import BytesIO

# Build the food.com page address of the first recipe in df as a sample URL:
# the recipe name with spaces turned into hyphens, followed by the recipe id.
first_recipe = df.iloc[0]
name = first_recipe['name'].replace(' ', '-')
id = first_recipe['id']
testurl = f"https://www.food.com/recipe/{name}-{id}"


def image_checker(url, timeout=10, threshold_dimensions=(75, 75), threshold_file_size_kb=15):
    """Return the URL of a recipe page's photo, or False if there is no usable one.

    Downloads the page at ``url``, looks for the ``<img>`` tag whose class is
    ``only-desktop svelte-kb6fq``, downloads the image it points to and checks
    that image against the size thresholds.

    Args:
        url: Address of the recipe page.
        timeout: Seconds passed to each ``requests.get`` call.
        threshold_dimensions: Minimum ``(width, height)`` of the image in pixels.
        threshold_file_size_kb: Minimum size of the downloaded image, in KiB.

    Returns:
        The absolute image URL (str) when an image is found and meets every
        threshold. ``False`` otherwise: a request failed or timed out, a
        response status was not 200, no matching tag (or no ``src``) was found,
        the image is the site's placeholder, or the image could not be decoded
        or is too small.
    """
    try:
        resp = requests.get(url, timeout=timeout)
    except Timeout:
        print(f'Timeout for URL: {url}')
        return False
    except requests.exceptions.RequestException as e:
        # Connection resets, DNS failures, etc. must not abort a whole batch.
        print(f'Request failed for URL: {url} ({e})')
        return False

    if resp.status_code != 200:
        return False

    parsedText = BeautifulSoup(resp.text, 'html.parser')
    image_tag = parsedText.find('img', class_='only-desktop svelte-kb6fq')
    if image_tag is None or not image_tag.get('src'):
        # No image with the expected class, or the tag carries no source.
        return False

    abs_image_url = urljoin(url, image_tag['src'])

    # 'recipe-default-images' in the URL means no image was uploaded for the
    # recipe; the placeholder must not count as a real photo.
    if 'recipe-default-images' in abs_image_url:
        return False

    # Download the image exactly once, always with the timeout applied.
    try:
        image_resp = requests.get(abs_image_url, timeout=timeout)
    except Timeout:
        print(f'Timeout for image: ({abs_image_url})')
        return False
    except requests.exceptions.RequestException as e:
        print(f'Request failed for image: ({abs_image_url}) ({e})')
        return False

    if image_resp.status_code != 200:
        return False

    try:
        # Open the image using PIL to get dimensions and size
        img = PILImage.open(BytesIO(image_resp.content))
        width, height = img.size
        file_size_kb = len(image_resp.content) / 1024

        # Check if the image meets the quality criteria
        big_enough = (
            width >= threshold_dimensions[0]
            and height >= threshold_dimensions[1]
            and file_size_kb >= threshold_file_size_kb
        )
    except Exception as e:
        # Deliberately broad: any undecodable payload just means "no usable image".
        print(f"Error processing image: {e}")
        return False

    return abs_image_url if big_enough else False





In [19]:
# Batch 1 of 10 (rows 0-999): the recipes are checked in lists of one thousand
# to avoid timeout errors.
img_list_1 = []

# df was renumbered 0..N-1 after sorting, so a positional slice picks exactly
# the rows whose index falls in this batch without walking the whole frame.
for row in df.iloc[0:1000].itertuples():
    slug = row.name.replace(' ', '-')
    recipe_url = f"https://www.food.com/recipe/{slug}-{row.id}"
    # One single-item dict per recipe: {row index: image URL or False}
    img_list_1.append({row.Index: image_checker(recipe_url)})
    print(row.Index)

print('Done')

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [21]:
# Batch 2 of 10 (rows 1000-1999): the recipes are checked in lists of one
# thousand to avoid timeout errors.
img_list_2 = []

# df was renumbered 0..N-1 after sorting, so a positional slice picks exactly
# the rows whose index falls in this batch without walking the whole frame.
for row in df.iloc[1000:2000].itertuples():
    slug = row.name.replace(' ', '-')
    recipe_url = f"https://www.food.com/recipe/{slug}-{row.id}"
    # One single-item dict per recipe: {row index: image URL or False}
    img_list_2.append({row.Index: image_checker(recipe_url)})
    print(row.Index)

print('Done')

1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199


In [22]:
# Batch 3 of 10 (rows 2000-2999): the recipes are checked in lists of one
# thousand to avoid timeout errors.
img_list_3 = []

# df was renumbered 0..N-1 after sorting, so a positional slice picks exactly
# the rows whose index falls in this batch without walking the whole frame.
for row in df.iloc[2000:3000].itertuples():
    slug = row.name.replace(' ', '-')
    recipe_url = f"https://www.food.com/recipe/{slug}-{row.id}"
    # One single-item dict per recipe: {row index: image URL or False}
    img_list_3.append({row.Index: image_checker(recipe_url)})
    print(row.Index)

print('Done')

2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199


In [23]:
# Batch 4 of 10 (rows 3000-3999): the recipes are checked in lists of one
# thousand to avoid timeout errors.
img_list_4 = []

# df was renumbered 0..N-1 after sorting, so a positional slice picks exactly
# the rows whose index falls in this batch without walking the whole frame.
for row in df.iloc[3000:4000].itertuples():
    slug = row.name.replace(' ', '-')
    recipe_url = f"https://www.food.com/recipe/{slug}-{row.id}"
    # One single-item dict per recipe: {row index: image URL or False}
    img_list_4.append({row.Index: image_checker(recipe_url)})
    print(row.Index)

print('Done')

3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199


In [24]:
# Batch 5 of 10 (rows 4000-4999): the recipes are checked in lists of one
# thousand to avoid timeout errors.
img_list_5 = []

# df was renumbered 0..N-1 after sorting, so a positional slice picks exactly
# the rows whose index falls in this batch without walking the whole frame.
for row in df.iloc[4000:5000].itertuples():
    slug = row.name.replace(' ', '-')
    recipe_url = f"https://www.food.com/recipe/{slug}-{row.id}"
    # One single-item dict per recipe: {row index: image URL or False}
    img_list_5.append({row.Index: image_checker(recipe_url)})
    print(row.Index)

print('Done')

4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199


In [25]:
# Batch 6 of 10 (rows 5000-5999): the recipes are checked in lists of one
# thousand to avoid timeout errors.
img_list_6 = []

# df was renumbered 0..N-1 after sorting, so a positional slice picks exactly
# the rows whose index falls in this batch without walking the whole frame.
for row in df.iloc[5000:6000].itertuples():
    slug = row.name.replace(' ', '-')
    recipe_url = f"https://www.food.com/recipe/{slug}-{row.id}"
    # One single-item dict per recipe: {row index: image URL or False}
    img_list_6.append({row.Index: image_checker(recipe_url)})
    print(row.Index)

print('Done')

5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199


In [26]:
# Batch 7 of 10 (rows 6000-6999): the recipes are checked in lists of one
# thousand to avoid timeout errors.
img_list_7 = []

# df was renumbered 0..N-1 after sorting, so a positional slice picks exactly
# the rows whose index falls in this batch without walking the whole frame.
for row in df.iloc[6000:7000].itertuples():
    slug = row.name.replace(' ', '-')
    recipe_url = f"https://www.food.com/recipe/{slug}-{row.id}"
    # One single-item dict per recipe: {row index: image URL or False}
    img_list_7.append({row.Index: image_checker(recipe_url)})
    print(row.Index)

print('Done')

6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199


In [28]:
# Batch 8 of 10 (rows 7000-7999): the recipes are checked in lists of one
# thousand to avoid timeout errors.
img_list_8 = []

# df was renumbered 0..N-1 after sorting, so a positional slice picks exactly
# the rows whose index falls in this batch without walking the whole frame.
for row in df.iloc[7000:8000].itertuples():
    slug = row.name.replace(' ', '-')
    recipe_url = f"https://www.food.com/recipe/{slug}-{row.id}"
    # One single-item dict per recipe: {row index: image URL or False}
    img_list_8.append({row.Index: image_checker(recipe_url)})
    print(row.Index)

print('Done')

7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199


In [29]:
# Batch 9 of 10 (rows 8000-8999): the recipes are checked in lists of one
# thousand to avoid timeout errors.
img_list_9 = []

# df was renumbered 0..N-1 after sorting, so a positional slice picks exactly
# the rows whose index falls in this batch without walking the whole frame.
for row in df.iloc[8000:9000].itertuples():
    slug = row.name.replace(' ', '-')
    recipe_url = f"https://www.food.com/recipe/{slug}-{row.id}"
    # One single-item dict per recipe: {row index: image URL or False}
    img_list_9.append({row.Index: image_checker(recipe_url)})
    print(row.Index)

print('Done')

8000
8001
8002
8003
8004
8005
8006
8007
8008
8009
8010
8011
8012
8013
8014
8015
8016
8017
8018
8019
8020
8021
8022
8023
8024
8025
8026
8027
8028
8029
8030
8031
8032
8033
8034
8035
8036
8037
8038
8039
8040
8041
8042
8043
8044
8045
8046
8047
8048
8049
8050
8051
8052
8053
8054
8055
8056
8057
8058
8059
8060
8061
8062
8063
8064
8065
8066
8067
8068
8069
8070
8071
8072
8073
8074
8075
8076
8077
8078
8079
8080
8081
8082
8083
8084
8085
8086
8087
8088
8089
8090
8091
8092
8093
8094
8095
8096
8097
8098
8099
8100
8101
8102
8103
8104
8105
8106
8107
8108
8109
8110
8111
8112
8113
8114
8115
8116
8117
8118
8119
8120
8121
8122
8123
8124
8125
8126
8127
8128
8129
8130
8131
8132
8133
8134
8135
8136
8137
8138
8139
8140
8141
8142
8143
8144
8145
8146
8147
8148
8149
8150
8151
8152
8153
8154
8155
8156
8157
8158
8159
8160
8161
8162
8163
8164
8165
8166
8167
8168
8169
8170
8171
8172
8173
8174
8175
8176
8177
8178
8179
8180
8181
8182
8183
8184
8185
8186
8187
8188
8189
8190
8191
8192
8193
8194
8195
8196
8197
8198
8199


In [30]:
# Batch 10 of 10 (rows 9000-9999): the recipes are checked in lists of one
# thousand to avoid timeout errors.
img_list_10 = []

# df was renumbered 0..N-1 after sorting, so a positional slice picks exactly
# the rows whose index falls in this batch without walking the whole frame.
for row in df.iloc[9000:10000].itertuples():
    slug = row.name.replace(' ', '-')
    recipe_url = f"https://www.food.com/recipe/{slug}-{row.id}"
    # One single-item dict per recipe: {row index: image URL or False}
    img_list_10.append({row.Index: image_checker(recipe_url)})
    print(row.Index)

print('Done')

9000
9001
9002
9003
9004
9005
9006
9007
9008
9009
9010
9011
9012
9013
9014
9015
9016
9017
9018
9019
9020
9021
9022
9023
9024
9025
9026
9027
9028
9029
9030
9031
9032
9033
9034
9035
9036
9037
9038
9039
9040
9041
9042
9043
9044
9045
9046
9047
9048
9049
9050
9051
9052
9053
9054
9055
9056
9057
9058
9059
9060
9061
9062
9063
9064
9065
9066
9067
9068
9069
9070
9071
9072
9073
9074
9075
9076
9077
9078
9079
9080
9081
9082
9083
9084
9085
9086
9087
9088
9089
9090
9091
9092
9093
9094
9095
9096
9097
9098
9099
9100
9101
9102
9103
9104
9105
9106
9107
9108
9109
9110
9111
9112
9113
9114
9115
9116
9117
9118
9119
9120
9121
9122
9123
9124
9125
9126
9127
9128
9129
9130
9131
9132
9133
9134
9135
9136
9137
9138
9139
9140
9141
9142
9143
9144
9145
9146
9147
9148
9149
9150
9151
9152
9153
9154
9155
9156
9157
9158
9159
9160
9161
9162
9163
9164
9165
9166
9167
9168
9169
9170
9171
9172
9173
9174
9175
9176
9177
9178
9179
9180
9181
9182
9183
9184
9185
9186
9187
9188
9189
9190
9191
9192
9193
9194
9195
9196
9197
9198
9199


In [31]:
# Concatenate the ten batches, in batch order, into one list of single-item
# {row index: image URL or False} dicts.
all_imgs_list = (
    img_list_1 + img_list_2 + img_list_3 + img_list_4 + img_list_5
    + img_list_6 + img_list_7 + img_list_8 + img_list_9 + img_list_10
)

# Report the total and a small sample rather than dumping every entry.
print(len(all_imgs_list))
print(all_imgs_list[:5])



In [47]:
# Peek at the value stored in the first entry: the image URL found for that row, or False
all_imgs_list[0].values()

dict_values(['https://img.sndimg.com/food/image/upload/f_auto,c_thumb,q_55,w_860,ar_3:2/v1/img/recipes/53/77/16/lngBqHhRRiOlsyiofM3n_DSC04762-2.jpg'])

In [53]:
# Convert the list of single-item {row index: result} dicts to a one-column
# DataFrame. The dict keys become the frame's index, so every result stays tied
# to its recipe row by label instead of by its position in the list.
# dtype=object keeps the mix of URL strings and False values as stored.
imgs_df = pd.Series(
    {
        row_index: result
        for entry in all_imgs_list
        for row_index, result in entry.items()
    },
    name='URL',
    dtype=object,
).to_frame()

# Display the rows for which no usable image was found
imgs_df[imgs_df['URL'] == False]

Unnamed: 0,URL
0,https://img.sndimg.com/food/image/upload/f_aut...
1,https://img.sndimg.com/food/image/upload/f_aut...
2,https://img.sndimg.com/food/image/upload/f_aut...
3,https://img.sndimg.com/food/image/upload/f_aut...
4,https://img.sndimg.com/food/image/upload/f_aut...


In [55]:
# Number of recipes without a usable image. Rows are counted with len() because
# .size is rows x columns and equals the row count only for a one-column frame.
len(imgs_df[imgs_df['URL'] == False])

50

In [57]:
# Add the image-check result to the recipes as a new 'URL' column
# (pandas matches the rows of the two frames by index label).
df['URL'] = imgs_df['URL']

In [60]:
# Keep only the recipes whose URL is not False (i.e. an image was found),
# renumber the remaining rows from 0, and show the last few of them.
has_image = df["URL"] != False
df_subset = df[has_image].reset_index(drop=True)
df_subset.tail()


Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients,URL
9945,momma jan s good for what ails ya chicken soup,475074,65,1688468,2012-02-26,"['weeknight', 'time-to-make', 'course', 'main-...","[463.1, 29.0, 21.0, 10.0, 60.0, 27.0, 15.0]",16,"['bring chicken broth , water , and chicken ba...","ready, set, cook! special edition contest entr...","['low sodium chicken broth', 'water', 'chicken...",13,https://img.sndimg.com/food/image/upload/f_aut...
9946,botswanan chicken groundnut stew,475060,80,226867,2012-02-26,"['time-to-make', 'course', 'main-ingredient', ...","[509.8, 53.0, 73.0, 20.0, 54.0, 39.0, 9.0]",10,"['prepare the sauce by combining the sugar , c...",this traditional recipe from botswana is for a...,"['chicken thighs', 'vegetable oil', 'onion', '...",11,https://img.sndimg.com/food/image/upload/f_aut...
9947,how to peel a head of garlic in less than 10 s...,475026,1,527607,2012-02-26,"['15-minutes-or-less', 'time-to-make', 'course...","[89.4, 0.0, 2.0, 0.0, 7.0, 0.0, 6.0]",4,['smash the head of garlic with the heel of yo...,here is an unbelievable way to peel a whole he...,"['garlic', 'water']",2,https://img.sndimg.com/food/image/upload/f_aut...
9948,mushroom and sausage bake,475029,40,1196284,2012-02-26,"['60-minutes-or-less', 'time-to-make', 'course...","[354.2, 36.0, 11.0, 38.0, 35.0, 50.0, 5.0]",10,"['brown sausage on stove', 'drain grease , add...","add ingredients that you have on hand--bacon, ...","['bob evans sausage', 'monterey jack cheese', ...",9,https://img.sndimg.com/food/image/upload/f_aut...
9949,bacon goat cheese potatoes with sundried tom...,475041,25,2192562,2012-02-26,"['weeknight', '30-minutes-or-less', 'time-to-m...","[80.3, 11.0, 6.0, 4.0, 2.0, 19.0, 0.0]",6,"['preheat oven to 350 degrees', 'in a large bo...","ready, set, cook! special edition contest entr...",['simply potatoes traditional mashed potatoes'...,10,https://img.sndimg.com/food/image/upload/f_aut...


In [61]:
# Save the recipes that have an image as CSV. to_csv writes the row index as an
# unnamed first column by default, so read the file back with index_col=0.
df_subset.to_csv('recipe_w_images.csv')

In [62]:
# Also store the filtered frame as a pickle, which keeps dtypes and the index
# exactly as they are in memory. Only unpickle files from a trusted source.
with open('./df_subset.pkl', 'wb') as pickle_file:
    pkl.dump(df_subset, pickle_file)