In [1]:
print('DATA CLEANING')


DATA CLEANING


In [2]:
# Web Scraping, pickle imports
import requests
from bs4 import BeautifulSoup
import pickle

# Scrapes vegetable data from https://www.medicalnewstoday.com/
def url_to_transcript(url, timeout=30):
    '''Fetch an article page and return the text of its paragraphs.

    Written specifically for https://www.medicalnewstoday.com/ article pages:
    paragraphs are taken from the first element whose CSS class is
    "css-z468a2".

    Args:
        url: Address of the article page to download.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        A list of strings, one per <p> element found inside the container.

    Raises:
        requests.HTTPError: If the server answers with an error status code.
        ValueError: If the page has no element with the expected CSS class.
    '''
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")
    container = soup.find(class_="css-z468a2")
    if container is None:
        # find() returns None when nothing matches; report that explicitly
        # rather than letting ".find_all" fail with an opaque AttributeError.
        raise ValueError(
            'No element with class "css-z468a2" found at %s' % url
        )
    text = [p.text for p in container.find_all('p')]
    # Progress indicator: one line per successfully scraped URL.
    print(url)
    return text

# URLs of vegetables in scope (article pages on https://www.medicalnewstoday.com/).
# NOTE(review): presumably urls[i] is the article for vegetable[i] -- both lists
# have 14 entries -- confirm the pairing before reordering either list.
urls = ['https://www.medicalnewstoday.com/articles/270609',
        'https://www.medicalnewstoday.com/articles/270435',
        'https://www.medicalnewstoday.com/articles/266765',
        'https://www.medicalnewstoday.com/articles/327090',
        'https://www.medicalnewstoday.com/articles/281438',
        'https://www.medicalnewstoday.com/articles/311343',
        'https://www.medicalnewstoday.com/articles/270191',
        'https://www.medicalnewstoday.com/articles/284823#1',
        'https://www.medicalnewstoday.com/articles/273031',
        'https://www.medicalnewstoday.com/articles/265853',
        'https://www.medicalnewstoday.com/articles/276714',
        'https://www.medicalnewstoday.com/articles/284765',
        'https://www.medicalnewstoday.com/articles/323916',
        'https://www.medicalnewstoday.com/articles/282844']

# Vegetable names; these strings are also used as dictionary keys and as the
# file names under transcripts/ in later cells.
##vegetable = ['6', '3','8','7','14','10','2','11','4','13','1','12','5','9']
vegetable = ['spinach', 'kale', 'broccoli', 'peas', 'sweet potatoes', 'beets', 'carrots', 'cabbage', 'tomato', 'garlic', 'onions', 'sprouts', 'seaweed', 'cauliflower']

In [3]:
#Request transcripts (takes a few minutes to run)
transcripts = [url_to_transcript(u) for u in urls]

https://www.medicalnewstoday.com/articles/270609
https://www.medicalnewstoday.com/articles/270435
https://www.medicalnewstoday.com/articles/266765
https://www.medicalnewstoday.com/articles/327090
https://www.medicalnewstoday.com/articles/281438
https://www.medicalnewstoday.com/articles/311343
https://www.medicalnewstoday.com/articles/270191
https://www.medicalnewstoday.com/articles/284823#1
https://www.medicalnewstoday.com/articles/273031
https://www.medicalnewstoday.com/articles/265853
https://www.medicalnewstoday.com/articles/276714
https://www.medicalnewstoday.com/articles/284765
https://www.medicalnewstoday.com/articles/323916
https://www.medicalnewstoday.com/articles/282844


In [4]:
# Pickle files for later use

# Make a new directory to hold the text files
#!mkdir transcripts

#for i, c in enumerate(vegetable):
#     with open("transcripts/" + c + ".txt", "wb") as file:
#        pickle.dump(transcripts[i], file)

In [5]:
# Load pickled files (creating any that are missing from the scraped transcripts)
import os

# These pickles are written by this notebook itself; pickle.load must never be
# pointed at files that come from an untrusted source.
os.makedirs("transcripts", exist_ok=True)

data = {}
for i, c in enumerate(vegetable):
    path = "transcripts/" + c + ".txt"
    # Write the cache entry when it is missing so the notebook also runs from a
    # fresh checkout, where the transcripts/ directory has not been populated.
    if not os.path.exists(path):
        with open(path, "wb") as file:
            pickle.dump(transcripts[i], file)
    with open(path, "rb") as file:
        data[c] = pickle.load(file)

In [6]:
# Double check to make sure data has been loaded properly
data.keys()

dict_keys(['spinach', 'kale', 'broccoli', 'peas', 'sweet potatoes', 'beets', 'carrots', 'cabbage', 'tomato', 'garlic', 'onions', 'sprouts', 'seaweed', 'cauliflower'])

In [7]:
# More checks
data['spinach'][:2]

['Spinach is a superfood. It is loaded with tons of nutrients in a low-calorie package. Dark, leafy greens like spinach are important for skin, hair, and bone health. They also provide protein, iron, vitamins, and minerals.',
 'The possible health benefits of consuming spinach include improving blood glucose control in people with diabetes, lowering the risk of cancer, and improving bone health, as well as supplying minerals and vitamins that can provide a range of different']

In [8]:
##Common data cleaning steps on all text:

##Make text all lower case
##Remove punctuation
##Remove numerical values
##Remove common nonsensical text (\n)
##Tokenize text
##Remove stop words

In [9]:
# Let's take a look at our data again
next(iter(data.keys()))

'spinach'

In [10]:
# Notice that our dictionary is currently in key: vegetable, value: list of text format
next(iter(data.values()))

['Spinach is a superfood. It is loaded with tons of nutrients in a low-calorie package. Dark, leafy greens like spinach are important for skin, hair, and bone health. They also provide protein, iron, vitamins, and minerals.',
 'The possible health benefits of consuming spinach include improving blood glucose control in people with diabetes, lowering the risk of cancer, and improving bone health, as well as supplying minerals and vitamins that can provide a range of different',
 'Spinach has been used by various cultures throughout history, notably in Mediterranean, Middle-Eastern, and South-East-Asian cuisines. It can be incorporated quite easily into any diet, as it is cheap and easy to prepare.',
 'This article explores the nutrition contained in spinach, how it can benefit the body, and a range of flavorsome ways to include this in the diet.',
 'One cup of raw spinach contains:',
 'Spinach also contains vitamin K, fiber, phosphorus, and thiamine. Most of the calories in spinach come

In [11]:
# We are going to change this to key: vegetable, value: string format
def combine_text(list_of_text):
    '''Join every string in list_of_text into one string.

    Neighbouring items are separated by a single space; an empty list yields
    an empty string.
    '''
    separator = ' '
    return separator.join(list_of_text)

In [12]:
# Combine it!
# Each value becomes a one-element list holding the joined text, so every
# vegetable contributes exactly one cell when the dict is turned into a frame.
data_combined = {}
for key, value in data.items():
    data_combined[key] = [combine_text(value)]

In [13]:
# We can either keep it in dictionary format or put it into a pandas dataframe
import pandas as pd
# Use the fully qualified option name; the abbreviated 'max_colwidth' relies on
# pattern matching and can break if pandas adds a similarly named option.
pd.set_option('display.max_colwidth', 150)

# One row per vegetable (dict keys become the index), one 'transcript' column,
# rows ordered alphabetically by vegetable name.
data_df = (
    pd.DataFrame.from_dict(data_combined, orient='index', columns=['transcript'])
    .sort_index()
)
data_df

Unnamed: 0,transcript
beets,"People often describe beetroot as a superfood and have used it for centuries to treat fever, constipation, and skin complaints. Researchers are no..."
broccoli,Broccoli has a reputation as a superfood. It is low in calories but contains a wealth of nutrients and antioxidants that support many aspects of h...
cabbage,"Cabbage, which is often lumped into the same category as lettuce because of their similar appearance, is actually a part of the cruciferous vegeta..."
carrots,"We include products we think are useful for our readers. If you buy through links on this page, we may earn a small commission. Here’s our process..."
cauliflower,Cauliflower is a cruciferous vegetable that is naturally high in fiber and B-vitamins. It provides antioxidants and phytonutrients that can protec...
garlic,"Garlic (Allium sativum), is used widely as a flavoring in cooking, but it has also been used as a medicine throughout ancient and modern history; ..."
kale,"We include products we think are useful for our readers. If you buy through links on this page, we may earn a small commission. Here’s our process..."
onions,"We include products we think are useful for our readers. If you buy through links on this page, we may earn a small commission. Here’s our process..."
peas,"A recent review and meta-analysis focus on the role of legumes in heart health. Taking data from multiple studies and earlier analyses, the author..."
seaweed,"Seaweed grows in or near salty waters. There are several types, and they generally contain many healthful minerals that are easy for the body to b..."


In [14]:
# Let's take a look at the transcript for vegetable Spinach
data_df.transcript.loc['spinach']

'Spinach is a superfood. It is loaded with tons of nutrients in a low-calorie package. Dark, leafy greens like spinach are important for skin, hair, and bone health. They also provide protein, iron, vitamins, and minerals. The possible health benefits of consuming spinach include improving blood glucose control in people with diabetes, lowering the risk of cancer, and improving bone health, as well as supplying minerals and vitamins that can provide a range of different Spinach has been used by various cultures throughout history, notably in Mediterranean, Middle-Eastern, and South-East-Asian cuisines. It can be incorporated quite easily into any diet, as it is cheap and easy to prepare. This article explores the nutrition contained in spinach, how it can benefit the body, and a range of flavorsome ways to include this in the diet. One cup of raw spinach contains: Spinach also contains vitamin K, fiber, phosphorus, and thiamine. Most of the calories in spinach come from protein and car

In [15]:
# Apply a first round of text cleaning techniques
import re
import string

def clean_text_round1(text):
    '''First cleaning pass over a transcript.

    Lowercases the text, removes anything enclosed in square brackets, strips
    every character in string.punctuation, and deletes words that contain a
    digit.

    Note: punctuation is deleted rather than replaced by a space, so
    hyphenated words are merged ("low-calorie" becomes "lowcalorie").

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Whitespace is left untouched, so removed tokens
        can leave double spaces behind.
    '''
    text = text.lower()
    # Raw strings keep the regex escapes (\[, \w, \d) from being read as
    # invalid Python string escapes, which newer Python versions warn about.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

# Alias kept for the .apply() calls below; a lambda wrapper adds nothing.
round1 = clean_text_round1

In [16]:
# Let's take a look at the updated text
# Run the first cleaning pass over every transcript and keep the result as a
# single-column frame that shares data_df's index.
data_clean = data_df['transcript'].apply(round1).to_frame()
data_clean

Unnamed: 0,transcript
beets,people often describe beetroot as a superfood and have used it for centuries to treat fever constipation and skin complaints researchers are now i...
broccoli,broccoli has a reputation as a superfood it is low in calories but contains a wealth of nutrients and antioxidants that support many aspects of hu...
cabbage,cabbage which is often lumped into the same category as lettuce because of their similar appearance is actually a part of the cruciferous vegetabl...
carrots,we include products we think are useful for our readers if you buy through links on this page we may earn a small commission here’s our process so...
cauliflower,cauliflower is a cruciferous vegetable that is naturally high in fiber and bvitamins it provides antioxidants and phytonutrients that can protect ...
garlic,garlic allium sativum is used widely as a flavoring in cooking but it has also been used as a medicine throughout ancient and modern history it ha...
kale,we include products we think are useful for our readers if you buy through links on this page we may earn a small commission here’s our process ka...
onions,we include products we think are useful for our readers if you buy through links on this page we may earn a small commission here’s our process on...
peas,a recent review and metaanalysis focus on the role of legumes in heart health taking data from multiple studies and earlier analyses the authors c...
seaweed,seaweed grows in or near salty waters there are several types and they generally contain many healthful minerals that are easy for the body to bre...


In [17]:
# Apply a second round of cleaning
def clean_text_round2(text):
    '''Second cleaning pass: delete curly quotes, ellipsis characters and newlines.

    These characters are not part of string.punctuation, so the first pass
    leaves them in place.
    '''
    for pattern in ('[‘’“”…]', '\n'):
        text = re.sub(pattern, '', text)
    return text


def round2(x):
    '''Thin wrapper around clean_text_round2, used with Series.apply.'''
    return clean_text_round2(x)

In [18]:
# Let's take a look at the updated text
# Run the second cleaning pass on the already round-1-cleaned transcripts.
data_clean = data_clean['transcript'].apply(round2).to_frame()
data_clean

Unnamed: 0,transcript
beets,people often describe beetroot as a superfood and have used it for centuries to treat fever constipation and skin complaints researchers are now i...
broccoli,broccoli has a reputation as a superfood it is low in calories but contains a wealth of nutrients and antioxidants that support many aspects of hu...
cabbage,cabbage which is often lumped into the same category as lettuce because of their similar appearance is actually a part of the cruciferous vegetabl...
carrots,we include products we think are useful for our readers if you buy through links on this page we may earn a small commission heres our process som...
cauliflower,cauliflower is a cruciferous vegetable that is naturally high in fiber and bvitamins it provides antioxidants and phytonutrients that can protect ...
garlic,garlic allium sativum is used widely as a flavoring in cooking but it has also been used as a medicine throughout ancient and modern history it ha...
kale,we include products we think are useful for our readers if you buy through links on this page we may earn a small commission heres our process kal...
onions,we include products we think are useful for our readers if you buy through links on this page we may earn a small commission heres our process oni...
peas,a recent review and metaanalysis focus on the role of legumes in heart health taking data from multiple studies and earlier analyses the authors c...
seaweed,seaweed grows in or near salty waters there are several types and they generally contain many healthful minerals that are easy for the body to bre...


In [19]:
print('CORPUS')

CORPUS


In [20]:

# Let's take a look at our dataframe
data_df

Unnamed: 0,transcript
beets,"People often describe beetroot as a superfood and have used it for centuries to treat fever, constipation, and skin complaints. Researchers are no..."
broccoli,Broccoli has a reputation as a superfood. It is low in calories but contains a wealth of nutrients and antioxidants that support many aspects of h...
cabbage,"Cabbage, which is often lumped into the same category as lettuce because of their similar appearance, is actually a part of the cruciferous vegeta..."
carrots,"We include products we think are useful for our readers. If you buy through links on this page, we may earn a small commission. Here’s our process..."
cauliflower,Cauliflower is a cruciferous vegetable that is naturally high in fiber and B-vitamins. It provides antioxidants and phytonutrients that can protec...
garlic,"Garlic (Allium sativum), is used widely as a flavoring in cooking, but it has also been used as a medicine throughout ancient and modern history; ..."
kale,"We include products we think are useful for our readers. If you buy through links on this page, we may earn a small commission. Here’s our process..."
onions,"We include products we think are useful for our readers. If you buy through links on this page, we may earn a small commission. Here’s our process..."
peas,"A recent review and meta-analysis focus on the role of legumes in heart health. Taking data from multiple studies and earlier analyses, the author..."
seaweed,"Seaweed grows in or near salty waters. There are several types, and they generally contain many healthful minerals that are easy for the body to b..."


In [21]:
# Let's add the vegetables' fullnames as well

# Full name for each vegetable, keyed by the DataFrame index label so the
# assignment does not silently depend on the row order of data_df.
full_name_by_vegetable = {
    'beets': 'Beetroot',
    'broccoli': 'Brassica oleracea var. italica',
    'cabbage': 'Brassica oleracea var. capitata',
    'carrots': 'Daucus carota subsp. sativus',
    'cauliflower': 'Brassica oleracea var. botrytis',
    'garlic': 'Allium sativum',
    'kale': 'Kale',
    'onions': 'Allium cepa',
    'peas': 'Pisum sativum',
    'seaweed': 'Seaweed',
    'spinach': 'Spinacia oleracea',
    'sprouts': 'Sprouts',
    'sweet potatoes': 'Ipomoea batatas',
    'tomato': 'Solanum lycopersicum',
}
# A vegetable missing from the mapping raises KeyError here instead of
# shifting every following name onto the wrong row.
fullnames = [full_name_by_vegetable[name] for name in data_df.index]
data_df['full_names'] = fullnames
data_df

Unnamed: 0,transcript,full_names
beets,"People often describe beetroot as a superfood and have used it for centuries to treat fever, constipation, and skin complaints. Researchers are no...",Beetroot
broccoli,Broccoli has a reputation as a superfood. It is low in calories but contains a wealth of nutrients and antioxidants that support many aspects of h...,Brassica oleracea var. italica
cabbage,"Cabbage, which is often lumped into the same category as lettuce because of their similar appearance, is actually a part of the cruciferous vegeta...",Brassica oleracea var. capitata
carrots,"We include products we think are useful for our readers. If you buy through links on this page, we may earn a small commission. Here’s our process...",Daucus carota subsp. sativus
cauliflower,Cauliflower is a cruciferous vegetable that is naturally high in fiber and B-vitamins. It provides antioxidants and phytonutrients that can protec...,Brassica oleracea var. botrytis
garlic,"Garlic (Allium sativum), is used widely as a flavoring in cooking, but it has also been used as a medicine throughout ancient and modern history; ...",Allium sativum
kale,"We include products we think are useful for our readers. If you buy through links on this page, we may earn a small commission. Here’s our process...",Kale
onions,"We include products we think are useful for our readers. If you buy through links on this page, we may earn a small commission. Here’s our process...",Allium cepa
peas,"A recent review and meta-analysis focus on the role of legumes in heart health. Taking data from multiple studies and earlier analyses, the author...",Pisum sativum
seaweed,"Seaweed grows in or near salty waters. There are several types, and they generally contain many healthful minerals that are easy for the body to b...",Seaweed


In [22]:
# Let's pickle it for later use
data_df.to_pickle("corpus.pkl")


In [23]:
print('Document-Term Matrix')

Document-Term Matrix


In [24]:
# We are going to create a document-term matrix using CountVectorizer, and exclude common English stop words
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(data_clean.transcript)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement and
# fall back only on older releases that do not provide it yet.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()
# Rows are documents (vegetables), columns are vocabulary terms, values are counts.
data_dtm = pd.DataFrame(data_cv.toarray(), columns=feature_names, index=data_clean.index)
data_dtm

Unnamed: 0,ability,able,abnormally,absorb,absorbed,absorbing,absorbs,absorption,academic,accompaniment,...,yams,year,yearround,years,yellow,yes,young,younger,zeaxanthin,zinc
beets,2,0,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
broccoli,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,1
cabbage,2,2,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
carrots,0,0,0,2,0,0,1,0,0,0,...,0,2,0,1,5,1,0,0,2,0
cauliflower,0,0,0,0,0,0,0,2,0,0,...,0,0,0,1,0,0,0,0,0,0
garlic,0,0,1,0,0,0,0,0,1,0,...,0,1,0,3,0,0,0,0,0,0
kale,0,0,0,1,0,2,0,0,0,0,...,0,1,0,0,0,0,0,1,1,1
onions,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
peas,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
seaweed,0,1,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
# Let's pickle it for later use
data_dtm.to_pickle("dtm.pkl")

In [26]:
# Let's also pickle the cleaned data (before we put it in document-term matrix format) and the CountVectorizer object
data_clean.to_pickle('data_clean.pkl')
# Use a context manager so the file handle is flushed and closed right away
# instead of being left open until garbage collection.
with open("cv.pkl", "wb") as cv_file:
    pickle.dump(cv, cv_file)

In [27]:
print('DATA CLEANING DONE')
print('BY : BENEDICTO')

DATA CLEANING DONE
BY : BENEDICTO
