# Project Web Scraping

## Web scraping

In [3]:
#importing relevant libraries 
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [4]:
#sending an http request to the web page
url="https://www.prepscholar.com/toefl/blog/toefl-vocabulary-list/"
data=requests.get(url).text

In [5]:
#parse the html content
soup=BeautifulSoup(data,"html5lib")

In [6]:
#creating an empty new data frame where our toefl words are gonna be 
toefl_vocabulary=pd.DataFrame(columns=["Word","Definition","Sample Sentence"])

In [11]:
#splitting the word in 3 columns. Using the html syntax (<tr>,<td> and <tbody>) to performance it.
for row in soup.find("tbody").find_all("tr"):
    col=row.find_all("td")
    word=col[0].text
    definition=col[1].text
    sample=col[2].text
    #adding each new row to the data frame
    toefl_vocabulary=toefl_vocabulary.append({"Word":word,"Definition":definition,"Sample Sentence":sample},ignore_index=True)


In [14]:
#an issue came up, the symbol ' is misinterpret, we need to solve it 
toefl_vocabulary.head()

Unnamed: 0,Word,Definition,Sample Sentence
0,Wilt,To droop and become limp.,Plants will wilt if you donât water them reg...
1,Word,Definition,Sample Sentence
2,Abundant,Present in large quantities.,Living close to a lake means we have an abunda...
3,Accumulate,To gradually collect.,"Each fall, leaves accumulate in our driveway."
4,Accurate,Correct; free from errors.,Make sure your address is accurate before subm...


## Data cleaning

In [15]:
toefl_vocabulary=toefl_vocabulary.iloc[2:]

In [17]:
toefl_vocabulary=toefl_vocabulary.reset_index()

In [19]:
toefl_vocabulary.drop("index",axis=1,inplace=True)

In [20]:
toefl_vocabulary

Unnamed: 0,Word,Definition,Sample Sentence
0,Abundant,Present in large quantities.,Living close to a lake means we have an abunda...
1,Accumulate,To gradually collect.,"Each fall, leaves accumulate in our driveway."
2,Accurate,Correct; free from errors.,Make sure your address is accurate before subm...
3,Accustomed,Used to something.,Having 8AM classes means Iâm accustomed to g...
4,Acquire,To come into possession of.,"When my grandmother died, I acquired her cookb..."
...,...,...,...
322,Voluminous,Taking up a lot of space.,The puffy wedding dress had voluminous sleeves.
323,Whereas,On the contrary.,I always save my money whereas my brother is c...
324,Wholly,Completely.,The monk is wholly devoted to his faith.
325,Widespread,Occurring over a large region.,There is widespread poverty across that country.


In [24]:
# the symbol ' is unterpreter as â\x80\x99, lets fix it
toefl_vocabulary["Sample Sentence"][3],toefl_vocabulary["Sample Sentence"][326]

('Having 8AM classes means Iâ\x80\x99m accustomed to getting up early.',
 'Plants will wilt if you donâ\x80\x99t water them regularly.')

In [25]:
toefl_vocabulary["Sample Sentence"][3].replace("â\x80\x99","'")

"Having 8AM classes means I'm accustomed to getting up early."

In [27]:
#apply a lambda function to fix this issue 
toefl_vocabulary["Sample Sentence"].apply(lambda x: x.replace("â\x80\x99","'"))

0      Living close to a lake means we have an abunda...
1          Each fall, leaves accumulate in our driveway.
2      Make sure your address is accurate before subm...
3      Having 8AM classes means I'm accustomed to get...
4      When my grandmother died, I acquired her cookb...
                             ...                        
322      The puffy wedding dress had voluminous sleeves.
323    I always save my money whereas my brother is c...
324             The monk is wholly devoted to his faith.
325     There is widespread poverty across that country.
326    Plants will wilt if you don't water them regul...
Name: Sample Sentence, Length: 327, dtype: object

In [28]:

toefl_vocabulary["Sample Sentence"]=toefl_vocabulary["Sample Sentence"].apply(lambda x: x.replace("â\x80\x99","'"))
    

In [29]:
toefl_vocabulary.head()

Unnamed: 0,Word,Definition,Sample Sentence
0,Abundant,Present in large quantities.,Living close to a lake means we have an abunda...
1,Accumulate,To gradually collect.,"Each fall, leaves accumulate in our driveway."
2,Accurate,Correct; free from errors.,Make sure your address is accurate before subm...
3,Accustomed,Used to something.,Having 8AM classes means I'm accustomed to get...
4,Acquire,To come into possession of.,"When my grandmother died, I acquired her cookb..."


In [30]:
#just in case
toefl_vocabulary["Definition"]=toefl_vocabulary["Definition"].apply(lambda x: x.replace("â\x80\x99","'"))

In [31]:
#finally converting our data frame as a excel
toefl_vocabulary.to_excel("toefl_vocabulary.xlsx")