# Data Scrapping using Beautiful Soup

- Import Beautiful Soup

- Make a GET request to fetch Page Data

- Parse HTML

- Filter Relvant parts

## Web Scrapping 01 : Fetching Data

In [1]:
from urllib.request import urlopen

In [2]:
android_url = "https://en.wikipedia.org/wiki/Android_version_history"

In [3]:
android_data = urlopen(android_url)
print(type(android_data))

<class 'http.client.HTTPResponse'>


In [4]:
android_html = android_data.read()
#print(android_html)

In [5]:
android_data.close()

## Web Scrapping 02 : Using Beautiful Soup
### 2. Parsing Data

In [6]:
from bs4 import BeautifulSoup as soup

In [7]:
android_soup = soup(android_html, 'html.parser')
#print(android_soup)

In [8]:
android_soup.find_all('h1', {})

[<h1 class="firstHeading" id="firstHeading" lang="en">Android version history</h1>]

In [9]:
tables = android_soup.find_all('table', {'class': 'wikitable'})
print(len(tables))

31


In [10]:
android_table = tables[0]      #it extracts the html of the table
#print(android_table)

## Web Scrapping: Parsing HTML Tables

### 3. Extracting useful information

- remove undesired tags
- extract table header and data

In [11]:
#'th' gives the 'table header'
#'tr' gives the 'table rows'

In [12]:
headers = android_table.find_all('th')
print(len(headers))

5


In [13]:
headers[0]

<th>Code name
</th>

In [14]:
print(headers[0].text)

Code name



In [15]:
column_titles = [ct.text for ct in headers]
print(column_titles)

['Code name\n', 'Version number(s)\n', 'Initial release date\n', 'API level\n', 'References\n']


In [16]:
column_titles = [ct.text[:-1] for ct in headers] #to remove backslash
print(column_titles)

['Code name', 'Version number(s)', 'Initial release date', 'API level', 'References']


In [17]:
rows_data = android_table.find_all('tr')[1:]  #since we don't want headings
print(len(rows_data))
#first_row = rows_data[0]
#print(first_row)

18


In [18]:
first_row = rows_data[0].find_all('td',{})
for d in first_row:
    print(d.text[:-1])

No codename
1.0
September 23, 2008
1
[9]


In [19]:
table_rows = []
for row in rows_data:
    current_row = []
    rows_data = row.findAll('td',{})
    for idx,data in enumerate(rows_data):
       
        if idx ==1:
            current_row.append(data.text[:-1].split(": ")[-1])
        elif idx!=0 and idx!=3:
            current_row.append(data.text[:-1])
        else:
            current_row.append(data.text)
    
    table_rows.append(current_row)
    

In [20]:
print(table_rows)

[['No codename\n', '1.0', 'September 23, 2008', '1\n', '[9]'], ['1.1\n', 'February 9, 2009', '2', '[9][11]\n'], ['Cupcake\n', '1.5', 'April 27, 2009', '3\n', ''], ['Donut\n', '1.6', 'September 15, 2009', '4\n', '[12]'], ['Eclair\n', '2.0 – 2.1', 'October 26, 2009', '5 – 7\n', '[13]'], ['Froyo\n', '2.2 – 2.2.3', 'May 20, 2010', '8\n', '[14]'], ['Gingerbread\n', '2.3 – 2.3.7', 'December 6, 2010', '9 – 10\n', '[15]'], ['Honeycomb\n', '3.0 – 3.2.6', 'February 22, 2011', '11 – 13\n', '[16]'], ['Ice Cream Sandwich\n', '4.0 – 4.0.4', 'October 18, 2011', '14 – 15\n', '[17]'], ['Jelly Bean\n', '4.1 – 4.3.1', 'July 9, 2012', '16 – 18\n', '[18]'], ['KitKat\n', '4.4 – 4.4.4', 'October 31, 2013', '19 – 20\n', '[19]'], ['Lollipop\n', '5.0 – 5.1.1', 'November 12, 2014', '21 – 22\n', '[20]'], ['Marshmallow\n', '6.0 – 6.0.1', 'October 5, 2015', '23\n', '[21]'], ['Nougat\n', '7.0 – 7.1.2', 'August 22, 2016', '24 – 25\n', '[22][23][24][25]'], ['Oreo\n', '8.0 – 8.1', 'August 21, 2017', '26 – 27\n', '[26]'

## Web Scrapping 04: Creating CSV

### 4.Writing CSV Files

In [21]:
filename = 'android_version_history.csv'
with open(filename, 'w', encoding='utf-8') as f:
    #write the header
    header_string = ','.join(column_titles)
    header_string += '\n'
    f.write(header_string)
    
    
    for row in table_rows[:-1]:
        row_string = ""
        for w in row:
            w = w.replace(',','')
            row_string += w + ','
        row_string= row_string[:-1]   
        row_string += '\n'
        f.write(row_string)

## Web Scrapping 05: Cleaning Data

- remove unwanted commas and symbols
- undesired information

In [22]:
import pandas as pd

In [23]:
#we need to read commas in date

In [24]:
df = pd.read_csv('android_version_history.csv')

In [25]:
df.head(n=10)

Unnamed: 0,Code name,Version number(s),Initial release date,API level,References
0,No codename,,,,
1,,1.0,September 23 2008,1,
2,,[9],,,
3,1.1,,,,
4,,February 9 2009,2,[9][11],
5,Cupcake,,,,
6,,1.5,April 27 2009,3,
7,,,,,
8,Donut,,,,
9,,1.6,September 15 2009,4,


In [26]:
df.iloc[0][1]

nan

## Web Scraping 06: Scraping Local Files

### 6. Loading Local Files

In [28]:
with open('android1.html', encoding = 'utf-8')as f:
    page_soup = soup(f, 'html.parser')

In [30]:
page_soup.find_all('h1')

[<h1 class="firstHeading" id="firstHeading" lang="en">Android version history</h1>]

In [31]:
page_soup.find_all('table')

[<table class="wikitable">
 <tbody><tr>
 <th>Code name
 </th>
 <th>Version number(s)
 </th>
 <th>Initial release date
 </th>
 <th>API level
 </th>
 <th>References
 </th></tr>
 <tr>
 <td rowspan="2">No codename
 </td>
 <td>1.0
 </td>
 <td>September 23, 2008
 </td>
 <td>1
 </td>
 <td><sup class="reference" id="cite_ref-unofficial_and_official_codenames_9-1"><a href="https://en.wikipedia.org/wiki/Android_version_history#cite_note-unofficial_and_official_codenames-9">[9]</a></sup>
 </td></tr>
 <tr>
 <td>1.1
 </td>
 <td>February 9, 2009
 </td>
 <td>2
 </td>
 <td><sup class="reference" id="cite_ref-unofficial_and_official_codenames_9-2"><a href="https://en.wikipedia.org/wiki/Android_version_history#cite_note-unofficial_and_official_codenames-9">[9]</a></sup><sup class="reference" id="cite_ref-11"><a href="https://en.wikipedia.org/wiki/Android_version_history#cite_note-11">[11]</a></sup>
 </td></tr>
 <tr>
 <td><a href="https://en.wikipedia.org/wiki/Android_Cupcake" title="Android Cupcake">Cup