In [1]:
#importing the library used to query a website

import pandas as pd
import numpy as np
import seaborn as sns
#to easily display plots

%matplotlib inline           
import matplotlib.pyplot as plt

import urllib.request 
from urllib.request import urlopen
#import the beautiful soup functions to parse the data returned from the website.
from bs4 import BeautifulSoup

#Specify the url
#url="https://doris.delhigovt.nic.in"
#url="https://doris.delhigovt.nic.in/login.aspx"
url="https://doris.delhigovt.nic.in/C_search_result.aspx?vcode=WTkRrNMSaMo=&regyr=ghzKDTOo9wE=&bookno=jj9rhDUtB8Y=&prop_add=oUQq/BSORIc=&deed=ghzKDTOo9wE=&subdeed=oUQq/BSORIc=&regno=oUQq/BSORIc=&f_party=oUQq/BSORIc=&s_party=oUQq/BSORIc=&Digest=dUXMJiUaFutZVmB2rt9VGQ"
html = urlopen(url)
    

In [2]:
# Getting the html of the page is just the first step. Next step is to create a Beautiful Soup object from the html. 
# This is done by passing the html to the BeautifulSoup() function. The Beautiful Soup package is used to parse the html,
# that is, take the raw html text and break it into Python objects. The second argument 'lxml' is the html parser


In [3]:
#Parse the html in the 'soup' variable and store it in Beautiful Soup form
soup = BeautifulSoup(html, 'lxml')
type(soup)


bs4.BeautifulSoup

In [4]:
# The soup object allows you to extract interesting information about the website you're scraping such as getting 
# the title of the page as shown below.

title=soup.title
print(title)

<title>
	Untitled Page
</title>


In [5]:
text= soup.get_text()
#print (text)
#print(soup.text)


In [6]:
soup.find_all('a')


[<a href="login.aspx" id="HyperLink1"><b>Click Here For Login Page</b></a>]

In [7]:
#can use a for loop and the get("href") method to extract and print out only the hyperlinks.
all_links=soup.find_all("a")
for link in all_links:
    print(link.get("href"))

login.aspx


In [8]:
#to print out table rows only we pass the tr argument in soup.find_all
rows=soup.find_all('tr')
print(rows[:10])

[<tr>
<td align="center" colspan="2">
<span id="lblmsg"><font color="Red">Some Error occured. Please Enter Currect Url</font></span>
</td>
</tr>, <tr>
<td align="center" colspan="2" style="height: 21px">
<a href="login.aspx" id="HyperLink1"><b>Click Here For Login Page</b></a>
</td>
</tr>]


In [9]:
# We want to take a table from a webpage and convert it into a dataframe for easier manipulation using Python. To get there, 
#we should get all table rows in list form first and then convert that list into a dataframe. Below is a for loop that
# iterates through table rows and prints out the cells of the rows.
for row in rows:
    row_td=row.find_all('td')
    print(row_td)
    type(row_td)
    

[<td align="center" colspan="2">
<span id="lblmsg"><font color="Red">Some Error occured. Please Enter Currect Url</font></span>
</td>]
[<td align="center" colspan="2" style="height: 21px">
<a href="login.aspx" id="HyperLink1"><b>Click Here For Login Page</b></a>
</td>]


In [10]:
#The output above shows that each row is printed with html tags embedded in each row. This is not what we want. 
#we can use remove the html tags using Beautiful Soup or regular expressions.
string_cells=str(row_td)
cleantext = BeautifulSoup(string_cells,"lxml").get_text()
print(cleantext)

[
Click Here For Login Page
]


In [11]:
import re

list_rows = []
for row in rows:
    cells = row.find_all('td')
    str_cells = str(cells)
    clean = re.compile('<.*?>')
    clean2 = (re.sub(clean, '',str_cells))
    list_rows.append(clean2)
print(clean2)
type(clean2)

[
Click Here For Login Page
]


str

In [12]:
df = pd.DataFrame(list_rows)
df.head()

Unnamed: 0,0
0,[\nSome Error occured. Please Enter Currect Ur...
1,[\nClick Here For Login Page\n]


In [13]:
df1 = df[0].str.split(',', expand=True)
df1.head(10)

Unnamed: 0,0
0,[\nSome Error occured. Please Enter Currect Ur...
1,[\nClick Here For Login Page\n]


In [14]:
df1[0] = df1[0].str.strip('[ ')
#df1[0] = df1[0].str.strip('] ')
df1.head(10)

Unnamed: 0,0
0,\nSome Error occured. Please Enter Currect Url\n]
1,\nClick Here For Login Page\n]


In [15]:
# the table is missing table headers.
col_labels = soup.find_all('th')
col_labels

[]

In [16]:
all_header = []
col_str = str(col_labels)
cleantext2 = BeautifulSoup(col_str, "lxml").get_text()
all_header.append(cleantext2)
print(all_header)

['[]']


In [17]:
#session= requests.session()
#data=session.get(url).content

In [18]:
df2 = pd.DataFrame(all_header)
df2.head()

Unnamed: 0,0
0,[]


In [19]:
df3=df2[0].str.split(',',expand=True)
df3.head()

Unnamed: 0,0
0,[]


In [20]:
frames=[df3,df1]
df4=pd.concat(frames)
df4.head(10)

Unnamed: 0,0
0,[]
0,\nSome Error occured. Please Enter Currect Url\n]
1,\nClick Here For Login Page\n]


In [21]:
df4.shape

(3, 1)

In [22]:
df5=df4.rename(columns=df4.iloc[0])
df5.head()

Unnamed: 0,[]
0,[]
0,\nSome Error occured. Please Enter Currect Url\n]
1,\nClick Here For Login Page\n]


In [23]:
df5.info()
df5.shape

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3 entries, 0 to 1
Data columns (total 1 columns):
[]    3 non-null object
dtypes: object(1)
memory usage: 48.0+ bytes


(3, 1)

In [24]:
df6=df5.dropna(axis=0,how='any')

In [25]:
df7=df6.drop(df6.index[0])
df7.head()

Unnamed: 0,[]
1,\nClick Here For Login Page\n]
