# Parsing of a .html file
## Naive Implementation

In [1]:
# reading line to line
html_naive = ""
# example html from wiki-a corpus
with open('files/Aakrosh_(1998_film).html') as file:
    lines = file.readlines()
    for line in lines:
        html_naive += line
print(html_naive)

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en" dir="ltr">
  <head>
    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
	<!-- headlinks removed -->
	<link rel="shortcut icon" href="../../../../misc/favicon.ico"/>
    <title>Aakrosh (1998 film) - Wikipedia, the free encyclopedia</title>
    <style type="text/css">/*<![CDATA[*/ @import "../../../../skins/offline/main.css"; /*]]>*/</style>
    <link rel="stylesheet" type="text/css" media="print" href="../../../../skins/common/commonPrint.css" />
    <!--[if lt IE 5.5000]><style type="text/css">@import "../../../../skins/monobook/IE50Fixes.css";</style><![endif]-->
    <!--[if IE 5.5000]><style type="text/css">@import "../../../../skins/monobook/IE55Fixes.css";</style><![endif]-->
    <!--[if IE 6]><style type="text/css">@import "../../../../skins/monobook/IE60Fixes.css";</style>

In [2]:
import re
# everything between ">" and "<"
tag_regex = r"(?<=>)(.*?)(?=<)"

naive_parsed_str = ""
for line in html_naive.splitlines():
    search = re.search(tag_regex, line)
    if search is not None:
        naive_parsed_str += search.group(0)
# retrieve tokens
naive_tokens = naive_parsed_str.split(" ")

print(naive_tokens)

['Aakrosh', '(1998', 'film)', '-', 'Wikipedia,', 'the', 'free', 'encyclopedia/**/Aakrosh', '(1998', 'film)From', 'Wikipedia,', 'the', 'free', 'encyclopediaFor', 'the', '1980', 'film', 'by', 'Directed', 'byLateef', 'BinnyProduced&#160;byRamesh', 'J.', 'SharmaStarringShilpa', 'ShettyGirish', 'KarnadMusic&#160;byRelease&#160;date(s)CountryLanguageDue', 'to', 'his', 'activities,', 'Anjali', 'Gulraj', 'separates', 'from', 'Mahendra', 'Pratap', 'Gujral,', 'and', 'marries', 'Dr.', 'Malhotra.', 'She', 'does', 'bring', 'up', 'her', 'son,', 'Dev', 'from', 'her', 'first', 'marriage,', 'who', 'grows', 'up', 'to', 'become', 'a', 'Police', 'Officer.', 'Years', 'later,', 'on', 'the', 'tracks', 'of', 'Gujral,', 'Dev,', 'and', 'his', 'close', 'colleague,', 'Komal', 'come', 'across', 'Suraj', 'Singh,', 'and', 'find', 'out', 'that', 'he', 'is', 'indeed', 'Gujral.', 'At', 'this', 'point', 'Gujral', 'decides', 'to', 'play', 'on', "Dev's", 'emotions', 'by', 'reminding', 'of', 'his', 'past', 'affection', 'fo

## More advanced variation

In [3]:
from bs4 import BeautifulSoup

# get file and pass it to soup parser
html_file = open('files/Aakrosh_(1998_film).html', "r")
soup = BeautifulSoup(html_file.read(), features="html.parser")

# remove any script or styles from html file
for script in soup(["script", "style"]):
    script.extract()

# remove known tags that are irrelevant
for tag in soup.find_all("span", class_="editsection"):
    tag.decompose()
soup.prettify()

# get file content as text (string)
html_text = soup.get_text()

# break into lines and remove leading and trailing space on each
lines = (line.strip() for line in html_text.splitlines())
# break multi-headlines into a line each
chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
# drop blank lines
text = ' '.join(chunk for chunk in chunks if chunk)
# remove unnecessary punctuations such as ",", "|", etc.
text = text\
    .replace(":", ".")\
    .replace(".", ")")\
    .replace(")", "(")\
    .replace("(", "-")\
    .replace("-", "•")\
    .replace("•", "|")\
    .replace("|", ";")\
    .replace(";", ",")\
    .replace(",", "")
# remove any numbers
text = re.sub('\d+', ' ', text)
# turn multi spaces into single spaces
text = re.sub('\s+', ' ', text)

# retrieve tokens
tokens = text.split(" ")

print(tokens)

['Aakrosh', 'film', 'Wikipedia', 'the', 'free', 'encyclopedia', 'Aakrosh', 'film', 'From', 'Wikipedia', 'the', 'free', 'encyclopedia', 'For', 'the', 'film', 'by', 'Govind', 'Nihalani', 'see', 'Aakrosh', 'film', 'Aakrosh', 'Cyclone', 'of', 'Anger', 'DVD', 'cover', 'Directed', 'by', 'Lateef', 'Binny', 'Produced', 'by', 'Ramesh', 'J', 'Sharma', 'Starring', 'Sunil', 'Shetty', 'Shilpa', 'Shetty', 'Girish', 'Karnad', 'Music', 'by', 'Anand', 'Raj', 'Anand', 'Release', 'dates', 'April', 'Country', 'India', 'Language', 'Hindi', 'IMDb', 'profile', 'Aakrosh', 'English', 'languageAakroshCyclone', 'of', 'Anger', 'is', 'a', 'Bollywood', 'action', 'crime', 'thriller', 'directed', 'by', 'Lateef', 'Binny', 'starring', 'Sunil', 'Shetty', 'Shilpa', 'Shetty', 'and', 'Girish', 'Karnad', 'Synopsis', 'Due', 'to', 'his', 'activities', 'Anjali', 'Gulraj', 'separates', 'from', 'Mahendra', 'Pratap', 'Gujral', 'and', 'marries', 'Dr', 'Malhotra', 'She', 'does', 'bring', 'up', 'her', 'son', 'Dev', 'from', 'her', 'f

Applying Porter Stemmer:

In [4]:
from nltk.stem import PorterStemmer

# apply stemming on every word
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(token) for token in tokens]

print(stemmed_tokens)

['aakrosh', 'film', 'wikipedia', 'the', 'free', 'encyclopedia', 'aakrosh', 'film', 'from', 'wikipedia', 'the', 'free', 'encyclopedia', 'for', 'the', 'film', 'by', 'govind', 'nihalani', 'see', 'aakrosh', 'film', 'aakrosh', 'cyclon', 'of', 'anger', 'dvd', 'cover', 'direct', 'by', 'lateef', 'binni', 'produc', 'by', 'ramesh', 'j', 'sharma', 'star', 'sunil', 'shetti', 'shilpa', 'shetti', 'girish', 'karnad', 'music', 'by', 'anand', 'raj', 'anand', 'releas', 'date', 'april', 'countri', 'india', 'languag', 'hindi', 'imdb', 'profil', 'aakrosh', 'english', 'languageaakroshcyclon', 'of', 'anger', 'is', 'a', 'bollywood', 'action', 'crime', 'thriller', 'direct', 'by', 'lateef', 'binni', 'star', 'sunil', 'shetti', 'shilpa', 'shetti', 'and', 'girish', 'karnad', 'synopsi', 'due', 'to', 'hi', 'activ', 'anjali', 'gulraj', 'separ', 'from', 'mahendra', 'pratap', 'gujral', 'and', 'marri', 'dr', 'malhotra', 'she', 'doe', 'bring', 'up', 'her', 'son', 'dev', 'from', 'her', 'first', 'marriag', 'who', 'grow', '

Removing stopwords:

In [5]:
import nltk
from nltk.corpus import stopwords
# update/download database if necessary
nltk.download('stopwords')

# filter stop words
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in stemmed_tokens if word not in stop_words]

print(str(filtered_words) + "\n")
print(str(len(filtered_words)) + " (filtered) vs. " + str(len(stemmed_tokens)) + " (unfiltered)")

['aakrosh', 'film', 'wikipedia', 'free', 'encyclopedia', 'aakrosh', 'film', 'wikipedia', 'free', 'encyclopedia', 'film', 'govind', 'nihalani', 'see', 'aakrosh', 'film', 'aakrosh', 'cyclon', 'anger', 'dvd', 'cover', 'direct', 'lateef', 'binni', 'produc', 'ramesh', 'j', 'sharma', 'star', 'sunil', 'shetti', 'shilpa', 'shetti', 'girish', 'karnad', 'music', 'anand', 'raj', 'anand', 'releas', 'date', 'april', 'countri', 'india', 'languag', 'hindi', 'imdb', 'profil', 'aakrosh', 'english', 'languageaakroshcyclon', 'anger', 'bollywood', 'action', 'crime', 'thriller', 'direct', 'lateef', 'binni', 'star', 'sunil', 'shetti', 'shilpa', 'shetti', 'girish', 'karnad', 'synopsi', 'due', 'hi', 'activ', 'anjali', 'gulraj', 'separ', 'mahendra', 'pratap', 'gujral', 'marri', 'dr', 'malhotra', 'doe', 'bring', 'son', 'dev', 'first', 'marriag', 'grow', 'becom', 'polic', 'offic', 'year', 'later', 'track', 'gujral', 'dev', 'hi', 'close', 'colleagu', 'komal', 'come', 'across', 'suraj', 'singh', 'find', 'inde', 'g

[nltk_data] Downloading package stopwords to /home/av11/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
