In [25]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import bs4 as bs4
from urllib.parse import urlparse
import requests
from collections import Counter
import pandas as pd
import re

In [26]:
# Load the labelled website dataset from the parent directory and display it.
DATASET_PATH = '../website_classification.csv'
df = pd.read_csv(DATASET_PATH)
df

Unnamed: 0,website_url,cleaned_website_text,Category
0,https://www.booking.com/index.htmlfree?aid=174...,official site good hotel accommodation big sav...,Travel
1,https://travelsites.com/expedia/,expedia hotel book sites like use vacation wor...,Travel
2,https://travelsites.com/tripadvisor/,tripadvisor hotel book sites like previously d...,Travel
3,https://www.momondo.in/?ispredir=true,cheap flights search compare flights momondo f...,Travel
4,https://www.ebookers.com/?AFFCID=EBOOKERS-UK.n...,bot create free account create free account si...,Travel
...,...,...,...
1578,https://www.solarwinds.com/security,it security management tools free software tri...,Cyber Security
1579,https://www.axonius.com,axonius cybersecurity asset management saas ma...,Cyber Security
1580,https://www.secureworks.com,secureworks cybersecurity leader proven threat...,Cyber Security
1581,https://www.trustwave.com,leading managed detection and response trustwa...,Cyber Security


In [27]:
class ScrapTool:
    '''
    Download a web page and extract the text that describes it.

    The extracted text is assembled from the <title> tag, the keywords/description
    <meta> tags, the heading tags and the remaining visible text of the page.
    '''

    # Text nodes whose parent is one of these tags are skipped by get_text_content:
    # they are either not visible text or are already collected by another method.
    TAGS_TO_IGNORE = frozenset(['style', 'script', 'head', 'title', 'meta', '[document]',
                                "h1","h2","h3","h4","h5","h6","noscript"])

    # <meta name="..."> values whose content is collected by get_html_meta_tags.
    META_NAMES = ('keywords', 'description')

    def visit_url(self,website_url):
        '''
        Visit URL. Download the Content. Initialize the beautifulsoup object. Call parsing methods. Return Series object.

        Returns a pandas Series with the keys "website_url", "website_name" and "website_text".
        Raises whatever `requests` raises for connection problems, timeouts (60 s) and
        HTTP error statuses.
        '''
        response = requests.get(website_url,timeout=60)
        # Treat 4xx/5xx answers as failures so that an error page is not scraped
        # as if it were the content of the site.
        response.raise_for_status()
        soup = BeautifulSoup(response.content,'lxml')
        text_parts = [
            self.get_html_title_tag(soup),
            self.get_html_meta_tags(soup),
            self.get_html_heading_tags(soup),
            self.get_text_content(soup),
        ]
        result = {
            "website_url": website_url,
            "website_name": self.get_website_name(website_url),
            # Join with a space: plain concatenation glued the last word of one
            # section onto the first word of the next one.
            "website_text": ' '.join(part for part in text_parts if part)
        }

        #Convert to Series object and return
        return pd.Series(result)

    def get_website_name(self,website_url):
        '''
        Example: returns "google" from "www.google.com" or "https://www.google.com/".

        The name is the second-to-last dot-separated label of the network location,
        so multi-part suffixes are not understood ("www.bbc.co.uk" gives "co").
        A host without any dot (e.g. "localhost") is returned unchanged.
        '''
        netloc = urlparse(website_url).netloc
        if not netloc:
            # Without a scheme urlparse treats the whole input as a path;
            # re-parse it as a network location instead.
            netloc = urlparse("//" + website_url).netloc
        labels = netloc.split(".")
        if len(labels) < 2:
            return labels[0]
        return labels[-2]

    def get_html_title_tag(self,soup):
        '''Return the text content of <title> tag from a webpage ('' when the page has no <title>)'''
        title = soup.title
        if title is None:
            return ''
        # str() keeps join from failing should a child not be a plain string.
        return '. '.join(str(part) for part in title.contents)

    def get_html_meta_tags(self,soup):
        '''Returns the text content of <meta> tags related to keywords and description from a webpage'''
        tags = soup.find_all(lambda tag: tag.name=="meta" and tag.has_attr('name') and tag.has_attr('content'))
        # Compare case-insensitively so that name="Description" is picked up as well.
        content = [str(tag["content"]) for tag in tags if str(tag["name"]).lower() in self.META_NAMES]
        return ' '.join(content)

    def get_html_heading_tags(self,soup):
        '''returns the text content of heading tags. The assumption is that headings might contain relatively important text.'''
        tags = soup.find_all(["h1","h2","h3","h4","h5","h6"])
        content = [" ".join(tag.stripped_strings) for tag in tags]
        return ' '.join(content)

    def get_text_content(self,soup):
        '''returns the text content of the whole page with some exception to tags. See TAGS_TO_IGNORE.'''
        # `string=True` is the current name of the deprecated `text=True` argument.
        text_nodes = soup.find_all(string=True)
        result = []
        for node in text_nodes:
            stripped_text = node.strip()
            # Keep non-empty, non-numeric text that is neither an HTML comment
            # nor located inside one of the ignored tags.
            if node.parent.name not in self.TAGS_TO_IGNORE\
                and not isinstance(node, bs4.element.Comment)\
                and not stripped_text.isnumeric()\
                and len(stripped_text)>0:
                result.append(stripped_text)
        return ' '.join(result)

In [28]:
def cleaning_text(text):
    '''
    Normalise scraped text for the dataset.

    Lower-cases the text, turns newlines, tabs and carriage returns into spaces,
    deletes every character that is neither a word character nor whitespace,
    deletes digits and finally collapses runs of spaces into a single space.
    '''
    normalized = text.lower()

    # Line breaks and tabs become plain spaces.
    for whitespace_char in ('\n', '\t', '\r'):
        normalized = normalized.replace(whitespace_char, ' ')

    # Applied in order: strip punctuation, strip digits, squeeze repeated spaces.
    substitutions = (
        (r'[^\w\s]', ''),
        (r'\d+', ''),
        (r' +', ' '),
    )
    for pattern, replacement in substitutions:
        normalized = re.sub(pattern, replacement, normalized)

    return normalized

In [29]:
def content_generation(website):
    '''
    Scrape `website` and return its cleaned text.

    Returns the cleaned "website_text" produced by ScrapTool.visit_url, or None
    when scraping or cleaning fails; the failure is printed together with its cause.
    '''
    scrapTool = ScrapTool()
    try:
        web = scrapTool.visit_url(website)
        return cleaning_text(web['website_text'])
    except Exception as error:
        # Catch Exception instead of using a bare except so that an interrupt
        # (Ctrl-C / kernel interrupt) still stops the run, and show the reason
        # so that failures can be diagnosed.
        print('Error: ',website,'-',repr(error))
        return None

In [7]:
# Sites to scrape and add to the dataset; all of them get the same label.
websites = [
    "https://www.cyberbit.com/",
    "https://www.sonicwall.com/"
]
NEW_CATEGORY = 'Cyber Security'

# Collect the scraped rows keyed by column name. Assigning a bare 3-item list to
# df.loc[len(df)] fails as soon as the frame has another column (the loaded
# frame also shows an 'Unnamed: 0' column).
new_rows = []
for website in websites:
    text = content_generation(website)
    if text is not None:
        new_rows.append({
            'website_url': website,
            'cleaned_website_text': text,
            'Category': NEW_CATEGORY,
        })

if new_rows:
    # Continue after the largest existing label so that no existing row is
    # overwritten when the index has gaps.
    first_label = df.index.max() + 1 if len(df) else 0
    additions = pd.DataFrame(new_rows, index=range(first_label, first_label + len(new_rows)))
    if 'Unnamed: 0' in df.columns:
        # Presumably a saved copy of the row index - mirror the new labels so the
        # column stays integer-typed. TODO confirm.
        additions['Unnamed: 0'] = additions.index
    # Append all new rows at once instead of growing the frame row by row.
    df = pd.concat([df, additions])

Error:  https://www.cyberbit.com/
Error:  https://www.sonicwall.com/


In [30]:
# Display the last rows of the frame (up to 100), presumably to check whether
# the scraping cell above appended anything.
df.tail(100)

Unnamed: 0,website_url,cleaned_website_text,Category
1483,https://luno.com,buy bitcoin ethereum xrp and altcoins securely...,Cryptocurrency
1484,https://bitmex.com,bitmex most advanced crypto trading platform f...,Cryptocurrency
1485,https://deribit.com,deribit crypto options and futures exchange fo...,Cryptocurrency
1486,https://primexbt.com,primexbt no bitcoin trading platform trade cry...,Cryptocurrency
1487,https://bitbuy.ca,buy bitcoin canada best cryptocurrency exchang...,Cryptocurrency
...,...,...,...
1578,https://www.solarwinds.com/security,it security management tools free software tri...,Cyber Security
1579,https://www.axonius.com,axonius cybersecurity asset management saas ma...,Cyber Security
1580,https://www.secureworks.com,secureworks cybersecurity leader proven threat...,Cyber Security
1581,https://www.trustwave.com,leading managed detection and response trustwa...,Cyber Security


In [33]:
# Remove, in place, every row that belongs to one of the excluded categories.
excluded_categories = ['Education', 'Food', 'Health and Fitness', 'Photography', 'Travel']
rows_to_remove = df.loc[df['Category'].isin(excluded_categories)].index
df.drop(index=rows_to_remove, inplace=True)

In [34]:
# Count how many rows each remaining category has.
df['Category'].value_counts()

Business/Corporate                 109
Streaming Services                 104
Sports                             104
E-Commerce                         102
Games                               98
News                                96
Computers and Technology            93
Cyber Security                      91
Law and Government                  84
Social Networking and Messaging     83
Cryptocurrency                      80
Adult                               21
Forums                              16
Name: Category, dtype: int64

In [35]:
# Remove, in place, the row whose index label is 371.
# NOTE(review): the notebook does not say why this particular row is removed -
# confirm the reason and record it here.
df.drop(371, inplace = True)

In [22]:
# Write the frame to the working directory without the row index.
# NOTE(review): the data was read from '../website_classification.csv' (parent
# directory) but is written to the working directory - confirm that the
# different location is intended.
df.to_csv('website_classification.csv',index=False)