In [47]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import bs4 as bs4
from urllib.parse import urlparse
import requests
from collections import Counter
import pandas as pd
import re

In [48]:
# Read the labelled website dataset from the working directory and show it
# as the cell's output.
df = pd.read_csv(filepath_or_buffer='website_classification.csv')
df

Unnamed: 0,website_url,cleaned_website_text,Category
0,https://www.booking.com/index.html?aid=1743217,official site good hotel accommodation big sav...,Travel
1,https://travelsites.com/expedia/,expedia hotel book sites like use vacation wor...,Travel
2,https://travelsites.com/tripadvisor/,tripadvisor hotel book sites like previously d...,Travel
3,https://www.momondo.in/?ispredir=true,cheap flights search compare flights momondo f...,Travel
4,https://www.ebookers.com/?AFFCID=EBOOKERS-UK.n...,bot create free account create free account si...,Travel
...,...,...,...
1489,https://bitbuy.ca,buy bitcoin canada best cryptocurrency exchang...,Cryptocurrency
1490,https://gemini.com,buy sell trade bitcoin other crypto currencies...,Cryptocurrency
1491,https://okex.com,buy bitcoin other cryptocurrencies cryptocurre...,Cryptocurrency
1492,https://coinbene.com,coinbenecom connection timed outconnection tim...,Cryptocurrency


In [49]:
class ScrapTool:
    '''
    Download a web page and extract the pieces of text used for classification:
    the <title>, the keywords/description <meta> tags, the headings and the
    remaining visible text.
    '''

    def visit_url(self,website_url):
        '''
        Visit URL. Download the Content. Initialize the beautifulsoup object. Call parsing methods. Return Series object.

        Returns a pandas Series with the keys "website_url", "website_name" and
        "website_text". Any exception raised by the download or by parsing
        propagates to the caller.
        '''
        content = requests.get(website_url,timeout=60).content
        soup = BeautifulSoup(content,'lxml')
        result = {
            "website_url": website_url,
            "website_name": self.get_website_name(website_url),
            "website_text": self._collect_text(soup)
        }

        #Convert to Series object and return
        return pd.Series(result)

    def _collect_text(self,soup):
        '''Join title, meta, heading and body text into one space-separated string.'''
        parts = [
            self.get_html_title_tag(soup),
            self.get_html_meta_tags(soup),
            self.get_html_heading_tags(soup),
            self.get_text_content(soup),
        ]
        # A separator is required here: plain concatenation glues the last word
        # of one section to the first word of the next. Empty sections are
        # skipped so they do not leave stray spaces behind.
        return ' '.join(part for part in parts if part)

    def get_website_name(self,website_url):
        '''
        Example: returns "google" from "www.google.com"

        The second-to-last dot-separated label of the network location is
        returned. When there is no such label (e.g. "localhost" or an empty
        URL) the whole network location is returned instead of raising.
        '''
        labels = urlparse(website_url).netloc.split(".")
        if len(labels) < 2:
            return labels[0]
        return labels[-2]

    def get_html_title_tag(self,soup):
        '''Return the text content of <title> tag from a webpage ('' if the page has no title)'''
        title = soup.title
        if title is None:
            return ''
        # Keep only the string children so a stray nested tag cannot break join().
        return '. '.join(piece for piece in title.contents if isinstance(piece, str))

    def get_html_meta_tags(self,soup):
        '''Returns the text content of <meta> tags related to keywords and description from a webpage'''
        wanted_names = ('keywords','description')
        tags = soup.find_all(
            lambda tag: tag.name=="meta" and tag.has_attr('name') and tag.has_attr('content')
        )
        # Compare lower-cased so that e.g. name="Description" is picked up too.
        content = [str(tag["content"]) for tag in tags if str(tag["name"]).lower() in wanted_names]
        return ' '.join(content)

    def get_html_heading_tags(self,soup):
        '''returns the text content of heading tags. The assumption is that headings might contain relatively important text.'''
        tags = soup.find_all(["h1","h2","h3","h4","h5","h6"])
        content = [" ".join(tag.stripped_strings) for tag in tags]
        return ' '.join(content)

    def get_text_content(self,soup):
        '''returns the text content of the whole page with some exception to tags. See tags_to_ignore.'''
        # Headings, title and meta are collected by the dedicated methods above;
        # the remaining entries are excluded from the page text as well.
        tags_to_ignore = {'style', 'script', 'head', 'title', 'meta', '[document]',"h1","h2","h3","h4","h5","h6","noscript"}
        # `string=True` selects every text node (`text=` is the deprecated alias).
        tags = soup.find_all(string=True)
        result = []
        for tag in tags:
            stripped_tag = tag.strip()
            if not stripped_tag or stripped_tag.isnumeric():
                continue
            if tag.parent.name in tags_to_ignore:
                continue
            if isinstance(tag, bs4.element.Comment):
                continue
            result.append(stripped_tag)
        return ' '.join(result)

In [50]:
def cleaning_text(text):
    '''
    Normalise scraped text into lower-case, space-separated words.

    Steps, in order: lower-case, drop every character that is neither a word
    character nor whitespace, drop digit runs, collapse every whitespace run
    (newlines, tabs, non-breaking spaces, ...) into a single space and trim
    the ends.

    text: the string to clean; a non-string value raises AttributeError.
    Returns the cleaned string.
    '''
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    # Collapse after removing digits so the gap a number leaves behind does not
    # become a double space. `\s` also covers non-breaking spaces, which are
    # common in scraped HTML.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [51]:
def content_generation(website):
    '''
    Scrape `website` and return its cleaned text.

    website: URL passed to ScrapTool.visit_url.
    Returns the result of cleaning_text on the scraped "website_text", or
    None when scraping or cleaning fails (the failure is printed).
    '''
    scrapTool = ScrapTool()
    try:
        web = scrapTool.visit_url(website)
        return cleaning_text(web['website_text'])
    except Exception as exc:
        # Catch Exception rather than everything so Ctrl-C still interrupts a
        # long scraping loop, and report the cause so failures can be diagnosed.
        print('Error: ',website,'-',repr(exc))
        return None

In [54]:
websites = [
    "https://www.cyberbit.com/",
    "https://www.sonicwall.com/"
]

# Label assigned to every site scraped by this cell.
category = 'Cyber Security'

# Skip URLs that are already in the dataset so that re-running the cell does
# not append duplicate rows.
known_urls = set(df['website_url'])
new_rows = []
for website in websites:
    if website in known_urls:
        continue
    text = content_generation(website)
    if text is not None:
        new_rows.append([website, text, category])
        known_urls.add(website)

# Append all scraped rows in one step instead of growing the frame row by row.
if new_rows:
    df = pd.concat([df, pd.DataFrame(new_rows, columns=df.columns)], ignore_index=True)

Error:  https://www.cyberbit.com/
Error:  https://www.sonicwall.com/


In [55]:
# Show the last 100 rows of the dataset to inspect the most recently added entries.
df.iloc[-100:]

Unnamed: 0,website_url,cleaned_website_text,Category
1495,https://www.cisa.gov,home page cisashieldsup prc statesponsored cyb...,Cyber Security
1496,https://www.ncsc.gov.uk,ncsc,Cyber Security
1497,https://www.nist.gov/topics/cybersecurity,cybersecurity nistnist develops cybersecurity ...,Cyber Security
1498,https://www.schneier.com,schneier on securityschneier on security searc...,Cyber Security
1499,https://www.kaspersky.com,kaspersky cyber security solutions for home an...,Cyber Security
1500,https://www.symantec.com,symantec enterprise cloudto meet todays cyber ...,Cyber Security
1501,https://www.trendmicro.com,in cloud security endpoint cybersecurity tren...,Cyber Security
1502,https://www.mcafee.com,antivirus vpn identity privacy protection mcaf...,Cyber Security
1503,https://www.avast.com,avast download free antivirus vpn free easyjoi...,Cyber Security
1504,https://www.fireeye.com,living security trellixliving security learns ...,Cyber Security


In [38]:
# Remove specific rows by index label, then renumber the index from 0.
# NOTE(review): the labels are hard-coded (presumably chosen by inspecting the
# frame - confirm), and this cell's execution count (38) is lower than that of
# the cells above it, so they only point at the intended rows for the frame
# state the cell was originally run against.
rows_to_remove = [1535,1556,1561,1573,1575,1577,1585,1592]
df.drop(index=rows_to_remove, inplace=True)
df.reset_index(inplace=True, drop=True)

In [40]:
# Write the dataset back to the same file name it was loaded from, without the
# index column.
df.to_csv(path_or_buf='website_classification.csv', index=False)