## Web Scraping

In this project I aim to test my web scraping and machine learning knowledge.


#### The assignment

Create a new model that is able to extract products from Furniture Stores. 

#### Inputs

You’ll be given a list of URLs (700 URLs) from furniture store sites. Most will have products on them, some won’t, some won’t even work at all.

#### Outputs

We expect a list of product names extracted from every URL, but you can get creative in presenting your results. See what the most popular product is, aggregate all the pages of a site etc.
Try to showcase what your solution is best at.

#### Guidelines

An approach that usually works well with such extraction problems is to create a NER (Named Entity Recognition) model and train it to find your entities (you have one entity, ‘PRODUCT’).

- In order to create such a model you need training data, you can also extract that from the input pages.
- Crawl ~100 pages from the list above & extract the text from them.
- Find a way to tag some sample products from these texts.
- Train a new model from the examples you just made.
- Use it to extract product names from some new, unseen pages.

In [None]:
# We start by importing the necessary packages

import csv
import requests
from bs4 import BeautifulSoup
from requests.packages.urllib3.exceptions import InsecureRequestWarning

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

#Some of the 700 links present in our "furniture.csv" don't work, or don't have any products at all.
#We create an exception for these so they get ignored.

# Crawl every URL listed in the first column of "furniture.csv" and print the
# visible text of each page. URLs that cannot be fetched are reported and
# skipped so one bad link does not stop the crawl.
with open('furniture.csv') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    # Skip the header row; the default keeps an empty file from raising.
    next(csv_reader, None)
    for row in csv_reader:
        if not row:
            # Blank CSV line: there is no URL to fetch.
            continue
        try:
            # Without a timeout a single unresponsive host can stall the
            # whole crawl indefinitely.
            page = requests.get(row[0], timeout=10)
            # Treat 4xx/5xx answers as failures instead of scraping the
            # server's error page as if it were shop content.
            page.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error: {e}")
            continue
        soup = BeautifulSoup(page.text, 'html.parser')
        text = soup.get_text()
        print(text)

#2) Find a way to tag some sample products from these texts.

import csv
from nltk import word_tokenize, pos_tag

# Fetch every URL from "furniture.csv", tokenize the page text and print the
# part-of-speech tagged tokens. Unreachable URLs are reported and skipped.
with open('furniture.csv') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    # Skip the header row; the default keeps an empty file from raising.
    next(csv_reader, None)
    for row in csv_reader:
        if not row:
            # Blank CSV line: there is no URL to fetch.
            continue
        try:
            # A timeout keeps one unresponsive host from stalling the run.
            page = requests.get(row[0], timeout=10)
            # Do not tag the text of HTTP error pages.
            page.raise_for_status()
        except requests.exceptions.RequestException as e:
            # Some listed links are dead; report them and move on rather
            # than aborting the whole pass.
            print(f"Error: {e}")
            continue
        soup = BeautifulSoup(page.text, 'html.parser')
        text = soup.get_text()
        tokens = word_tokenize(text)        # split the page text into tokens
        tagged_words = pos_tag(tokens)      # attach a POS tag to every token
        print(tagged_words)

#3) Train a new model from the examples you just made.

import csv
from nltk.chunk import ne_chunk

# One ne_chunk tree per successfully fetched page.
training_data = []

# Fetch every URL from "furniture.csv", POS-tag the page text, chunk it with
# ne_chunk and collect the resulting trees. Unreachable URLs are reported and
# skipped so they neither crash the run nor pollute the training data.
with open('furniture.csv') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    # Skip the header row; the default keeps an empty file from raising.
    next(csv_reader, None)
    for row in csv_reader:
        if not row:
            # Blank CSV line: there is no URL to fetch.
            continue
        try:
            # A timeout keeps one unresponsive host from stalling the run.
            page = requests.get(row[0], timeout=10)
            # Keep HTTP error pages out of the training data.
            page.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error: {e}")
            continue
        soup = BeautifulSoup(page.text, 'html.parser')
        text = soup.get_text()
        tokens = word_tokenize(text)
        tagged_words = pos_tag(tokens)
        named_entities = ne_chunk(tagged_words)     # chunk the tagged words
        training_data.append(named_entities)        # keep the tree for training

#4)Use it to extract product names from some new, unseen pages.

import csv
from nltk.chunk import conlltags2tree, tree2conlltags
from nltk.tag import UnigramTagger, BigramTagger

# training the tagger
from nltk.tag import DefaultTagger

# The n-gram taggers train on sentences that are flat lists of (word, tag)
# pairs. training_data holds ne_chunk trees, in which recognised entities are
# nested subtrees, so each tree is flattened back to its (word, tag) leaves.
train_trees = training_data[:500]  # use at most the first 500 pages
train_data = [tree.leaves() for tree in train_trees]
# Pages that produced no tokens yield empty sentences, which the tagger
# trainer cannot unpack; drop them.
train_data = [sentence for sentence in train_data if sentence]

# Final fallback so that words never seen during training still receive a
# tag instead of None.
# NOTE(review): ne_chunk is applied to this tagger's output later and
# appears to require string tags - confirm against the NLTK version in use.
default_tagger = DefaultTagger('NN')
unigram_tagger = UnigramTagger(train_data, backoff=default_tagger)
bigram_tagger = BigramTagger(train_data, backoff=unigram_tagger)

# testing the tagger
# Fetch every URL from "furniture.csv", tag the page text with the trained
# tagger, chunk it and collect every token that opens a PRODUCT chunk.
# Unreachable URLs are reported and skipped.
with open('furniture.csv') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    product_names = []
    # Skip the header row; the default keeps an empty file from raising.
    next(csv_reader, None)
    for row in csv_reader:
        if not row:
            # Blank CSV line: there is no URL to fetch.
            continue
        try:
            # A timeout keeps one unresponsive host from stalling the run.
            page = requests.get(row[0], timeout=10)
            # Do not extract "products" from HTTP error pages.
            page.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error: {e}")
            continue
        soup = BeautifulSoup(page.text, 'html.parser')
        text = soup.get_text()
        tokens = word_tokenize(text)
        tagged_words = bigram_tagger.tag(tokens)      # tag tokens with the trained tagger
        named_entities = ne_chunk(tagged_words)       # chunk the tagged words
        iob_tagged = tree2conlltags(named_entities)   # flatten the tree to (word, pos, IOB tag) triples
        # Only the first token of each PRODUCT chunk is kept.
        # NOTE(review): the stock ne_chunk model may never emit a PRODUCT
        # label, in which case this list stays empty - confirm.
        product_list = [word for word, pos, chunk in iob_tagged if chunk == "B-PRODUCT"]
        product_names.extend(product_list)

# writing the extracted product names to a CSV file
# Persist the collected names as a single-column CSV: one header row followed
# by one row per extracted name, in extraction order.
with open('extracted_products.csv', mode='w', newline='') as out_file:
    out_writer = csv.writer(out_file)
    out_writer.writerow(['Product Name'])
    out_writer.writerows([name] for name in product_names)

# reimporting the CSV file and finding the most frequent extracted word
from collections import Counter

# Load the saved names back from disk: the first row is the header and is
# dropped, every following row contributes its first column.
with open('extracted_products.csv') as saved_file:
    saved_rows = csv.reader(saved_file)
    product_names = [
        record[0]
        for position, record in enumerate(saved_rows)
        if position > 0
    ]

# finding the most frequent extracted word
# most_common(1) returns an empty list when nothing was extracted, so the
# result is checked before indexing to avoid an IndexError.
top_entries = Counter(product_names).most_common(1)
if top_entries:
    most_frequent_word = top_entries[0][0]
    print(f"The most frequent extracted word is: {most_frequent_word}")
else:
    most_frequent_word = None
    print("No product names were extracted.")