## **Installing Conda**

In [None]:
# Show which Python interpreter is active and its version before Miniconda is installed.
!which python # should return /usr/local/bin/python
!python --version
!echo $PYTHONPATH
# Set PYTHONPATH to an empty value for the rest of this session.
%env PYTHONPATH=

In [None]:
%%bash
# Download the Miniconda 4.5.4 installer and run it unattended into /usr/local.
# Abort on the first failing command so a failed or partial download is never executed
# (without this, %%bash only reports the exit status of the last command).
set -euo pipefail
MINICONDA_INSTALLER_SCRIPT=Miniconda3-4.5.4-Linux-x86_64.sh
MINICONDA_PREFIX=/usr/local
# -O overwrites any previous copy, so a re-run never executes a stale installer
# while the fresh download is saved under a ".1" suffix.
wget -O "$MINICONDA_INSTALLER_SCRIPT" "https://repo.continuum.io/miniconda/$MINICONDA_INSTALLER_SCRIPT"
chmod +x "$MINICONDA_INSTALLER_SCRIPT"
# -b: batch mode (no prompts), -f: do not fail if the prefix already exists, -p: install prefix
"./$MINICONDA_INSTALLER_SCRIPT" -b -f -p "$MINICONDA_PREFIX"

In [None]:
# Sanity-check the installation: print the location and version of conda and python.
!which conda # should return /usr/local/bin/conda
!conda --version # should return 4.5.4
!which python # still returns /usr/local/bin/python
!python --version # now returns Python 3.6.5 :: Anaconda, Inc.

In [None]:
%%bash
# Pin Python to 3.6, then bring every package from the defaults channel up to date.
# Stop at the first failure: otherwise a failed install is hidden by the update
# that follows, because %%bash only reports the last command's exit status.
set -euo pipefail
conda install --channel defaults conda python=3.6 --yes
conda update --channel defaults --all --yes

In [None]:
# Print the conda and python versions again to confirm the update took effect.
!conda --version # now returns 4.8.3
!python --version # now returns Python 3.6.10 :: Anaconda, Inc.

In [None]:
import sys

# Make packages installed under /usr/local (Python 3.6 site-packages) importable
# from this kernel.
CONDA_SITE_PACKAGES = "/usr/local/lib/python3.6/site-packages"

# Guarded so that re-running the cell does not pile up duplicate entries.
if CONDA_SITE_PACKAGES not in sys.path:
    sys.path.append(CONDA_SITE_PACKAGES)

In [None]:
# Install featuretools from the conda-forge channel; --yes skips the confirmation prompt.
!conda install --channel conda-forge featuretools --yes

## **Installing Scrapy**

In [None]:
# `!cd` runs in a throwaway subshell and leaves the notebook's working directory
# untouched; the %cd magic changes it for the following cells.
# NOTE(review): assumes the intended target is Colab's /content directory — confirm.
%cd /content

In [None]:
# --yes answers conda's confirmation prompt, as in the other conda cells of this notebook.
!conda install -c conda-forge scrapy --yes
!pip install Scrapy

#### **Note:** The spider script below must be saved in `group7_spider.py` — do **NOT** run it in this notebook!

In [None]:
import scrapy
import json
import re


class FactchecksSpider(scrapy.Spider):
    """Crawl the Snopes fact-check listing and emit one item per article.

    ``parse`` walks the paginated listing that starts at ``start_urls`` and
    schedules every article it finds; ``parse_fact_details`` scrapes a single
    article and additionally follows links to other fact-checks found in the
    article body.

    NOTE(review): ``FactCheckItem`` is neither defined nor imported in the
    visible code; presumably it lives in the Scrapy project's items module and
    must be imported in ``group7_spider.py`` — confirm.
    """

    name = "FactChecks"
    allowed_domains = ["www.snopes.com"]
    start_urls = ["https://www.snopes.com/fact-check/"]
    # URLs that were already scheduled. Class-level, hence shared by all instances.
    url_set = {"https://www.snopes.com/fact-check/"}
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0"
    }

    # Captures the object literal assigned to ``snopesPageData`` in an inline script.
    _PAGE_DATA_PATTERN = r'\bvar\s+snopesPageData\s*=\s*(\{.*?\})\s*;\s*\n'
    # Keys copied one-to-one from the page data into the item, when present.
    _PAGE_DATA_KEYS = ("url", "date_published", "rating", "author_name", "category", "tags")

    # Compiled once at class creation instead of on every call. Raw strings
    # avoid the invalid-escape warnings that '\s' triggers in normal literals.
    _SCRIPT_RE = re.compile(r'<script[^>]*>[\s\S]*?</script>')
    _FIGCAPTION_RE = re.compile(r'<figcaption[^>]*>[\s\S]*?</figcaption>')
    _IFRAME_RE = re.compile(r'<iframe[^>]*>[\s\S]*?</iframe>')
    _ANY_TAG_RE = re.compile(r'<.*?>')

    def parse(self, response):
        """Handle one listing page.

        Yields a request (callback ``parse_fact_details``) for every article
        link not seen before, followed by a request for the next listing page
        (callback ``parse``) if the page has a ``.btn-next`` link that was not
        scheduled yet.
        """
        article_links = response.css(
            "body > div.theme-content > div > div > main > div > div.media-list > article.media-wrapper > a::attr(href)"
        ).extract()
        for link in article_links:
            link = response.urljoin(link)
            if link not in self.url_set:
                self.url_set.add(link)
                yield scrapy.Request(
                    url=link, headers=self.headers, callback=self.parse_fact_details
                )

        next_page = response.css(".btn-next::attr(href)").extract_first()
        if next_page:
            # The href may be relative; resolve it like the article links above.
            next_page = response.urljoin(next_page)
            if next_page not in self.url_set:
                self.url_set.add(next_page)
                yield scrapy.Request(
                    url=next_page, headers=self.headers, callback=self.parse
                )

    def parse_fact_details(self, response):
        """Handle one article page.

        First yields requests for unseen fact-check links inside the article
        body, then yields a single ``FactCheckItem`` holding the title, the
        metadata found in the page's ``snopesPageData`` script, the claim and
        the tag-free article text. Fields whose source is missing on the page
        are left unset (``claim`` and ``content`` default to ``''``).
        """
        body_links = response.css("div.content:nth-child(2) a::attr(href)").extract()
        for link in body_links:
            if link.startswith("https://www.snopes.com/fact-check/") and link not in self.url_set:
                self.url_set.add(link)
                yield scrapy.Request(
                    url=link, headers=self.headers, callback=self.parse_fact_details
                )

        claim = response.css(".claim > p:nth-child(1) ::text").extract_first(default='')

        content_html = response.css("div.content:nth-child(2)").extract_first()
        content = self.remove_html_tags(content_html) if content_html else ''

        page_data = self._load_page_data(response)
        title = response.css("h1.title::text").extract_first()

        item = FactCheckItem()
        if title:
            item["title"] = title
        for key in self._PAGE_DATA_KEYS:
            if key in page_data:
                item[key] = page_data[key]
        item["claim"] = claim
        item["content"] = content
        yield item

    def _load_page_data(self, response):
        """Return the page's ``snopesPageData`` as a dict, or ``{}`` if it is absent or not valid JSON."""
        raw = response.css("script::text").re_first(self._PAGE_DATA_PATTERN)
        if raw is None:
            # json.loads(None) would raise TypeError and lose the whole item.
            self.logger.warning("snopesPageData not found on %s", response.url)
            return {}
        try:
            return json.loads(raw)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            self.logger.warning("snopesPageData on %s is not valid JSON", response.url)
            return {}

    def remove_html_tags(self, text):
        """Return ``text`` stripped of HTML markup.

        ``<script>``, ``<figcaption>`` and ``<iframe>`` elements are dropped
        together with their content; runs of newlines and of non-breaking
        spaces each become a single space, tabs are deleted, and finally every
        remaining tag is removed. An empty or ``None`` input yields ``''``.
        """
        if not text:
            return ''

        # Elements whose content must not end up in the text.
        text = self._SCRIPT_RE.sub('', text)
        text = self._FIGCAPTION_RE.sub('', text)
        text = self._IFRAME_RE.sub('', text)
        # Newlines are collapsed before the generic tag pattern runs, because
        # '.' in that pattern does not match a newline inside a tag.
        text = re.sub("[\n]+", " ", text)
        text = re.sub("[\t]+", "", text)
        # \xa0 is the non-breaking space (chr(160)); replace it with a plain space.
        text = re.sub("[\xa0]+", " ", text)
        return self._ANY_TAG_RE.sub('', text)


    # def get_content(self, response):
    #     body = response.css("div.content:nth-child(2)").extract()
    #     links_in_content = response.css(
    #         "div.content:nth-child(2) a::attr(href)"
    #     ).extract()
    #     content = ""
    #     for link in links_in_content:
    #         if link.startswith("https://www.snopes.com/fact-check/") and (
    #                 link not in self.url_set
    #         ):
    #             self.url_set.add(link)
    #             yield scrapy.Request(
    #                 url=link, headers=FactchecksSpider.headers, callback=self.parse
    #             )
    #
    #     claim = response.css(".claim > p:nth-child(1) ::text").extract()
    #
    #     if claim[0]:
    #         content = claim[0]
    #     content  += self.get_content_of_p_tag(response)
    #     return content
    #
    # def get_content_of_p_tag(self, response):
    #     counter_p_tags = 1
    #     number_of_p_tags = len(response.xpath('/html/body/div[4]/div/div/main/article/div[7]/div[1]/p'.format(counter_p_tags)).extract())
    #     content = ''
    #     p_body = response.xpath('/html/body/div[4]/div/div/main/article/div[7]/div[1]/p[{}]/text()'.format(counter_p_tags)).extract()
    #     p_text_with_tag = response.xpath('/html/body/div[4]/div/div/main/article/div[7]/div[1]/p[{}]'.format(counter_p_tags)).extract()
    #     while counter_p_tags <= number_of_p_tags:
    #         all_p_text = ""
    #         p_str = ""
    #         if p_body:
    #             selector_list = Selector(text=p_text_with_tag[0]).xpath('//a/text()').extract()  # get all text of a tags in tag p
    #             length = len(selector_list)
    #             selector_list_index = 0
    #             for p in p_body:
    #                 p_str += p
    #                 if selector_list_index < length:
    #                     p_str += selector_list[selector_list_index]
    #                     selector_list_index += 1
    #             all_p_text = p_str
    #         content += 'p[{}]'.format(counter_p_tags) + all_p_text
    #         counter_p_tags += 1
    #         p_body = response.xpath('/html/body/div[4]/div/div/main/article/div[7]/div[1]/p[{}]/text()'.format(counter_p_tags)).extract()
    #         p_text_with_tag = response.xpath('/html/body/div[4]/div/div/main/article/div[7]/div[1]/p[{}]'.format(counter_p_tags)).extract()
    #
    #     return content




In [None]:
# Move the notebook's working directory up one level.
%cd ..

## **Run the Spider**

In [None]:
# The expected path is a comment now; it used to be passed to pwd as arguments.
!pwd # it should be /content/drive/MyDrive/exercise4/snopes/snopes

In [None]:
# Change into the project directory from which the crawl in the next cell is started.
%cd /content/drive/MyDrive/exercise4/snopes/snopes

In [None]:
# The argument must match the spider's `name` attribute ("FactChecks") exactly;
# spider names are case-sensitive.
!scrapy crawl FactChecks -o result.csv

In [None]:
from google.colab import files

# Download the CSV that the crawl cell wrote into the current directory.
RESULT_CSV = './result.csv'
files.download(RESULT_CSV)