## Crawling TVRatings Webpage using Scrapy in Python

### 1. Creat a scrapy project

In [1]:
!rm -rf tvratings_crawler
!scrapy startproject tvratings_crawler

New Scrapy project 'tvratings_crawler', using template directory '/Users/choetaebyeong/anaconda3/lib/python3.7/site-packages/scrapy/templates/project', created in:
    /Users/choetaebyeong/Documents/dev/TVratings_crawling/tvratings_crawler

You can start your first spider with:
    cd tvratings_crawler
    scrapy genspider example example.com


In [2]:
!tree tvratings_crawler

[01;34mtvratings_crawler[00m
├── scrapy.cfg
└── [01;34mtvratings_crawler[00m
    ├── __init__.py
    ├── [01;34m__pycache__[00m
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── [01;34mspiders[00m
        ├── __init__.py
        └── [01;34m__pycache__[00m

4 directories, 7 files


### 2. Declare items.py

In [3]:
!cat tvratings_crawler/tvratings_crawler/items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class TvratingsCrawlerItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


In [4]:
%%writefile tvratings_crawler/tvratings_crawler/items.py

import scrapy


class TvratingsCrawlerItem(scrapy.Item):
    genre = scrapy.Field()      # Genre: types of TV programs (General, Drama, Entertainment)
    rank = scrapy.Field()       # rank: top 20 rank
    program = scrapy.Field()    # program: name of TV program
    channel = scrapy.Field()    # channel: name of broadcast channel
    rate = scrapy.Field()       # rate: TV Ratings
    pass

Overwriting tvratings_crawler/tvratings_crawler/items.py


### 3. Write Spider.py

In [29]:
%%writefile tvratings_crawler/tvratings_crawler/spiders/spider.py
import scrapy
from tvratings_crawler.items import TvratingsCrawlerItem
from datetime import date, timedelta

class Spider(scrapy.Spider):

    name = "TVratings_Crawler"
    allow_domain = ["https://search.naver.com/"]
    
    def __init__(self, month=12, day=1, **kwargs):
        self.month=month        # Use when specifying a date
        self.day=day            # Use when specifying a date
        today = date.today()
        yesterday = date.today() - timedelta(1)
        self.month=yesterday.strftime('%m')
        self.day=yesterday.strftime('%d')  # If no date is specified, it is used as yesterday's date
        super().__init__(**kwargs)
  
    def start_requests(self, **kwargs):
        month = self.month
        day = self.day.zfill(2)
        urls = [
        "https://search.naver.com/search.naver?where=nexearch&sm=tab_etc&mra=blUw&query={}%EC%9B%94{}%EC%9D%BC%20%EC%A2%85%ED%95%A9%20%EC%8B%9C%EC%B2%AD%EB%A5%A0".format(month, day),
        "https://search.naver.com/search.naver?where=nexearch&sm=tab_etc&mra=blUw&query={}%EC%9B%94{}%EC%9D%BC%20%EB%93%9C%EB%9D%BC%EB%A7%88%20%EC%8B%9C%EC%B2%AD%EB%A5%A0".format(month, day),
        "https://search.naver.com/search.naver?where=nexearch&sm=tab_etc&mra=blUw&query={}%EC%9B%94{}%EC%9D%BC%20%EC%98%88%EB%8A%A5%20%EC%8B%9C%EC%B2%AD%EB%A5%A0".format(month, day),
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        for i in range(1, 21):
            genre = response.xpath('//*[@id="main_pack"]/div[1]/div/div[2]/div/h4/text()').extract()
            rank = response.xpath('//*[@id="main_pack"]/div[1]/div/div[2]/div/div[3]/div/table/tbody/tr[{}]/td[1]/p/span/span/text()'.format(i)).extract()
            program = response.xpath('//*[@id="main_pack"]/div[1]/div/div[2]/div/div[3]/div/table/tbody/tr[{}]/td[2]/p/a/text()'.format(i)).extract()
            channel = response.xpath('//*[@id="main_pack"]/div[1]/div/div[2]/div/div[3]/div/table/tbody/tr[{}]/td[3]/p/a/text()'.format(i)).extract()
            rate = response.xpath('//*[@id="main_pack"]/div[1]/div/div[2]/div/div[3]/div/table/tbody/tr[{}]/td[4]/p/text()'.format(i)).extract()
            
            item = TvratingsCrawlerItem()
            item["genre"] = genre
            item["rank"] = rank
            item["program"] = program
            item["channel"] = channel
            item["rate"] = rate
            yield item


Overwriting tvratings_crawler/tvratings_crawler/spiders/spider.py


### 4. Change  settings.py in case forbidden by robots.txt 
    - Go to settings.py in the project folder and change ROBOTSTXT_OBEY = True 
      to ROBOTSTXT_OBEY = False.

In [6]:
!grep "ROBOTSTXT_OBEY" tvratings_crawler/tvratings_crawler/settings.py

ROBOTSTXT_OBEY = True


In [7]:
!sed -i.bak 's/ROBOTSTXT_OBEY = True/ROBOTSTXT_OBEY = False/' tvratings_crawler/tvratings_crawler/settings.py

In [8]:
!grep "ROBOTSTXT_OBEY" tvratings_crawler/tvratings_crawler/settings.py

ROBOTSTXT_OBEY = False


### 5.  Install pymongo for saving the crawling results

In [9]:
!pip list | grep pymongo

pymongo                            3.9.0     


In [91]:
pip install pymongo

Note: you may need to restart the kernel to use updated packages.


In [92]:
!python -m pip install --upgrade pymongo

Requirement already up-to-date: pymongo in /home/ubuntu/.pyenv/versions/3.6.5/envs/python3/lib/python3.6/site-packages (3.10.0)


### 6.  Create mongodb module  to connect to the MongoDB

In [10]:
%%writefile tvratings_crawler/tvratings_crawler/mongodb.py
import pymongo

client = pymongo.MongoClient('mongodb://xxx.xxx.xxx.xxx:27017/')  # MongoDB IP Address 
db = client.tvrating_crawler
collection = db.tvrating

Writing tvratings_crawler/tvratings_crawler/mongodb.py


In [30]:
from tvratings_crawler.tvratings_crawler.mongodb import collection
collection

Collection(Database(MongoClient(host=['15.164.198.126:27017'], document_class=dict, tz_aware=False, connect=True), 'tvrating_crawler'), 'tvrating')

### 7.  Write Pipelines.py to send items to the mongodb

In [12]:
%%writefile tvratings_crawler/tvratings_crawler/pipelines.py
from .mongodb import collection

class TVratings_Pipeline(object):
    
    def process_item(self, item, spider):
        
        data = {"genre": item["genre"], 
                "rank": item["rank"],  
                "channel": item["channel"], 
                "program": item["program"], 
                "rate": item["rate"], 
               }
        
        collection.insert(data)
        
        return item

Overwriting tvratings_crawler/tvratings_crawler/pipelines.py


### 8.  Add the pipeline to the ITEM_PIPELINES setting in settings.py

In [13]:
!echo "ITEM_PIPELINES = {"  >> tvratings_crawler/tvratings_crawler/settings.py
!echo "   'tvratings_crawler.pipelines.TVratings_Pipeline': 300," >> tvratings_crawler/tvratings_crawler/settings.py
!echo "}" >> tvratings_crawler/tvratings_crawler/settings.py
!tail -n 5 tvratings_crawler/tvratings_crawler/settings.py


#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
ITEM_PIPELINES = {
   'tvratings_crawler.pipelines.TVratings_Pipeline': 300,
}


### 9.  Run TVratings_Crawler

In [27]:
%%writefile run_tvratings_crawler.sh
cd ~/Documents/dev/TVratings_crawling/
rm -rf tvratings_crawler/tvratings_crawler.csv
cd tvratings_crawler/
scrapy crawl tvratings_crawler -o tvratings_crawler.csv 

# How to crawl by specifying a datev (for example, December 6)
# - "scrapy crawl TVrating_Crawler -o tvrating_crawler.csv -a month=12 -a day=6"


Overwriting run_tvratings_crawler.sh


In [31]:
!chmod +x run_tvratings_crawler.sh
!./run_tvratings_crawler.sh

2019-12-13 12:48:14 [scrapy.utils.log] INFO: Scrapy 1.8.0 started (bot: tvratings_crawler)
2019-12-13 12:48:14 [scrapy.utils.log] INFO: Versions: lxml 4.3.4.0, libxml2 2.9.9, cssselect 1.1.0, parsel 1.5.2, w3lib 1.21.0, Twisted 19.10.0, Python 3.7.3 (default, Mar 27 2019, 16:54:48) - [Clang 4.0.1 (tags/RELEASE_401/final)], pyOpenSSL 19.0.0 (OpenSSL 1.1.1d  10 Sep 2019), cryptography 2.7, Platform Darwin-16.7.0-x86_64-i386-64bit
Traceback (most recent call last):
  File "/Users/choetaebyeong/anaconda3/lib/python3.7/site-packages/scrapy/spiderloader.py", line 69, in load
    return self._spiders[spider_name]
KeyError: 'tvratings_crawler'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/choetaebyeong/anaconda3/bin/scrapy", line 10, in <module>
    sys.exit(execute())
  File "/Users/choetaebyeong/anaconda3/lib/python3.7/site-packages/scrapy/cmdline.py", line 146, in execute
    _run_print_help(parser, _run_command, cmd,

In [16]:
!ls -altr tvratings_crawler/tvratings_crawler.csv 
!cat tvratings_crawler/tvratings_crawler.csv

ls: tvratings_crawler/tvratings_crawler.csv: No such file or directory
cat: tvratings_crawler/tvratings_crawler.csv: No such file or directory


### 10.  Change the format of result using pandas

In [76]:
import pandas as pd
df = pd.read_csv("tvratings_crawler/tvratings_crawler.csv", na_values = ['N/A', 'NA', 'nan', 'NaN', 'null'])      
df_sort = df[['genre','rank','channel','program','rate']].sort_values(by=['genre','rank'], ascending=[False, True])

df_sort['rank'] = [x.replace('nan','0') for x in df_sort['rank'].astype(str)]
df_sort['rank'] = df_sort['rank'].apply(pd.to_numeric).astype({'rank': int})
df_sort['program'] = [x.replace(' ','') for x in df_sort['program'].astype(str)]

!rm -rf tvratings_crawler/tvratings_crawler_sort.csv
df_sort.to_csv("tvratings_crawler/tvratings_crawler_sort.csv", index=False, na_rep='-')
df_sort.to_csv("tvratings_crawler/tvratings_crawler_mail.csv", encoding='cp949', index=False, na_rep='-')

!cat tvratings_crawler/tvratings_crawler_sort.csv

genre,rank,channel,program,rate
종합 일간 시청률,1,KBS1,꽃길만걸어요,17.2%
종합 일간 시청률,2,SBS,VIP2부,13.2%
종합 일간 시청률,3,KBS2,우아한모녀,13.0%
종합 일간 시청률,4,KBS1,KBS뉴스9,10.9%
종합 일간 시청률,5,KBS1,인간극장,10.2%
종합 일간 시청률,6,SBS,VIP1부,10.0%
종합 일간 시청률,7,KBS1,아침마당,9.8%
종합 일간 시청률,8,KBS1,6시내고향,8.3%
종합 일간 시청률,9,KBS1,KBS뉴스7,7.7%
종합 일간 시청률,10,KBS1,이웃집찰스,7.3%
종합 일간 시청률,11,SBS,불타는청춘1부,6.7%
종합 일간 시청률,11,SBS,본격연예한밤2부,6.7%
종합 일간 시청률,11,SBS,불타는청춘2부,6.7%
종합 일간 시청률,14,SBS,본격연예한밤1부,6.4%
종합 일간 시청률,15,KBS1,KBS930뉴스,6.1%
종합 일간 시청률,15,KBS1,KBS뉴스광장2부,6.1%
종합 일간 시청률,17,KBS2,2TV생생정보,5.9%
종합 일간 시청률,17,SBS,맛좀보실래요,5.9%
종합 일간 시청률,19,MBC,MBC뉴스데스크,4.5%
종합 일간 시청률,20,KBS1,동물의왕국,4.4%
예능 일간 시청률,1,KBS1,아침마당,9.8%
예능 일간 시청률,2,KBS1,이웃집찰스,7.3%
예능 일간 시청률,3,SBS,본격연예한밤2부,6.7%
예능 일간 시청률,3,SBS,불타는청춘1부,6.7%
예능 일간 시청률,3,SBS,불타는청춘2부,6.7%
예능 일간 시청률,6,SBS,본격연예한밤1부,6.4%
예능 일간 시청률,7,KBS1,역사저널그날,3.4%
예능 일간 시청률,8,KBS1,더라이브,2.9%
예능 일간 시청률,8,KBS2,정해인의걸어보고서2부,2.9%
예능 일간 시청률,10,MBC,편애중계2부,2.5%
예능 일간 시청률,11,MBC,편애중계1부,2.2%
예능 일간 시청률,12,SBS,좋은아침,

### 11.  Sending email with attachments after crawling the website

In [None]:
from email import encoders
from email.mime.multipart import MIMEMultipart 
from email.mime.text import MIMEText           
from email.mime.base import MIMEBase           

In [None]:
import smtplib, os, pickle

mail_addr = "from_email@gmail.com"                      # email account
pw = "PASSWORD"                                         # email account password
to_addr = ["to_email@naver.com","to_email@naver.com" ]  # recipient's email address

In [None]:
smtp = smtplib.SMTP("smtp.gmail.com", 587)
smtp.ehlo()
smtp.starttls()
smtp.login(mail_addr, pw)

In [None]:
# email subject
msg = MIMEMultipart()
msg["subject"] = "일일시청률_순위를_보내드립니다."  

In [None]:
# email body
part1 = MIMEText("로봇이 필요한 분께 자동으로 전송하여 보내드립니다.") 
msg.attach(part1)

In [None]:
# email body - link url
part2 = MIMEText("<a href='https://search.naver.com/search.naver?sm=top_hty&fbm=1&ie=utf8&query=%EC%8B%9C%EC%B2%AD%EB%A5%A0%EC%88%9C%EC%9C%84'> 일일 시청률 클릭!!!</a>", "html")
msg.attach(part2)

In [None]:
# copy attachment file
import shutil

path = "./tvrating_crawler/tvrating_crawler_sort.csv"
shutil.copy2(path, 'tvrating_crawler_sort.csv')
!ls -ltr tvrating_crawler_sort.csv

In [None]:
# copy and attache the result file
import shutil

file = "./tvratings_crawler/tvratings_crawler_sort.csv"
shutil.copy2(file, 'tvratings_crawler_mail.csv')
!ls -ltr tvratings_crawler_mail.csv

path = "tvratings_crawler_mail.csv"
with open(path, "rb") as f:
    part3 = MIMEBase("application", "octet-stream")
    part3.set_payload(f.read())
    encoders.encode_base64(part3)
    
part3.add_header("Content-Disposition", "attachment", filename=path)
msg.attach(part3)

In [None]:
# Send mail
for addr in to_addr:
    print(addr)
    msg["to"] = addr
    smtp.sendmail(mail_addr, addr, msg.as_string())