Problem 1

In [12]:
import requests
from scrapy.http import TextResponse
import time
import pandas as pd
import numpy as np

In [33]:
def book_title_scraper(response):
    """Return the title of every product listed in ``response``.

    Titles are read from the ``title`` attribute of each product's heading
    link, not from the link text.
    """
    title_selector = "article[class='product_pod'] h3 a::attr(title)"
    return response.css(title_selector).extract()

In [34]:
def book_rating_scraper(response):
    """Return the star-rating word of every product listed in ``response``.

    Each ``article`` with class ``product_pod`` is probed for a
    ``<p class="star-rating X">`` child, where X is tried in the order
    One, Two, Three, Four, Five.

    Returns:
        list: one entry per product, in page order. An entry is the matched
        word, or ``None`` when the product has no recognised rating element,
        so that a missing rating is never reported as "Five".
    """
    rating_words = ("One", "Two", "Three", "Four", "Five")
    ratings = []
    for product in response.css("article[class='product_pod']"):
        rating = None
        for word in rating_words:
            # Exact match on the whole class attribute; stop at the first hit
            # so the remaining selectors are not evaluated.
            if product.css(f"p[class='star-rating {word}']"):
                rating = word
                break
        ratings.append(rating)
    return ratings

In [35]:
def book_price_scraper(response):
    """Return the price of every product listed in ``response`` as floats.

    The price text starts with a pound sign. Depending on how the page bytes
    were decoded before reaching this function, that sign arrives either as
    "£" or as the two-character mojibake "Â£" (the UTF-8 bytes of "£" read
    as Latin-1). Both forms are stripped before conversion.

    Returns:
        list[float]: one price per ``<p class="price_color">`` text node.

    Raises:
        ValueError: if the remaining text is not a valid number.
    """
    price_texts = response.css("p[class='price_color']::text").extract()
    # lstrip removes any leading run of the listed characters, so it covers
    # both "£51.77" and "Â£51.77".
    return [float(text.strip().lstrip("Â£")) for text in price_texts]

In [36]:
def book_url_scraper(response):
    """Return the detail-page URL of every product listed in ``response``.

    Each product link's ``href`` is appended verbatim to the fixed
    catalogue prefix.
    """
    catalogue_prefix = "http://books.toscrape.com/catalogue/"
    hrefs = response.css("article[class='product_pod'] h3 a::attr(href)").extract()
    absolute_urls = []
    for href in hrefs:
        absolute_urls.append(catalogue_prefix + href)
    return absolute_urls

In [37]:
def book_image_url_scraper(response):
    """Return the absolute cover-image URL of every product in ``response``.

    Each image ``src`` is resolved against the site root with ``urljoin``
    rather than by string concatenation. A relative prefix such as ``../``
    is therefore normalised away instead of ending up inside the URL
    (``http://books.toscrape.com/../media/...``), and a ``src`` that is
    already absolute is returned unchanged.
    """
    # Function-scope import so this cell does not depend on an edit to the
    # notebook's import cell.
    from urllib.parse import urljoin

    site_root = "http://books.toscrape.com/"
    image_srcs = response.css("div[class='image_container'] a img::attr(src)").extract()
    return [urljoin(site_root, src) for src in image_srcs]

In [38]:
def in_stock_scraper(response):
    """Return the availability label of every product listed in ``response``.

    Newlines are removed from each text node of
    ``<p class="instock availability">``, surrounding whitespace is trimmed,
    and nodes that end up empty are discarded.
    """
    raw_texts = response.css("p[class='instock availability']::text").extract()
    cleaned = (text.replace("\n", "").strip() for text in raw_texts)
    return [label for label in cleaned if label]

In [42]:
def description_scraper(response):
    """Return the product description text from a detail page.

    The description is the first text node among the ``<p>`` siblings that
    follow the ``div`` whose id is ``product_description``. An empty string
    is returned when nothing matches.
    """
    matches = response.css("div[id='product_description'] ~ p::text").extract()
    return matches[0] if matches else ""

In [40]:
def genre_scraper(response):
    """Return the genre shown in a detail page's breadcrumb.

    The genre is taken to be the text of the third breadcrumb link
    (index 2).

    Returns:
        str: the link text, or "" when the breadcrumb has fewer than three
        links. Returning "" (as ``description_scraper`` does for a missing
        description) keeps one malformed page from aborting a long crawl
        with an ``IndexError``.
    """
    breadcrumb_links = response.css("ul[class='breadcrumb'] li a::text").extract()
    genre_position = 2
    if len(breadcrumb_links) <= genre_position:
        return ""
    return breadcrumb_links[genre_position]

In [44]:
# Crawl catalogue pages 1 through 50. For every book listed on a page, also
# fetch its detail page to collect the description and genre. One DataFrame
# is built per catalogue page and appended to all_pages.
all_pages = []
all_book_prices = []  # never filled in this cell; kept so the name stays defined
for page_number in range(1, 51):
    time.sleep(1)  # pause before each catalogue-page request
    descriptions = []
    genres = []
    url_1 = f"http://books.toscrape.com/catalogue/page-{page_number}.html"
    # requests has no default timeout; without one a stalled connection
    # would hang the cell indefinitely.
    page_1 = requests.get(url_1, timeout=30)
    page_1.raise_for_status()  # fail loudly instead of parsing an error page
    response_1 = TextResponse(body=page_1.text, url=url_1, encoding="utf-8")
    book_titles = book_title_scraper(response_1)
    book_ratings = book_rating_scraper(response_1)
    book_prices = book_price_scraper(response_1)
    book_urls = book_url_scraper(response_1)
    book_image_urls = book_image_url_scraper(response_1)
    book_in_stock = in_stock_scraper(response_1)
    # A distinct loop variable: the original reused `i` here, shadowing the
    # page counter of the outer loop.
    for book_url in book_urls:
        time.sleep(1)  # pause before each detail-page request
        url_2 = book_url
        page_2 = requests.get(url_2, timeout=30)
        page_2.raise_for_status()
        response_2 = TextResponse(body=page_2.text, url=url_2, encoding="utf-8")
        descriptions.append(description_scraper(response_2))
        genres.append(genre_scraper(response_2))
    all_pages.append(
        pd.DataFrame(
            {
                "Title": book_titles,
                "Rating": book_ratings,
                "Price": book_prices,
                "Book Url": book_urls,
                "Image Url": book_image_urls,
                "In Stock": book_in_stock,
                "Description": descriptions,
                "Genre": genres,
            }
        )
    )
    # The CSV is rewritten after every page, so the rows gathered so far are
    # on disk if a later request fails. ignore_index gives the combined
    # frame one continuous index instead of a repeated per-page index.
    df = pd.concat(all_pages, ignore_index=True)
    df.to_csv('Books.csv', index=False)

    

In [48]:
# Mean price over all scraped books.
df["Price"].mean()

35.07034999999999

In [49]:
# Mean price per genre, most expensive first.
# Only "Price" is selected because the other columns hold text: pandas >= 2.0
# raises a TypeError when asked to average them, whereas older versions
# silently dropped them.
# At scrape time the most expensive genre was Suspense.
df.groupby("Genre")[["Price"]].mean().sort_values(by="Price", ascending=False)

Unnamed: 0_level_0,Price
Genre,Unnamed: 1_level_1
Suspense,58.33
Novels,54.81
Politics,53.613333
Health,51.4525
New Adult,46.383333
Christian,42.496667
Sports and Games,41.166
Self Help,40.62
Travel,39.794545
Fantasy,39.593958


In [50]:
# Mean price per rating. Only "Price" is selected because the remaining
# columns hold text, which pandas >= 2.0 refuses to average (TypeError).
# Higher-rated books are slightly more expensive, but the difference is small.
df.groupby("Rating")[["Price"]].mean()

Unnamed: 0_level_0,Price
Rating,Unnamed: 1_level_1
Five,35.37449
Four,36.093296
One,34.561195
Three,34.69202
Two,34.810918


Problem 2

In [60]:
def vacancy_name_scraper(response):
    """Return the text of every ``<p class="font_bold">`` element in ``response``.

    The calling cell stores these strings in the "Vacancy" column.
    """
    name_selector = "p[class='font_bold']::text"
    return response.css(name_selector).extract()

In [62]:
def company_name_scraper(response):
    """Return the text of every ``<p class="job_list_company_title">`` element.

    The calling cell stores these strings in the "Company" column.
    """
    company_selector = "p[class='job_list_company_title']::text"
    return response.css(company_selector).extract()

In [82]:
def posting_deadline_scraper(response):
    """Return the deadline text of every job listed in ``response``.

    Text nodes are taken from the first ``<p>`` child inside each
    ``div`` with class ``job-inner job-list-deadline``. Newlines become
    spaces, surrounding whitespace is trimmed, and empty results are
    dropped.
    """
    deadline_selector = "div[class='job-inner job-list-deadline'] p:first-child::text"
    fragments = response.css(deadline_selector).extract()
    normalised = (fragment.replace("\n", " ").strip() for fragment in fragments)
    return list(filter(None, normalised))

In [105]:
def location_scraper(response):
    """Return the location text of every job listed in ``response``.

    Newlines are removed from each text node of ``<p class="job_location">``.
    A node is kept only if more than two characters remain. Other whitespace
    is left untouched, so returned values may carry leading or trailing
    spaces.
    """
    fragments = response.css("p[class='job_location']::text").extract()
    without_newlines = [fragment.replace("\n", "") for fragment in fragments]
    return [text for text in without_newlines if len(text) > 2]

In [95]:
def page_url_scraper(response):
    """Return the absolute URL behind every job button in ``response``.

    The ``href`` of each matching ``<a>`` is appended verbatim to the site
    root ``https://staff.am``.
    """
    site_root = "https://staff.am"
    link_selector = "a[class='load-more btn width100 job_load_more radius_changes']::attr(href)"
    hrefs = response.css(link_selector).extract()
    absolute_urls = []
    for href in hrefs:
        absolute_urls.append(site_root + href)
    return absolute_urls

In [108]:
# Walk the paginated job listing: scrape the current page, then follow the
# "next" link until a page has none. One DataFrame is built per page.
# Unlike the book crawl, there is no delay between requests here.
url_3 = "https://staff.am/en/jobs"
base_url_3 = "https://staff.am"
all_pages = []
while True:
    # requests has no default timeout; without one a stalled connection
    # would hang the cell indefinitely.
    page_3 = requests.get(url_3, timeout=30)
    page_3.raise_for_status()  # fail loudly instead of parsing an error page
    response_3 = TextResponse(body=page_3.text, url=url_3, encoding="utf-8")
    vacancy_names = vacancy_name_scraper(response_3)
    company_names = company_name_scraper(response_3)
    deadlines = posting_deadline_scraper(response_3)
    locations = location_scraper(response_3)
    page_urls = page_url_scraper(response_3)

    all_pages.append(
        pd.DataFrame(
            {
                "Vacancy": vacancy_names,
                "Company": company_names,
                "Deadline": deadlines,
                "Location": locations,
                "Page Url": page_urls,
            }
        )
    )
    # Named next_links, not next, so the builtin next() is not shadowed for
    # the rest of the notebook session.
    next_links = response_3.css("li[class='next'] a::attr(href)").extract()
    if not next_links:
        break
    url_3 = base_url_3 + next_links[0]
# Single concatenation after the crawl; ignore_index gives the combined frame
# one continuous index instead of a repeated per-page index.
df = pd.concat(all_pages, ignore_index=True)

In [112]:
# Number of postings per company, largest first.
# At scrape time the top company was Digitain, with 32 posted jobs.
(
    df.groupby("Company")
    .count()
    .sort_values(by="Vacancy", ascending=False)
)

Unnamed: 0_level_0,Vacancy,Deadline,Location,Page Url
Company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Digitain,32,32,32,32
SoftConstruct,29,29,29,29
PicsArt,23,23,23,23
ServiceTitan,16,16,16,16
TeamViewer Armenia,12,12,12,12
...,...,...,...,...
Devolon Armenia,1,1,1,1
Dexatel,1,1,1,1
Rocket Systems,1,1,1,1
Renderforest,1,1,1,1


In [115]:
# Count vacancy titles containing the keyword, capitalised form first.
for keyword in ("Data", "data"):
    print(df.Vacancy.str.contains(keyword).value_counts())

False    565
True       9
Name: Vacancy, dtype: int64
False    574
Name: Vacancy, dtype: int64


In [None]:
# No vacancy title contains lowercase "data",
# but 9 titles contain capitalised "Data" (str.contains is case-sensitive by default).