# In [12]:
# Source_1_Web_Scraping
# This script contains the code for scraping car-related data from a car website.
# A detailed description is available through the inline comments.

import re
import pandas as pd
from re import sub
from decimal import Decimal
from requests import get
from bs4 import BeautifulSoup as bs  ## importing Beautiful Soup

# Module-level accumulators: one independent, initially empty list per kind of
# scraped information, named after what it holds.
car_names_with_variant_information = list()  # full car name (make / model / variant)
car_variant_price_range = list()             # car price values
car_power = list()                           # car power texts
car_mileage = list()                         # car mileage texts
car_fueltype = list()                        # car fuel type texts


# URL of the car website (Autoportal), which houses the car related information.
# The details of cars sorted by the 'Popularity' criteria are available through this URL;
# the page number is substituted for the '{page}' placeholder.
# Note: The number of records scraped from the website is deliberately limited, because the
# result is used as input for obtaining car popularity data from Twitter, and Twitter
# restricts the number of requests within a time window.
CAR_LISTING_URL_TEMPLATE = (
    "https://autoportal.com/newcars/car-finder/2:46/2:13/2:14/2:16/2:47/2:49/2:51/2:10/2:8/"
    "2:11/2:12/2:55/2:64/2:65/2:67/2:70/2:71/2:73/2:75/2:82/5:30/8:43/page/{page}/date/"
)

# Results are obtained from the first three pages of the website (both bounds inclusive).
FIRST_PAGE_NUMBER = 1
LAST_PAGE_NUMBER = 3

# Upper bound (in seconds) on how long a single page request may wait for the server.
REQUEST_TIMEOUT_SECONDS = 30

# Additional text found along with the car price on the website.
PRICE_SUFFIX_TEXT = 'Ex-Showroom price in New Delhi'

# The car website uses 'lakh' and 'cr' string values to denote the monetary denomination.
LAKH_IN_INR = 100000
CRORE_IN_INR = 10000000
LAKH_PATTERN = re.compile(r'lakh', re.IGNORECASE)
CRORE_PATTERN = re.compile(r'Cr', re.IGNORECASE)

# First plain number (optionally with a fractional part) inside a price text.
PRICE_NUMBER_PATTERN = re.compile(r'\d+(?:\.\d*)?|\.\d+')

# The car website uses 'rpm/bhp/ps' string values to denote the car power,
# 'Petrol/Diesel/Hybrid/CNG/Electric' string values to denote the car fuel type
# and 'liter' to denote the car mileage.
POWER_PATTERN = re.compile(r'rpm|bhp|ps', re.IGNORECASE)
FUELTYPE_PATTERN = re.compile(r'Petrol|Diesel|Hybrid|CNG|Electric', re.IGNORECASE)
MILEAGE_PATTERN = re.compile(r'liter', re.IGNORECASE)

# Separator used when one car has several texts of the same kind (e.g. two power spans).
SPEC_SEPARATOR = ", "


def build_page_url(page_number):
    """Return the listing URL for the given page number."""
    return CAR_LISTING_URL_TEMPLATE.format(page=page_number)


def parse_price_in_inr(price_text):
    """Convert a price text such as '5.45 Lakh' into its exact value in INR.

    The first number found in the text is multiplied by 100000 when the text
    mentions 'lakh' and by 10000000 when it mentions 'cr' (both case-insensitive).

    Returns:
        A ``Decimal`` amount, or ``None`` when the text holds no number or no
        known denomination, so the caller can skip the car instead of crashing.
    """
    price_text_cleaned = price_text.replace(PRICE_SUFFIX_TEXT, '').strip()

    # Thousands separators are dropped so that '5,50,000' is read as one number.
    number_match = PRICE_NUMBER_PATTERN.search(price_text_cleaned.replace(',', ''))
    if number_match is None:
        return None
    amount = Decimal(number_match.group())

    if LAKH_PATTERN.search(price_text_cleaned):
        return amount * LAKH_IN_INR
    if CRORE_PATTERN.search(price_text_cleaned):
        return amount * CRORE_IN_INR
    return None


def classify_car_specs(spec_texts):
    """Sort the specification texts of ONE car into power, mileage and fuel type.

    Each text is assigned to at most one category, checked in the order
    power -> fuel type -> mileage; texts matching none of them are ignored.

    Returns:
        A ``(power, mileage, fueltype)`` tuple. Each element is the matching
        text, several matching texts joined by ``SPEC_SEPARATOR``, or ``None``
        when nothing matched. Exactly one value per category keeps the
        per-car columns aligned.
    """
    power_texts = []
    mileage_texts = []
    fueltype_texts = []

    for spec_text in spec_texts:
        if POWER_PATTERN.search(spec_text):
            power_texts.append(spec_text)
        elif FUELTYPE_PATTERN.search(spec_text):
            fueltype_texts.append(spec_text)
        elif MILEAGE_PATTERN.search(spec_text):
            mileage_texts.append(spec_text)

    def joined_or_none(texts):
        return SPEC_SEPARATOR.join(texts) if texts else None

    return (joined_or_none(power_texts),
            joined_or_none(mileage_texts),
            joined_or_none(fueltype_texts))


def parse_car_listing(car_individual):
    """Extract one car record from a 'results-variant' element.

    Returns:
        A ``(full_name, price_in_inr, power, mileage, fueltype)`` tuple, or
        ``None`` when the name link, the price element or a usable price is
        missing. Skipping the whole car keeps every column the same length.
    """
    # The full name of the car (Make/Model/Variant) is taken from the first <a> tag and
    # the price from the <div> tag whose class name is 'td price'.
    name_tag = car_individual.find("a")
    price_tag = car_individual.find("div", class_="td price")
    if name_tag is None or price_tag is None:
        return None

    price_in_inr = parse_price_in_inr(price_tag.text)
    if price_in_inr is None:
        return None

    # The specific details of the car are taken from its <span> tags.
    power, mileage, fueltype = classify_car_specs(
        span_tag.text for span_tag in car_individual.find_all("span"))

    return (name_tag.text.strip(), price_in_inr, power, mileage, fueltype)


def scrape_car_listings(first_page=FIRST_PAGE_NUMBER, last_page=LAST_PAGE_NUMBER):
    """Download the listing pages first_page..last_page (inclusive) and parse every car.

    Returns:
        A list of car record tuples as produced by ``parse_car_listing``.

    Raises:
        requests.exceptions.RequestException: when a page cannot be fetched
            (timeout, connection failure or an HTTP error status).
    """
    car_records = []
    for page_number in range(first_page, last_page + 1):
        response = get(build_page_url(page_number), timeout=REQUEST_TIMEOUT_SECONDS)
        # Fail loudly on an error page instead of silently exporting an empty file.
        response.raise_for_status()

        # Python's in-built HTML parser
        html_soup = bs(response.text, 'html.parser')

        # Every car block on the page carries the class name 'results-variant'.
        for car_individual in html_soup.find_all(class_="results-variant"):
            car_record = parse_car_listing(car_individual)
            if car_record is not None:
                car_records.append(car_record)
    return car_records


if __name__ == "__main__":
    # Storing exactly one value per car in each list, so that all lists stay aligned.
    for (scraped_name, scraped_price, scraped_power,
         scraped_mileage, scraped_fueltype) in scrape_car_listings():
        car_names_with_variant_information.append(scraped_name)
        car_variant_price_range.append(scraped_price)
        car_power.append(scraped_power)
        car_mileage.append(scraped_mileage)
        car_fueltype.append(scraped_fueltype)

    # Creating a new dataframe by name 'car_web_scrapped_df' and incorporating all above
    # mentioned list values into the same
    car_web_scrapped_df = pd.DataFrame({"Car_Fullname": car_names_with_variant_information,
                                        "Car_Price_In_INR": car_variant_price_range,
                                        "Car_Power": car_power,
                                        "Car_Mileage": car_mileage,
                                        "Car_Fueltype": car_fueltype
                                        })

    # Exporting the cleaned data (fit for conceptual data model) to a new csv file
    car_web_scrapped_df.to_csv("Source_Web_Scraping_Car_Specific_Data.csv", index=False)



""""
CONCLUSIONS:

This code scrapes data from a car website, stores it in the relevant lists, builds a data frame from those lists and exports it to a CSV file.

CONTRIBUTIONS:

RAJENDRA KUMAR RAJKUMAR wrote the code, and it was reviewed by MONISH  HIRISAVE RAGHU

CITATIONS:

1. https://www.geeksforgeeks.org
2. https://github.com/nikbearbrown/INFO_6210

LICENSE:

Copyright <2019> <RAJENDRA KUMAR RAJKUMAR, MONISH  HIRISAVE RAGHU>

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


"""
