# Webscraping for bike models from Bikez.com.

### Import necessary dependencies.

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

### Create a DataFrame with Model Year and URL.

In [2]:
# Empty accumulator table: one row per bike model, filled in as each year's
# listing page is scraped.
models = pd.DataFrame(
    columns=['Model', 'Year', 'URL'],
)

### Get the starting URL. This is a list of years that links to the bike models for each year.

In [3]:
# Download and parse the index page that lists every model year.
starting_url = 'https://bikez.com/years/index.php'
# Without a timeout requests waits indefinitely on a stalled connection, and
# without the status check an HTTP error page would be parsed as if it were
# the real listing (yielding an empty result instead of a visible failure).
starting_request = requests.get(starting_url, timeout=30)
starting_request.raise_for_status()
starting_soup = BeautifulSoup(starting_request.text, 'html.parser')

### Get the URLs for all the years.

In [4]:
# Collect the <td> cells carrying the alternating "even" / "odd" CSS classes
# from the index page. The combined list keeps every even-class cell first,
# followed by every odd-class cell.
even_years = starting_soup.find_all('td', attrs={'class': 'even'})
odd_years = starting_soup.find_all('td', attrs={'class': 'odd'})
all_years = [*even_years, *odd_years]

### For each year do a request and get the data about the models in each year.

In [5]:
def scrape_year(year_url, timeout=30):
    """Scrape one year's listing page into a table of bike models.

    Args:
        year_url: Absolute URL of the page listing the models of one year.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A DataFrame with the columns 'Model', 'Year' and 'URL', one row per
        model link found. 'Year' is always the placeholder 0; the caller is
        expected to fill in the real year.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    year_request = requests.get(year_url, timeout=timeout)
    # Fail loudly instead of parsing an error page as an empty listing.
    year_request.raise_for_status()
    year_soup = BeautifulSoup(year_request.text, 'html.parser')

    # Table rows alternate between the "even" and "odd" CSS classes; keep the
    # even rows first, then the odd rows.
    all_entries = (year_soup.find_all('tr', {'class': 'even'})
                   + year_soup.find_all('tr', {'class': 'odd'}))

    rows = []
    for entry in all_entries:
        # len() of a tag is the number of its direct child nodes; the row
        # layout decides which cell holds the model link.
        child_count = len(entry)
        if child_count in (1, 3):
            cell = entry.td
        elif child_count == 2:
            cells = entry.find_all('td')
            last_td = cells[-1] if cells else None
            # Only a last cell that spans columns carries the model link.
            if last_td is not None and last_td.has_attr('colspan'):
                cell = last_td
            else:
                cell = None
        else:
            cell = None

        # Skip rows that do not match a known layout or have no link, rather
        # than re-appending the previous row's model and URL.
        if cell is None or cell.a is None:
            continue

        model = cell.a.text
        if model == '':
            continue
        # Links are relative ('../...'); keep the part after '..' and anchor
        # it at the site root.
        url = 'https://bikez.com' + cell.a['href'].split('..')[1]
        rows.append([model, 0, url])

    # Build the frame once instead of growing it row by row.
    return pd.DataFrame(rows, columns=['Model', 'Year', 'URL'])

In [6]:
# Scrape every year's listing page and stack the per-year tables onto
# `models`. DataFrame.append was removed in pandas 2.0, so the frames are
# gathered first and joined with a single pd.concat.
year_frames = []
for year in all_years:
    # Year links are relative ('../...'); anchor them at the site root.
    year_url = 'https://bikez.com' + year.a['href'].split('..')[1]
    year_models = scrape_year(year_url)
    # The link text is the year itself; stamp it on every row of that page.
    year_models['Year'] = int(year.a.text)
    year_frames.append(year_models)

# Leave empty frames out so they cannot influence the resulting column dtypes.
# Row labels are kept as they are, matching what DataFrame.append did.
non_empty_frames = [frame for frame in [models, *year_frames] if not frame.empty]
if non_empty_frames:
    models = pd.concat(non_empty_frames)

### Preview the DataFrame, then save it to a .csv file.

In [7]:
# Bare expression at the end of the cell: the notebook renders the
# accumulated DataFrame as the cell's output.
models

Unnamed: 0,Model,Year,URL
0,Adiva AD3 400,2019,https://bikez.com/motorcycles/adiva_ad3_400_20...
1,Adly GTA-150,2019,https://bikez.com/motorcycles/adly_gta-150_201...
2,Aeon 3D-350 R,2019,https://bikez.com/motorcycles/aeon_3d-350_r_20...
3,Aeon Cobra 220,2019,https://bikez.com/motorcycles/aeon_cobra_220_2...
4,Aeon Cobra 50,2019,https://bikez.com/motorcycles/aeon_cobra_50_20...
...,...,...,...
1,Ariel De Dion Tricycle,1898,https://bikez.com/motorcycles/ariel_de_dion_tr...
0,Hildebrand-Wolfmüller Motorrad,1896,https://bikez.com/motorcycles/hildebrand-wolfm...
1,Excelsior Motor Bicycle,1896,https://bikez.com/motorcycles/excelsior_motor_...
2,Marks Motor Bike,1896,https://bikez.com/motorcycles/marks_motor_bike...


In [8]:
# Write the collected rows to models.csv; index=False leaves the row labels
# out of the file.
models.to_csv(path_or_buf='models.csv', index=False)

### Next up, scraping the data from every URL's page.