This notebook scrapes battle-rapper data from VerseTracker into a pandas DataFrame, saves the DataFrame as a CSV file, and then loads that CSV into a Google BigQuery table.

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
import time
from google.cloud import bigquery
import pandas_gbq
import os 

In [2]:
# Scrape the paginated VerseTracker rapper listing and, for each rapper card,
# that rapper's profile page. Results are collected column-wise in
# battle_rappers_dict (one list per column).
#
# Every rapper contributes exactly one value to EVERY column, so the lists stay
# the same length and row i always describes the same rapper. Values that could
# not be scraped are stored as the "N/A" placeholder that the cleaning cells
# further down already know how to handle.
index = 0
battle_rappers_dict = {"Battle Rapper": [], "Hometown": [], "Bio": [], "Total Battles": [], "Total Views": [], "Average Views": []}
prepend_url = 'https://versetracker.com'

while True:
    print("Page Extracted:", index)

    # A timeout keeps one stalled connection from hanging the notebook forever.
    response = requests.get(
        f"https://versetracker.com/all-rappers?name_1=&field_country_tid=21227&field_gender_value=&field_speaks_english_value=&sort_by=field_total_views_value&page={index}",
        timeout=30,
    )
    print("Response code:", response)

    # Do not mistake a server error for the end of the listing.
    if not response.ok:
        print(f"Stopping: page {index} returned HTTP {response.status_code}; results may be incomplete.")
        break

    soup = BeautifulSoup(response.text, 'html.parser')
    extractedRows = soup.find_all('div', class_="views-row")

    # A page with no rows is treated as the end of the pagination.
    if not extractedRows:
        print("No more pages. Exiting.")
        break

    for row in extractedRows:
        rapperColumns = row.find_all('div', class_='views-column')

        for col in rapperColumns:
            name_tag = col.find('div', class_="rapper-grid-rapper-name")

            # Cards without a name element are skipped entirely.
            if not name_tag:
                continue

            name = name_tag.text.strip()

            # Start from placeholders; only overwrite what is actually scraped.
            record = {
                "Battle Rapper": name,
                "Hometown": "N/A",
                "Bio": "N/A",
                "Total Battles": "N/A",
                "Total Views": "N/A",
                "Average Views": "N/A",
            }

            try:
                # Resolved inside the try so a card without a usable link is
                # reported for that rapper instead of aborting the whole scrape.
                link = col.find('a').get('href')
                full_url = prepend_url + link

                profile_html = requests.get(full_url, timeout=30).text
                br_soup = BeautifulSoup(profile_html, 'html.parser')

                hometown = br_soup.find('div', class_="rapper-location-info")
                bio = br_soup.find('div', class_="rapper-bio")
                stats = br_soup.find('div', class_="rapper-stat-block")

                # A fetched profile that lacks these elements yields "".
                record["Hometown"] = hometown.text.strip() if hometown else ""
                record["Bio"] = bio.text.strip() if bio else ""

                if stats:
                    # The three figures are read from the first stat block and
                    # its next two siblings. All three are extracted before any
                    # is stored, so a failure leaves all three as "N/A".
                    total_views = stats.find("span", class_="num-font").text
                    total_battles = stats.next_sibling.find('span', class_="num-font").text
                    average_views = stats.next_sibling.next_sibling.find('span', class_="num-font").text

                    record["Total Views"] = total_views
                    record["Total Battles"] = total_battles
                    record["Average Views"] = average_views

            except Exception as e:
                # Best effort: keep the rapper, with placeholders for whatever
                # could not be read.
                print(f"Error scraping {name}: {e}")

            # Append to all columns together so the lists never drift apart.
            for column_name, value in record.items():
                battle_rappers_dict[column_name].append(value)

    index += 1

Page Extracted: 0
Response code: <Response [200]>
Page Extracted: 1
Response code: <Response [200]>
Page Extracted: 2
Response code: <Response [200]>
Page Extracted: 3
Response code: <Response [200]>
Page Extracted: 4
Response code: <Response [200]>
Page Extracted: 5
Response code: <Response [200]>
Page Extracted: 6
Response code: <Response [200]>
Page Extracted: 7
Response code: <Response [200]>
Page Extracted: 8
Response code: <Response [200]>
Page Extracted: 9
Response code: <Response [200]>
Page Extracted: 10
Response code: <Response [200]>
Page Extracted: 11
Response code: <Response [200]>
Page Extracted: 12
Response code: <Response [200]>
Page Extracted: 13
Response code: <Response [200]>
Page Extracted: 14
Response code: <Response [200]>
Page Extracted: 15
Response code: <Response [200]>
Page Extracted: 16
Response code: <Response [200]>
Page Extracted: 17
Response code: <Response [200]>
Page Extracted: 18
Response code: <Response [200]>
Page Extracted: 19
Response code: <Respon

In [3]:
# Create the BigQuery client used by the load cell at the end of the notebook.
# The service-account key path is otherwise only configured in a later cell, so
# on a fresh kernel (Restart & Run All) this client would be constructed before
# that variable exists. setdefault keeps any value already exported in the
# environment and only falls back to the local key file.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "/Users/reginaldamedee/downloads/battlerapperdb-b2f485aaaacb.json",
)
client = bigquery.Client()

In [31]:
# Top up the 'Hometown' list with "N/A" until it is as long as the name list.
# A zero or negative shortfall yields an empty list, so nothing is added.
# NOTE(review): padding goes on the END of the list, so it only lines up with
# the right rappers if the missing values belong to the last rows.
hometown_shortfall = len(battle_rappers_dict['Battle Rapper']) - len(battle_rappers_dict['Hometown'])
battle_rappers_dict['Hometown'].extend(["N/A"] * hometown_shortfall)

In [32]:
# Top up the 'Bio' list with "N/A" until it is as long as the name list.
# A zero or negative shortfall yields an empty list, so nothing is added.
# NOTE(review): padding goes on the END of the list, so it only lines up with
# the right rappers if the missing values belong to the last rows.
bio_shortfall = len(battle_rappers_dict['Battle Rapper']) - len(battle_rappers_dict['Bio'])
battle_rappers_dict['Bio'].extend(["N/A"] * bio_shortfall)

In [33]:
# Top up the 'Total Battles' list with "N/A" until it matches the name list.
# A zero or negative shortfall yields an empty list, so nothing is added.
# NOTE(review): padding goes on the END of the list, so it only lines up with
# the right rappers if the missing values belong to the last rows.
battles_shortfall = len(battle_rappers_dict['Battle Rapper']) - len(battle_rappers_dict['Total Battles'])
battle_rappers_dict['Total Battles'].extend(["N/A"] * battles_shortfall)

In [34]:
# Top up the 'Total Views' list with "N/A" until it matches the name list.
# A zero or negative shortfall yields an empty list, so nothing is added.
# NOTE(review): padding goes on the END of the list, so it only lines up with
# the right rappers if the missing values belong to the last rows.
views_shortfall = len(battle_rappers_dict['Battle Rapper']) - len(battle_rappers_dict['Total Views'])
battle_rappers_dict['Total Views'].extend(["N/A"] * views_shortfall)

In [35]:
# Top up the 'Average Views' list with "N/A" until it matches the name list.
# A zero or negative shortfall yields an empty list, so nothing is added.
# NOTE(review): padding goes on the END of the list, so it only lines up with
# the right rappers if the missing values belong to the last rows.
average_shortfall = len(battle_rappers_dict['Battle Rapper']) - len(battle_rappers_dict['Average Views'])
battle_rappers_dict['Average Views'].extend(["N/A"] * average_shortfall)

In [48]:
# Replace every "N/A" placeholder in 'Total Battles' with the integer 0.
# Slice assignment rewrites the existing list object in place, so the
# dictionary keeps pointing at the same list.
battle_counts = battle_rappers_dict['Total Battles']
battle_counts[:] = [0 if entry == 'N/A' else entry for entry in battle_counts]

In [49]:
# Replace every "N/A" placeholder in 'Total Views' with the integer 0.
# Slice assignment rewrites the existing list object in place, so the
# dictionary keeps pointing at the same list.
view_counts = battle_rappers_dict['Total Views']
view_counts[:] = [0 if entry == 'N/A' else entry for entry in view_counts]

In [50]:
# Replace every "N/A" placeholder in 'Average Views' with the integer 0.
# Slice assignment rewrites the existing list object in place, so the
# dictionary keeps pointing at the same list.
average_counts = battle_rappers_dict['Average Views']
average_counts[:] = [0 if entry == 'N/A' else entry for entry in average_counts]

In [55]:
# Convert the three stat columns to integers.
#
# In notebook order `df` is not created until a later cell, so build it here
# from the scraped lists; otherwise this cell raises NameError on a fresh
# kernel. The column lists must already be the same length at this point.
df = pd.DataFrame(battle_rappers_dict)

for stat_column in ('Total Views', 'Average Views', 'Total Battles'):
    # Remove commas BEFORE parsing: with errors='coerce' a value such as
    # "1,234" is not numeric, becomes NaN, and would be silently stored as 0.
    comma_free = df[stat_column].astype(str).str.replace(',', '', regex=False)
    # Anything still unparseable (e.g. the "N/A" placeholder) becomes 0.
    df[stat_column] = pd.to_numeric(comma_free, errors='coerce').fillna(0).astype(int)

In [69]:
# Assemble the scraped lists into a DataFrame, one column per dictionary key.
df = pd.DataFrame(battle_rappers_dict)

# The three stat columns get the same cleanup: cast everything to string,
# remove commas, map the "N/A" placeholder to "0", then cast to int.
for stat_column in ('Total Views', 'Total Battles', 'Average Views'):
    as_text = df[stat_column].astype(str)
    without_commas = as_text.str.replace(',', '')
    placeholders_zeroed = without_commas.replace('N/A', '0')
    df[stat_column] = placeholders_zeroed.astype(int)

# Write the cleaned table to disk without the index column.
df.to_csv("cleaned_battle_rappers_complete.csv", index=False)

In [70]:
# Point the Google client libraries at a service-account key file.
# setdefault respects a GOOGLE_APPLICATION_CREDENTIALS value that is already
# exported in the environment, and only falls back to the machine-specific
# path below when none is set. `json` holds whichever path ends up in effect.
# NOTE(review): the fallback is an absolute path on one user's machine; anyone
# else must export the variable before starting Jupyter.
json = os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "/Users/reginaldamedee/downloads/battlerapperdb-b2f485aaaacb.json",
)

In [71]:
# Load the cleaned CSV into BigQuery and report the resulting table size.
project_id = 'battlerapperdb'
# Fully qualified destination, project.dataset.table; the value is unchanged,
# it is just built from project_id instead of repeating the literal.
table_name = f"{project_id}.battlerapperdb.us_based_battle_rappers"

job_config = bigquery.LoadJobConfig(
    source_format=bigquery.SourceFormat.CSV,
    skip_leading_rows=1,  # the first CSV line is the header written by to_csv
    autodetect=True,
    # Replace the table contents on every run. Without this, load jobs append,
    # so re-running the notebook would duplicate every row.
    # Remove this line if appending is actually intended.
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
)

with open('cleaned_battle_rappers_complete.csv', "rb") as source_file:
    job = client.load_table_from_file(source_file, table_name, job_config=job_config)

# Block until the load job finishes; raises if the job failed.
job.result()

# Re-fetch the table metadata to report what is now stored.
table = client.get_table(table_name)

print(
    "Loaded {} rows and {} columns to {}".format(
        table.num_rows, len(table.schema), table_name
    )
)


Loaded 6285 rows and 6 columns to battlerapperdb.battlerapperdb.us_based_battle_rappers
