# Web Scraping and Geotagging Project

## Setup and Imports

In [107]:
import requests
import pandas as pd
import folium
from folium.plugins import MarkerCluster
from sqlalchemy import create_engine
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim


## Web Scraping 

In [108]:
# Target URL for web scraping
url = 'https://www.scrapethissite.com/pages/simple/'

# Fetch the page. The timeout keeps the cell from hanging on a stalled
# connection, and raise_for_status() stops us from silently parsing an
# HTTP error page as if it were the country listing.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Parsing the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# Every country is searched for inside its own <div class="col-md-4 country">
divs = soup.find_all('div', class_='col-md-4 country')


def _text_or_check(element):
    """Return the stripped text of *element*, or "CHECK" if it was not found.

    "CHECK" is the placeholder used throughout this cell to flag rows that
    need manual inspection.
    """
    return element.text.strip() if element is not None else "CHECK"


# Column lists, filled in document order so they stay aligned row by row
countries = []
capitals = []
populations = []
areas = []

for country in divs:
    country_name = _text_or_check(country.find('h3', class_='country-name'))
    capital = _text_or_check(country.find('span', class_='country-capital'))

    # The population value is the element following the "Population:" label.
    # `string=` replaces the deprecated `text=` keyword of find().
    population_element = country.find('strong', string="Population:")
    population = _text_or_check(
        population_element.find_next_sibling()
        if population_element is not None
        else None
    )

    area = _text_or_check(country.find('span', class_='country-area'))

    countries.append(country_name)
    capitals.append(capital)
    populations.append(population)
    areas.append(area)

# Creating a DataFrame from the scraped data (all values are still strings)
data = {
    "Country": countries,
    "Capital": capitals,
    "Population": populations,
    "Areas": areas,
}
df = pd.DataFrame(data)

df


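DeprecationWarning: The 'text' argument to find()-type methods is deprecated. Use 'string' instead.
  population_element = country.find('strong', text="Population:")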


Unnamed: 0,Country,Capital,Population,Areas
0,Andorra,Andorra la Vella,84000,468.0
1,United Arab Emirates,Abu Dhabi,4975593,82880.0
2,Afghanistan,Kabul,29121286,647500.0
3,Antigua and Barbuda,St. John's,86754,443.0
4,Anguilla,The Valley,13254,102.0
...,...,...,...,...
245,Yemen,Sanaa,23495361,527970.0
246,Mayotte,Mamoudzou,159042,374.0
247,South Africa,Pretoria,49000000,1219912.0
248,Zambia,Lusaka,13460305,752614.0


## Geotagging 

In [109]:
# Combining city and country for geocoding
city_country = [f"{capital}, {country}" for capital, country in zip(capitals, countries)]

# Initializing geocoder
# NOTE(review): the public Nominatim service is rate limited by its usage
# policy; consider wrapping geocode() with geopy's RateLimiter - confirm.
geolocator = Nominatim(user_agent="sample")

# Latitude/longitude columns; exactly one entry is appended per location so
# the lists always line up with the DataFrame rows.
latitudes = []
longitudes = []

# Looping through city_country list for geocoding
for location in city_country:
    try:
        result = geolocator.geocode(location)
    except Exception as e:
        # Best-effort lookup: report the failure and record the row as
        # missing instead of skipping it, which would shift every later
        # coordinate onto the wrong country (or break the column assignment).
        print(f"Error geocoding {location}: {e}")
        result = None

    if result:
        latitudes.append(result.latitude)
        longitudes.append(result.longitude)
    else:
        latitudes.append(None)
        longitudes.append(None)

# Adding geotag data to the DataFrame
df['Latitude'] = latitudes
df['Longitude'] = longitudes

df


Unnamed: 0,Country,Capital,Population,Areas,Latitude,Longitude
0,Andorra,Andorra la Vella,84000,468.0,42.506939,1.521247
1,United Arab Emirates,Abu Dhabi,4975593,82880.0,24.453835,54.377401
2,Afghanistan,Kabul,29121286,647500.0,34.526011,69.177684
3,Antigua and Barbuda,St. John's,86754,443.0,17.118457,-61.844851
4,Anguilla,The Valley,13254,102.0,18.214586,-63.051776
...,...,...,...,...,...,...
245,Yemen,Sanaa,23495361,527970.0,15.353857,44.205884
246,Mayotte,Mamoudzou,159042,374.0,-12.780414,45.227976
247,South Africa,Pretoria,49000000,1219912.0,-25.745928,28.187910
248,Zambia,Lusaka,13460305,752614.0,-15.357609,29.165309


## Database Upload

In [110]:
from urllib.parse import quote

# Database connection details (placeholders - fill in before running)
db_name = "DATABASE NAME"
db_user = "DATABASE USER"
db_password = "USER PASSWORD"
db_host = "localhost"
db_port = "5432"

# Creating connection string. User and password are percent-encoded so that
# characters such as '@', ':' or '/' in the credentials cannot be mistaken
# for URL delimiters when the string is parsed.
connection_str = (
    f'postgresql://{quote(db_user, safe="")}:{quote(db_password, safe="")}'
    f'@{db_host}:{db_port}/{db_name}'
)

# Creating engine to connect to PostgreSQL
engine = create_engine(connection_str)

# Table name for database upload
table_name = 'Countries_info'

try:
    # Uploading DataFrame to PostgreSQL database.
    # if_exists='append' adds rows to an existing table, so re-running this
    # cell inserts the same rows again.
    df.to_sql(table_name, engine, if_exists='append', index=False)
finally:
    # Release pooled connections even when the upload fails
    engine.dispose()


## Data Mapping

In [111]:
# Centre the map on the mean latitude/longitude of all rows
map_centre = df[["Latitude", "Longitude"]].mean().to_list()
m = folium.Map(location=map_centre, zoom_start=2)

# Markers are grouped into a cluster layer attached to the map
marker_cluster = MarkerCluster()
marker_cluster.add_to(m)

# Only rows with both coordinates present can be placed on the map
df_filtered = df.dropna(subset=["Latitude", "Longitude"])

# One marker per capital, labelled with the capital's name
for row in df_filtered[["Latitude", "Longitude", "Capital"]].itertuples(index=False):
    marker = folium.Marker(
        location=(row.Latitude, row.Longitude),
        popup=row.Capital,
        tooltip=row.Capital,
    )
    marker.add_to(marker_cluster)

# Write the interactive map to disk, then display it as the cell output
m.save("map.html")

m