###  Web Scraping Weather Data for European Capitals and Saving to CSV

In [8]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime
import time

# List of European capitals and their URLs
european_capitals = {
    "Paris": "https://www.timeanddate.com/weather/france/paris",
    "Berlin": "https://www.timeanddate.com/weather/germany/berlin",
    "Madrid": "https://www.timeanddate.com/weather/spain/madrid",
    "Rome": "https://www.timeanddate.com/weather/italy/rome",
    "London": "https://www.timeanddate.com/weather/uk/london",
    "Amsterdam": "https://www.timeanddate.com/weather/netherlands/amsterdam",
    "Brussels": "https://www.timeanddate.com/weather/belgium/brussels",
    "Vienna": "https://www.timeanddate.com/weather/austria/vienna",
    "Copenhagen": "https://www.timeanddate.com/weather/denmark/copenhagen",
    "Athens": "https://www.timeanddate.com/weather/greece/athens",
    "Lisbon": "https://www.timeanddate.com/weather/portugal/lisbon",
    "Dublin": "https://www.timeanddate.com/weather/ireland/dublin",
    "Stockholm": "https://www.timeanddate.com/weather/sweden/stockholm",
    "Oslo": "https://www.timeanddate.com/weather/norway/oslo",
    "Helsinki": "https://www.timeanddate.com/weather/finland/helsinki",
    "Warsaw": "https://www.timeanddate.com/weather/poland/warsaw",
    "Prague": "https://www.timeanddate.com/weather/czech-republic/prague",
    "Budapest": "https://www.timeanddate.com/weather/hungary/budapest",
    "Bratislava": "https://www.timeanddate.com/weather/slovakia/bratislava",
    "Ljubljana": "https://www.timeanddate.com/weather/slovenia/ljubljana",
    "Zagreb": "https://www.timeanddate.com/weather/croatia/zagreb",
    "Sofia": "https://www.timeanddate.com/weather/bulgaria/sofia",
    "Bucharest": "https://www.timeanddate.com/weather/romania/bucharest",
    "Tallinn": "https://www.timeanddate.com/weather/estonia/tallinn",
    "Riga": "https://www.timeanddate.com/weather/latvia/riga",
    "Vilnius": "https://www.timeanddate.com/weather/lithuania/vilnius",
    "Bern": "https://www.timeanddate.com/weather/switzerland/bern",
    "Reykjavik": "https://www.timeanddate.com/weather/iceland/reykjavik",
    "Luxembourg": "https://www.timeanddate.com/weather/luxembourg/luxembourg",
    "Valletta": "https://www.timeanddate.com/weather/malta/valletta"
}

def get_weather(city, url):
    """Fetch the current temperature of a given city from Time and Date."""
    headers = {"User-Agent": "Mozilla/5.0"}

    response = requests.get(url, headers=headers)

    if response.status_code == 200:
        soup = BeautifulSoup(response.text, "html.parser")
        temperature = soup.find("div", class_="h2").text.strip()

        # Get current date
        date = datetime.now().strftime("%Y-%m-%d")

        return {"Date": date, "City": city, "Temperature": temperature}
    else:
        print(f"Failed to retrieve data for {city}")
        return None

def save_weather_data():
    """Fetch and save temperature data for all European capitals."""
    weather_data = []

    for city, url in european_capitals.items():
        data = get_weather(city, url)
        if data:
            weather_data.append(data)
        time.sleep(2)  # To avoid excessive requests to the server

    # Convert to DataFrame
    df = pd.DataFrame(weather_data)

    # Save to CSV (append if file exists)
    file_name = "europe_weatherr.csv"
    df.to_csv(file_name, mode='a', index=False, header=not pd.io.common.file_exists(file_name))

    print(f"Temperature data saved to {file_name}")

if __name__ == "__main__":
    save_weather_data()


Temperature data saved to europe_weatherr.csv


Displaying the data

In [9]:
df=pd.read_csv('europe_weatherr.csv')
df.head()

Unnamed: 0,Date,City,Temperature
0,2025-03-14,Paris,39 °F
1,2025-03-14,Berlin,46 °F
2,2025-03-14,Madrid,48 °F
3,2025-03-14,Rome,66 °F
4,2025-03-14,London,46 °F


### Installing and Setting Up Hadoop in Google Colab

In [12]:
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/hadoop/common/hadoop-3.3.6/hadoop-3.3.6.tar.gz
!tar -xvzf hadoop-3.3.6.tar.gz > /dev/null
!mv hadoop-3.3.6 hadoop



0% [Working]            Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
0% [Connecting to security.ubuntu.com (185.125.190.82)] [Connecting to cloud.r-project.org] [Connect                                                                                                    Get:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
0% [2 InRelease 8,380 B/128 kB 7%] [Connecting to security.ubuntu.com (185.125.190.82)] [Connected t                                                                                                    Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:4 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:5 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:6 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:8 https://developer.download.nvi

### Configuring Hadoop Environment Variables in Google Colab

In [13]:
import os

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["HADOOP_HOME"] = "/content/hadoop"
os.environ["PATH"] += os.pathsep + "/content/hadoop/bin" + os.pathsep + "/content/hadoop/sbin"


### Formatting HDFS, Starting Hadoop DFS, and Uploading Data to HDFS

In [14]:
!hadoop/bin/hdfs namenode -format
!hadoop/sbin/start-dfs.sh
!hadoop/bin/hdfs dfs -mkdir -p /input
!hadoop/bin/hdfs dfs -put europe_weather.csv /input/
!hadoop/bin/hdfs dfs -ls /input


2025-03-14 15:19:42,217 INFO namenode.NameNode: STARTUP_MSG: 
/************************************************************
STARTUP_MSG: Starting NameNode
STARTUP_MSG:   host = 5494a0949530/172.28.0.12
STARTUP_MSG:   args = [-format]
STARTUP_MSG:   version = 3.3.6
STARTUP_MSG:   classpath = /content/hadoop/etc/hadoop:/content/hadoop/share/hadoop/common/lib/snappy-java-1.1.8.2.jar:/content/hadoop/share/hadoop/common/lib/netty-codec-socks-4.1.89.Final.jar:/content/hadoop/share/hadoop/common/lib/kerby-pkix-1.0.1.jar:/content/hadoop/share/hadoop/common/lib/netty-handler-proxy-4.1.89.Final.jar:/content/hadoop/share/hadoop/common/lib/commons-compress-1.21.jar:/content/hadoop/share/hadoop/common/lib/checker-qual-2.5.2.jar:/content/hadoop/share/hadoop/common/lib/hadoop-annotations-3.3.6.jar:/content/hadoop/share/hadoop/common/lib/commons-io-2.8.0.jar:/content/hadoop/share/hadoop/common/lib/netty-codec-http2-4.1.89.Final.jar:/content/hadoop/share/hadoop/common/lib/jetty-io-9.4.51.v20230217.jar:

### Defining the Mapper Script (map.py) for Hadoop

In [40]:
%%writefile map.py
#!/usr/bin/env python3
import sys

for line in sys.stdin:
    fields = line.strip().split(",")

    if len(fields) >= 3 and fields[1].strip().lower() != "city":
        try:
            city = fields[1].strip()
            temp_str = fields[2].strip()

            print(f"DEBUG: City={city}, Raw Temperature={temp_str}", file=sys.stderr)

            # Remove °F but keep Fahrenheit
            temp_cleaned = temp_str.replace("°F", "").replace("F", "").strip()

            temperature = float(temp_cleaned)

            print(f"DEBUG: City={city}, Temperature in °F={temperature:.2f}", file=sys.stderr)

            print(f"{city}\t{temperature:.2f}")
        except ValueError:
            print(f"ERROR: Could not convert {temp_str} to float", file=sys.stderr)
            continue


Overwriting map.py


In [41]:
!cat europe_weather.csv | python3 map.py


DEBUG: City=Paris, Raw Temperature=39 °F
DEBUG: City=Paris, Temperature in °F=39.00
Paris	39.00
DEBUG: City=Berlin, Raw Temperature=46 °F
DEBUG: City=Berlin, Temperature in °F=46.00
Berlin	46.00
DEBUG: City=Madrid, Raw Temperature=48 °F
DEBUG: City=Madrid, Temperature in °F=48.00
Madrid	48.00
DEBUG: City=Rome, Raw Temperature=66 °F
DEBUG: City=Rome, Temperature in °F=66.00
Rome	66.00
DEBUG: City=London, Raw Temperature=46 °F
DEBUG: City=London, Temperature in °F=46.00
London	46.00
DEBUG: City=Amsterdam, Raw Temperature=43 °F
DEBUG: City=Amsterdam, Temperature in °F=43.00
Amsterdam	43.00
DEBUG: City=Brussels, Raw Temperature=43 °F
DEBUG: City=Brussels, Temperature in °F=43.00
Brussels	43.00
DEBUG: City=Vienna, Raw Temperature=43 °F
DEBUG: City=Vienna, Temperature in °F=43.00
Vienna	43.00
DEBUG: City=Copenhagen, Raw Temperature=36 °F
DEBUG: City=Copenhagen, Temperature in °F=36.00
Copenhagen	36.00
DEBUG: City=Athens, Raw Temperature=72 °F
DEBUG: City=Athens, Temperature in °F=72.00
Athen

###  Defining the Reducer Script (reduce.py) for Hadoop

In [22]:
%%writefile reduce.py
#!/usr/bin/env python3
import sys

min_temp = float("inf")
max_temp = float("-inf")
min_city = None
max_city = None

for line in sys.stdin:
    try:
        city, temp = line.strip().split("\t")
        temp = float(temp)

        if temp < min_temp:
            min_temp = temp
            min_city = city

        if temp > max_temp:
            max_temp = temp
            max_city = city
    except ValueError:
        continue  # Skip invalid data

print(f"Coldest City: {min_city}\t{min_temp}°C")
print(f"Hottest City: {max_city}\t{max_temp}°C")


Overwriting reduce.py


###  Running Hadoop Streaming Job with Mapper and Reducer Scripts

In [43]:
!hadoop/bin/hadoop jar hadoop/share/hadoop/tools/lib/hadoop-streaming-*.jar \
    -input /input/europe_weather.csv \
    -output /output \
    -mapper "python3 map.py" \
    -reducer "python3 reduce.py"


2025-03-14 15:35:02,220 INFO impl.MetricsConfig: Loaded properties from hadoop-metrics2.properties
2025-03-14 15:35:02,340 INFO impl.MetricsSystemImpl: Scheduled Metric snapshot period at 10 second(s).
2025-03-14 15:35:02,341 INFO impl.MetricsSystemImpl: JobTracker metrics system started
2025-03-14 15:35:02,361 WARN impl.MetricsSystemImpl: JobTracker metrics system already initialized!
2025-03-14 15:35:02,658 INFO mapred.FileInputFormat: Total input files to process : 1
2025-03-14 15:35:02,681 INFO mapreduce.JobSubmitter: number of splits:1
2025-03-14 15:35:02,873 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_local104396934_0001
2025-03-14 15:35:02,873 INFO mapreduce.JobSubmitter: Executing with tokens: []
2025-03-14 15:35:03,141 INFO mapreduce.Job: The url to track the job: http://localhost:8080/
2025-03-14 15:35:03,144 INFO mapreduce.Job: Running job: job_local104396934_0001
2025-03-14 15:35:03,153 INFO mapred.LocalJobRunner: OutputCommitter set in config null
2025-03-1

### Displaying the Output of the Hadoop Streaming Job from HDFS










In [44]:
!hadoop/bin/hdfs dfs -cat /output/part-00000


Coldest City: Helsinki	32.0°C
Hottest City: Athens	72.0°C
