In [None]:
!pip install requests pandas
!pip install --upgrade google-api-python-client google-auth-httplib2 google-auth-oauthlib



Collecting google-api-python-client
  Downloading google_api_python_client-2.166.0-py2.py3-none-any.whl.metadata (6.6 kB)
Downloading google_api_python_client-2.166.0-py2.py3-none-any.whl (13.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.2/13.2 MB[0m [31m57.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: google-api-python-client
  Attempting uninstall: google-api-python-client
    Found existing installation: google-api-python-client 2.164.0
    Uninstalling google-api-python-client-2.164.0:
      Successfully uninstalled google-api-python-client-2.164.0
Successfully installed google-api-python-client-2.166.0


# Create directory structure




In [None]:
import os

# Base directory
base_dir = "ETL_Pipeline_Sabih_DS-59"

# Directory structure: folder (relative to base_dir) -> placeholder files to create in it
structure = {
    "": ["etl_pipeline.py", "scheduler.py", "requirements.txt", "README.md", "load_to_db.py", "report.pdf"],
    "config": ["db_config.json"],
    "data": ["sample_data.csv", "weather_data.json", "google_sheet_sample.csv", 'weather_data.db'],
    "output": ["final_cleaned_data.csv"],
    ".github/workflows": ["ci_cd.yml"]
}


def create_project_structure(root, layout):
    """Create the folders in ``layout`` under ``root`` and touch their placeholder files.

    Args:
        root: Directory that receives the whole tree.
        layout: Mapping of folder path (relative to ``root``; "" means ``root``
            itself) to the list of file names to create in that folder.

    Existing files are left untouched, so the call is safe to repeat.
    """
    for folder, files in layout.items():
        dir_path = os.path.join(root, folder)
        os.makedirs(dir_path, exist_ok=True)
        for file in files:
            # Append mode creates a missing file but never truncates an existing
            # one, so re-running this cell does not wipe generated scripts or data.
            with open(os.path.join(dir_path, file), "a"):
                pass


# Create directories and files
create_project_structure(base_dir, structure)

print(f"Directory structure created under '{base_dir}'")


Directory structure created under 'ETL_Pipeline_Sabih_DS-59'


In [53]:
python_code = """

import os
import json
import pandas as pd
import requests

# # Paths
# BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# DATA_DIR = os.path.join(BASE_DIR, 'data')
# OUTPUT_DIR = os.path.join(BASE_DIR, 'output')
# CONFIG_DIR = os.path.join(BASE_DIR, 'config')


def getSheetsData(spreadsheet_id="1DIXLTQfPB76206gklGInqhRzg5KW_uHbGXmwkBVg56k",
                  sheet_id="1229579343",
                  output_path='/content/ETL_Pipeline_Sabih_DS-59/data/google_sheet_sample.csv',
                  timeout=30):
    '''Download one Google Sheets tab as CSV and save it to output_path.

    Args:
        spreadsheet_id: ID of the spreadsheet (taken from its URL).
        sheet_id: Sheet ID (GID) of the tab to export.
        output_path: File that receives the raw CSV bytes.
        timeout: Seconds to wait for the server before requests raises a
            timeout error instead of blocking the pipeline run.

    All defaults reproduce the previously hard-coded values. On a non-200
    response nothing is written and a message is printed.
    '''
    # Construct the export URL for CSV format
    url = f"https://docs.google.com/spreadsheets/d/{spreadsheet_id}/export?format=csv&gid={sheet_id}"

    # Send a GET request to the URL
    response = requests.get(url, timeout=timeout)

    # Anything other than status code 200 is reported and skipped
    if response.status_code != 200:
        print(f"Failed to download the CSV file. Status code: {response.status_code}")
        return

    # Save to a local file
    print(response.content)
    with open(output_path, 'wb') as f:
        f.write(response.content)


def save_Json_data(data, filename):
    '''Write the timestamp/temperature pair of every entry in data to filename as JSON.

    Any other keys of an entry are dropped; the file is written with a
    4-space indent and a confirmation line is printed afterwards.
    '''
    # Keep only the two fields of interest from each record
    slim_records = []
    for record in data:
        slim_records.append({
            'timestamp': record['timestamp'],
            'temperature': record['temperature'],
        })

    # Save to JSON file
    with open(filename, 'w') as out_file:
        json.dump(slim_records, out_file, indent=4)

    print(f"Data saved as {filename}.")


def fetch_forecast(city, api_key):
    '''Fetch the OpenWeatherMap 5 day / 3 hour forecast for city.

    Args:
        city: City name sent as the q query parameter.
        api_key: OpenWeatherMap API key sent as appid.

    Returns:
        A list of dicts with keys 'timestamp' (UTC, '%Y-%m-%d %H:%M:%S'),
        'temperature' (units=metric is requested) and 'city'. Entries that
        lack 'dt' or 'main'/'temp' are reported and skipped.

    Raises:
        Exception: on a non-200 HTTP status or when the payload's "cod"
            field is not 200.
    '''
    # Imported here because the script's import block does not provide datetime
    from datetime import datetime, timezone

    # Use the forecast endpoint (5 day / 3 hour forecast). Passing params lets
    # requests URL-encode the city name; https keeps the key out of clear text;
    # the timeout stops a stalled connection from blocking the run.
    response = requests.get(
        'https://api.openweathermap.org/data/2.5/forecast',
        params={'q': city, 'appid': api_key, 'units': 'metric'},
        timeout=30,
    )

    if response.status_code != 200:
        raise Exception(f"Error fetching weather data: {response.status_code}")

    data = response.json()

    # Check if the response contains an error code (accept "200" and 200 alike)
    if str(data.get("cod")) != "200":
        error_message = data.get("message", "Unknown error")
        raise Exception(f"Error fetching weather data: {error_message}")

    # Extract forecast entries
    forecast_entries = []
    for entry in data.get('list', []):
        try:
            # Convert Unix timestamp to human-readable UTC text
            timestamp = datetime.fromtimestamp(entry['dt'], tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S')
            temperature = entry['main']['temp']
            forecast_entries.append({
                'timestamp': timestamp,
                'temperature': temperature,
                'city': city
            })
        except KeyError as e:
            print(f"Key error {e} in entry: {entry}")

    return forecast_entries





def transform_data(csv_data, weather_data, sheet_data):
    '''Clean the CSV and sheet frames, merge them and attach a weather summary.

    Args:
        csv_data: DataFrame read from the sample CSV.
        weather_data: Either a dict holding the records under 'weather' or a
            plain list of record dicts (None is treated as no records).
        sheet_data: DataFrame read from the Google Sheet export.

    Returns:
        The inner merge on 'id' when both frames have that column, otherwise
        a copy of the cleaned CSV frame; in both cases with a
        'weather_summary' column holding the first record's 'description'
        or 'No data'.

    The input frames are not modified.
    '''
    print("Starting transformation process")

    # Drop incomplete rows on new frames so the caller's data stays untouched
    csv_clean = csv_data.dropna()
    sheet_clean = sheet_data.dropna()

    # The forecast is saved as a bare list of records, so accept that shape
    # as well as the {'weather': [...]} wrapper
    if isinstance(weather_data, dict):
        weather_records = weather_data.get('weather', [])
    elif weather_data is None:
        weather_records = []
    else:
        weather_records = weather_data
    weather_df = pd.DataFrame(weather_records)

    # Example merge (if there is a common key, here assumed as 'id')
    if 'id' in csv_clean.columns and 'id' in sheet_clean.columns:
        merged_data = pd.merge(csv_clean, sheet_clean, on='id', how='inner')
    else:
        merged_data = csv_clean.copy()

    # Optionally, join weather info (here we simply add a new column with a summary)
    if not weather_df.empty:
        merged_data['weather_summary'] = weather_df.iloc[0].get('description', 'No data')
    else:
        merged_data['weather_summary'] = 'No data'

    print("Transformation complete")
    return merged_data

# Output folder next to this script. Defined here because the path constants
# at the top of the script are commented out, which left OUTPUT_DIR undefined.
OUTPUT_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'output')


def load_data_to_file(dataframe, filename):
    '''Write dataframe to OUTPUT_DIR/filename as CSV, without the index column.

    OUTPUT_DIR is created first when it does not exist yet.
    '''
    output_path = os.path.join(OUTPUT_DIR, filename)
    print(f"Loading cleaned data to {output_path}")
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    dataframe.to_csv(output_path, index=False)



def extract_csv(path):
    '''Read the CSV file at path into a pandas DataFrame.'''
    return pd.read_csv(path)


def extract_json(path):
    '''Load and return the JSON document stored at path.'''
    with open(path, 'r') as json_file:
        return json.load(json_file)


def main():
    '''Run the pipeline end to end: extract, transform, load.

    Refreshes the Google Sheet export and the weather forecast JSON, reads the
    three sources back from the data folder, combines them with
    transform_data and writes the result through load_data_to_file.
    '''
    data_dir = '/content/ETL_Pipeline_Sabih_DS-59/data'
    weather_json_path = os.path.join(data_dir, 'weather_data.json')

    # Extraction step
    getSheetsData()  # update sheet data

    # NOTE(review): the literal fallback is a credential committed to the
    # source -- rotate it and rely on OPENWEATHER_API_KEY only.
    api_key = os.environ.get('OPENWEATHER_API_KEY', 'cdc23585344f54d1d00caef6a3cffb60')
    save_Json_data(fetch_forecast('london', api_key), weather_json_path)

    sheet_data = extract_csv(os.path.join(data_dir, 'google_sheet_sample.csv'))
    csv_data = extract_csv(os.path.join(data_dir, 'sample_data.csv'))
    # Read back the forecast that was just saved instead of a sample file
    # that is never created
    weather_records = extract_json(weather_json_path)

    # Transformation step: transform_data looks the records up under 'weather'
    final_data = transform_data(csv_data, {'weather': weather_records}, sheet_data)

    # Load step
    load_data_to_file(final_data, 'final_cleaned_data.csv')

if __name__ == '__main__':
    main()

"""

# Write the generated ETL script into the project folder (created or overwritten)
with open('/content/ETL_Pipeline_Sabih_DS-59/etl_pipeline.py', 'w') as f:
    f.write(python_code)

# Report the file name that was actually written
print("Python code has been written to 'etl_pipeline.py'")

Python code has been written to 'etl.py'


In [None]:
# Define the Python code as a string
python_code = """
import os
import json
import pandas as pd
import sqlite3

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
CONFIG_DIR = os.path.join(BASE_DIR, 'config')
OUTPUT_DIR = os.path.join(BASE_DIR, 'output')

def get_db_config():
    '''Return the parsed contents of db_config.json located in CONFIG_DIR.'''
    with open(os.path.join(CONFIG_DIR, 'db_config.json'), 'r') as config_file:
        settings = json.load(config_file)
    return settings

def load_csv_to_db(csv_filename, db_connection):
    '''Load OUTPUT_DIR/csv_filename into the 'final_data' table of db_connection.

    An existing 'final_data' table is replaced and the DataFrame index is
    not stored as a column.
    '''
    csv_path = os.path.join(OUTPUT_DIR, csv_filename)
    print(f"Loading data from {csv_path} into database")
    # Read the CSV and hand it straight to the database in one chain
    pd.read_csv(csv_path).to_sql(
        name='final_data',
        con=db_connection,
        if_exists='replace',
        index=False,
    )
    print("Data successfully loaded into database")

def main():
    '''Load output/final_cleaned_data.csv into the SQLite file dummy_database.db under BASE_DIR.

    Errors raised while loading are printed rather than re-raised; the
    connection is closed in every case.
    '''
    # The config file is read up front; its values are not used further down
    config = get_db_config()
    # For demonstration, using SQLite. Replace with your actual DB connection.
    conn = sqlite3.connect(os.path.join(BASE_DIR, 'dummy_database.db'))

    try:
        load_csv_to_db('final_cleaned_data.csv', conn)
    except Exception as e:
        print(f"Error loading data: {e}")
    finally:
        conn.close()

if __name__ == '__main__':
    main()

"""

# Write the load_to_db script text into the project folder; mode 'w' creates
# the file or overwrites an existing one
with open('/content/ETL_Pipeline_Sabih_DS-59/load_to_db.py', 'w') as f:
    f.write(python_code)

print("Python code has been written to 'load_to_db.py'")


Python code has been written to 'load_to_db.py'


In [None]:
# Define the Python code as a string
python_code = """
import schedule
import time
import subprocess
import os

def run_etl_pipeline():
    '''Run etl_pipeline.py in a child process and report the outcome.

    A failing run is reported and swallowed so the exception does not
    propagate out of schedule.run_pending() and stop the scheduler loop.
    '''
    import sys  # local import: the script's import block does not provide sys

    # Resolve the script next to this file so the job does not depend on the
    # working directory, and reuse the interpreter that runs the scheduler
    script_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'etl_pipeline.py')

    print("Starting ETL pipeline...")
    try:
        # Run the ETL pipeline script
        subprocess.run([sys.executable, script_path], check=True)
    except subprocess.CalledProcessError as exc:
        print(f"ETL pipeline failed with exit code {exc.returncode}.")
        return
    print("ETL pipeline completed.")

def main():
    '''Register the ETL job on a 10-minute interval and poll the scheduler forever.'''
    # Schedule to run ETL pipeline every 10 minutes
    every_ten_minutes = schedule.every(10).minutes
    every_ten_minutes.do(run_etl_pipeline)

    print("Scheduler started. Waiting for scheduled jobs...")
    # Check for due jobs once per second; the loop has no exit condition
    while True:
        schedule.run_pending()
        time.sleep(1)

if __name__ == '__main__':
    main()


"""

# Write the generated scheduler script into the project folder (created or overwritten)
with open('/content/ETL_Pipeline_Sabih_DS-59/scheduler.py', 'w') as f:
    f.write(python_code)

# Report the file name that was actually written
print("Python code has been written to 'scheduler.py'")


Python code has been written to 'pipeline.py'


In [None]:
# Reference copy of the database settings; the same key/value pairs are written
# to config/db_config.json by the DBConfig cell below.
{
    "database": "dummy_db",
    "user": "dummy_user",
    "password": "dummy_pass",
    "host": "localhost",
    "port": 5432,
    "api_key": "dummy_api_key"
}


In [None]:
# Define the database config as a JSON string (placeholder values only)
DBConfig = """{
    "database": "dummy_db",
    "user": "dummy_user",
    "password": "dummy_pass",
    "host": "localhost",
    "port": 5432,
    "api_key": "dummy_api_key"
}
"""

# Write the JSON config into the project's config folder (created or overwritten)
with open('/content/ETL_Pipeline_Sabih_DS-59/config/db_config.json', 'w') as f:
    f.write(DBConfig)

# Report what was written and where, using the real file name
print("Database config has been written to 'config/db_config.json'")


Python code has been written to 'DBConfig.Json'


In [None]:
# OpenWeatherMap 5 day / 3 hour forecast endpoint: api.openweathermap.org/data/2.5/forecast?id={city ID}&appid={API key}

# Create data for the CSV file, the SQLite database, and the JSON file



In [52]:
import requests
import pandas as pd
import sqlite3
from datetime import datetime
import os
import csv
import sqlite3
import json

# --- CONFIGURATION ---

# Prefer the OPENWEATHER_API_KEY environment variable so the key can be supplied
# without editing the notebook.
# NOTE(review): the literal fallback is a credential committed to the notebook --
# rotate it and drop the fallback once the variable is set wherever this runs.
API_KEY = os.environ.get('OPENWEATHER_API_KEY', 'cdc23585344f54d1d00caef6a3cffb60')
CITY = 'London'
CSV_FILE = '/content/ETL_Pipeline_Sabih_DS-59/data/sample_data.csv'
SQLITE_DB_FILE = '/content/ETL_Pipeline_Sabih_DS-59/data/weather_data.db'
JSON_File = '/content/ETL_Pipeline_Sabih_DS-59/data/weather_data.json'


# --- STEP 1: Fetch weather data from OpenWeatherMap API ---

# def fetch_weather(city, api_key):

#     url = f'http://api.openweathermap.org/data/2.5/forecast?id=524901&appid={API_KEY}'
#     response = requests.get(url)
#     data = response.json()
#     print(data)

#     timestamp = datetime.utcfromtimestamp(data['dt']).strftime('%Y-%m-%d %H:%M:%S')
#     temperature = data['main']['temp']
#     return {'timestamp': timestamp, 'temperature': temperature, 'city': city}


def fetch_forecast(city, api_key):
    """Fetch the OpenWeatherMap 5 day / 3 hour forecast for ``city``.

    Args:
        city: City name sent as the ``q`` query parameter.
        api_key: OpenWeatherMap API key sent as ``appid``.

    Returns:
        A list of dicts with keys ``'timestamp'`` (UTC, ``'%Y-%m-%d %H:%M:%S'``),
        ``'temperature'`` (``units=metric`` is requested) and ``'city'``.
        Entries that lack ``'dt'`` or ``'main'``/``'temp'`` are reported and skipped.

    Raises:
        Exception: on a non-200 HTTP status or when the payload's ``"cod"``
            field is not 200.
    """
    from datetime import timezone  # local import: only ``datetime`` is imported at the top

    # Use the forecast endpoint (5 day / 3 hour forecast). Passing ``params``
    # lets requests URL-encode the city name; https keeps the key out of clear
    # text; the timeout stops a stalled connection from hanging the cell.
    response = requests.get(
        'https://api.openweathermap.org/data/2.5/forecast',
        params={'q': city, 'appid': api_key, 'units': 'metric'},
        timeout=30,
    )

    if response.status_code != 200:
        raise Exception(f"Error fetching weather data: {response.status_code}")

    data = response.json()

    # Check if the response contains an error code (accept "200" and 200 alike)
    if str(data.get("cod")) != "200":
        error_message = data.get("message", "Unknown error")
        raise Exception(f"Error fetching weather data: {error_message}")

    # Extract forecast entries
    forecast_entries = []
    for entry in data.get('list', []):
        try:
            # Convert Unix timestamp to human-readable UTC text; the timezone-aware
            # call replaces the deprecated datetime.utcfromtimestamp
            timestamp = datetime.fromtimestamp(entry['dt'], tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S')
            temperature = entry['main']['temp']
            forecast_entries.append({
                'timestamp': timestamp,
                'temperature': temperature,
                'city': city
            })
        except KeyError as e:
            print(f"Key error {e} in entry: {entry}")

    return forecast_entries


# --- STEP 2: Save to CSV ---


def create_weather_csv(data, filename='weather_data.csv'):
    """Write the timestamp and temperature of every entry in ``data`` to a CSV file.

    Args:
        data: Iterable of dicts that each provide ``'timestamp'`` and ``'temperature'``.
        filename: Target path. Its parent folder is created when the path has one.

    The file is overwritten and starts with a ``timestamp,temperature`` header row.
    """
    # A bare file name (such as the default) has an empty dirname, and
    # os.makedirs('') raises FileNotFoundError, so only create real parents
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # newline='' lets the csv module control line endings itself
    with open(filename, mode='w', newline='') as file:
        writer = csv.writer(file)

        # Write the header
        writer.writerow(['timestamp', 'temperature'])

        # Write the data
        writer.writerows([entry['timestamp'], entry['temperature']] for entry in data)

# --- STEP 3: Save to SQLite ---

def create_and_insert_weather_data(data, db_path='temperature_data.db'):
    """Append the timestamp/temperature of every record in ``data`` to a SQLite table.

    Args:
        data: Iterable of dicts that each provide ``'timestamp'`` and ``'temperature'``.
        db_path: SQLite database file; created when missing. The default keeps
            the previously hard-coded file name.

    The ``weather_data`` table is created on first use. Rows are only ever
    inserted, so calling this again with the same data stores it again. After
    the commit every row of the table is printed for verification.
    """
    # Connect to SQLite database (it will create the file if it doesn't exist)
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()

        # Create table with timestamp and temperature columns if it doesn't exist
        cursor.execute('''
        CREATE TABLE IF NOT EXISTS weather_data (
            timestamp TEXT,
            temperature REAL
        )
        ''')

        # Insert all records with one parameterized statement
        cursor.executemany(
            'INSERT INTO weather_data (timestamp, temperature) VALUES (?, ?)',
            [(record['timestamp'], record['temperature']) for record in data],
        )
        conn.commit()

        # Optional: verify by fetching and printing all rows from the table
        cursor.execute('SELECT * FROM weather_data')
        for row in cursor.fetchall():
            print(row)
    finally:
        # Close the connection even when creating, inserting or reading fails
        conn.close()



def save_temperature_data(data, filename):
    """Write the timestamp/temperature pair of every entry in ``data`` to ``filename`` as JSON.

    Any other keys of an entry are dropped; the file is written with a
    4-space indent and a confirmation line is printed afterwards.
    """
    # Keep only the two fields of interest from each record
    slim_records = []
    for record in data:
        slim_records.append({
            'timestamp': record['timestamp'],
            'temperature': record['temperature'],
        })

    # Save to JSON file
    with open(filename, 'w') as out_file:
        json.dump(slim_records, out_file, indent=4)

    print(f"Data saved as {filename}.")




# --- MAIN ---

if __name__ == '__main__':
    # Fetch the forecast records for CITY once and reuse them for every sink below
    weather_data = fetch_forecast(CITY, API_KEY)
    print("Weather data:", weather_data)

    # Persist the same records three ways: CSV, SQLite and JSON
    create_weather_csv(weather_data, CSV_FILE)
    # NOTE(review): SQLITE_DB_FILE is not passed here; the function connects to
    # 'temperature_data.db' -- confirm which database file is intended
    create_and_insert_weather_data(weather_data)
    save_temperature_data(weather_data, JSON_File)




Weather data: [{'timestamp': '2025-04-05 12:00:00', 'temperature': 12.27, 'city': 'London'}, {'timestamp': '2025-04-05 15:00:00', 'temperature': 14.68, 'city': 'London'}, {'timestamp': '2025-04-05 18:00:00', 'temperature': 11.54, 'city': 'London'}, {'timestamp': '2025-04-05 21:00:00', 'temperature': 7.23, 'city': 'London'}, {'timestamp': '2025-04-06 00:00:00', 'temperature': 5.12, 'city': 'London'}, {'timestamp': '2025-04-06 03:00:00', 'temperature': 7.03, 'city': 'London'}, {'timestamp': '2025-04-06 06:00:00', 'temperature': 7.09, 'city': 'London'}, {'timestamp': '2025-04-06 09:00:00', 'temperature': 10.97, 'city': 'London'}, {'timestamp': '2025-04-06 12:00:00', 'temperature': 13.42, 'city': 'London'}, {'timestamp': '2025-04-06 15:00:00', 'temperature': 13.06, 'city': 'London'}, {'timestamp': '2025-04-06 18:00:00', 'temperature': 10.5, 'city': 'London'}, {'timestamp': '2025-04-06 21:00:00', 'temperature': 8.11, 'city': 'London'}, {'timestamp': '2025-04-07 00:00:00', 'temperature': 7.0

# Get data from Google Sheets

In [None]:


# Replace with your actual spreadsheet ID and sheet ID (GID)
spreadsheet_id = "1DIXLTQfPB76206gklGInqhRzg5KW_uHbGXmwkBVg56k"
sheet_id = "1229579343"  # Sheet ID (GID)

# Construct the export URL for CSV format
url = f"https://docs.google.com/spreadsheets/d/{spreadsheet_id}/export?format=csv&gid={sheet_id}"

# Send a GET request to the URL; the timeout makes a stalled connection raise
# instead of hanging the cell indefinitely
response = requests.get(url, timeout=30)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Save the raw CSV bytes to a local file
    print(response.content)
    with open('/content/ETL_Pipeline_Sabih_DS-59/data/google_sheet_sample.csv', 'wb') as f:
        f.write(response.content)
else:
    print(f"Failed to download the CSV file. Status code: {response.status_code}")


b'timestamp,temperature\r\n2025-04-05 9:00:00,8.83\r\n2025-04-05 12:00:00,10.78\r\n2025-04-05 15:00:00,13.92\r\n2025-04-05 18:00:00,11.54\r\n2025-04-05 21:00:00,7.23\r\n2025-04-06 0:00:00,5.12\r\n2025-04-06 3:00:00,7.03\r\n2025-04-06 6:00:00,7.09\r\n2025-04-06 9:00:00,10.97\r\n2025-04-06 12:00:00,13.42\r\n2025-04-06 15:00:00,13.06\r\n2025-04-06 18:00:00,10.5\r\n2025-04-06 21:00:00,8.11\r\n2025-04-07 0:00:00,7.01\r\n2025-04-07 3:00:00,6.36\r\n2025-04-07 6:00:00,5.79\r\n2025-04-07 9:00:00,10.64\r\n2025-04-07 12:00:00,14.41\r\n2025-04-07 15:00:00,15.02\r\n2025-04-07 18:00:00,12.26\r\n2025-04-07 21:00:00,9.14\r\n2025-04-08 0:00:00,7.79\r\n2025-04-08 3:00:00,6.89\r\n2025-04-08 6:00:00,6.47\r\n2025-04-08 9:00:00,11.29\r\n2025-04-08 12:00:00,15.09\r\n2025-04-08 15:00:00,16.56\r\n2025-04-08 18:00:00,14.28\r\n2025-04-08 21:00:00,9.56\r\n2025-04-09 0:00:00,7.54\r\n2025-04-09 3:00:00,5.88\r\n2025-04-09 6:00:00,5.5\r\n2025-04-09 9:00:00,9.5\r\n2025-04-09 12:00:00,14.14\r\n2025-04-09 15:00:00,15.79