<a href="https://colab.research.google.com/github/Sereenamariyam/Data-Analysis/blob/main/Data_Ingestion_batch_Vs_Stream.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Batch Ingestion (All-at-Once)

In [None]:
import pandas as pd

# Batch ingestion: fetch the whole CSV in a single read.
url = "https://raw.githubusercontent.com/velicki/Weather_Data_Analysis_Project/main/Weather_Data.csv"
df = pd.read_csv(filepath_or_buffer=url)

# Preview the first five rows as a quick sanity check.
df.head(n=5)

Unnamed: 0,Date/Time,Temp_C,Dew Point Temp_C,Rel Hum_%,Wind Speed_km/h,Visibility_km,Press_kPa,Weather
0,1/1/2012 0:00,-1.8,-3.9,86,4,8.0,101.24,Fog
1,1/1/2012 1:00,-1.8,-3.7,87,4,8.0,101.24,Fog
2,1/1/2012 2:00,-1.8,-3.4,89,7,4.0,101.26,"Freezing Drizzle,Fog"
3,1/1/2012 3:00,-1.5,-3.2,88,6,4.0,101.27,"Freezing Drizzle,Fog"
4,1/1/2012 4:00,-1.5,-3.3,88,7,4.8,101.23,Fog


In [None]:

#PREPROCESSING

# Rename columns for easier access: trim whitespace, then turn spaces and
# slashes into underscores (e.g. "Date/Time" -> "Date_Time").
# `rename` returns a new frame instead of mutating the one displayed above.
df = df.rename(columns=lambda name: name.strip().replace(' ', '_').replace('/', '_'))

# Parse timestamps and temperatures BEFORE filtering. With errors='coerce',
# unparseable values become NaT/NaN, so the dropna below must run afterwards
# to actually remove them. `assign` builds a new frame, which also avoids
# assigning into a filtered slice (SettingWithCopyWarning).
df = df.assign(
    Date_Time=pd.to_datetime(df['Date_Time'], errors='coerce'),
    Temp_C=pd.to_numeric(df['Temp_C'], errors='coerce'),
)

# Drop rows with missing (or unparseable) Date_Time or Temp_C
df = df.dropna(subset=['Date_Time', 'Temp_C'])

# A bare `df.shape` in the middle of a cell is never displayed; print it.
print(f"Shape after preprocessing: {df.shape}")
df.head()

Unnamed: 0,Date_Time,Temp_C,Dew_Point_Temp_C,Rel_Hum_%,Wind_Speed_km_h,Visibility_km,Press_kPa,Weather
0,2012-01-01 00:00:00,-1.8,-3.9,86,4,8.0,101.24,Fog
1,2012-01-01 01:00:00,-1.8,-3.7,87,4,8.0,101.24,Fog
2,2012-01-01 02:00:00,-1.8,-3.4,89,7,4.0,101.26,"Freezing Drizzle,Fog"
3,2012-01-01 03:00:00,-1.5,-3.2,88,6,4.0,101.27,"Freezing Drizzle,Fog"
4,2012-01-01 04:00:00,-1.5,-3.3,88,7,4.8,101.23,Fog


Stream Ingestion (Row-by-Row Simulation)

In [None]:
import pandas as pd
import time

# Step 1: Load the original data
url = "https://raw.githubusercontent.com/velicki/Weather_Data_Analysis_Project/refs/heads/main/Weather_Data.csv"

# PREPROCESSING (added for streaming section)
# The steps run as one chain so the cell is idempotent and never assigns
# into a filtered slice:
#   1. rename columns: trim, then spaces/slashes -> underscores
#   2. parse Date_Time and Temp_C; errors='coerce' turns bad values into NaT/NaN
#   3. drop rows where either parsed value is missing
# Parsing happens BEFORE dropna so that values coerced to NaT/NaN are
# actually removed rather than left in the stream.
df = (
    pd.read_csv(url)
    .rename(columns=lambda name: name.strip().replace(' ', '_').replace('/', '_'))
    .assign(
        Date_Time=lambda frame: pd.to_datetime(frame['Date_Time'], errors='coerce'),
        Temp_C=lambda frame: pd.to_numeric(frame['Temp_C'], errors='coerce'),
    )
    .dropna(subset=['Date_Time', 'Temp_C'])
)

In [None]:
def alert_high_temp(row, threshold=5):
    """Print an alert line when a row's temperature exceeds ``threshold``.

    Parameters
    ----------
    row : mapping-like
        Anything supporting ``row['Temp_C']`` and ``row['Date_Time']``,
        e.g. a pandas Series yielded by ``DataFrame.iterrows()`` or a dict.
    threshold : number, default 5
        The alert fires only when ``row['Temp_C']`` is strictly greater
        than this value. The default preserves the original hard-coded limit.

    Returns
    -------
    None
        The function only prints; nothing is printed when the temperature
        is at or below the threshold (a NaN temperature also compares False).
    """
    temp_c = row['Temp_C']
    if temp_c > threshold:
        print(f"⚠️ ALERT: High temperature detected at {row['Date_Time']} - {temp_c}°C")

# Apply during streaming
def stream_with_alert(data, delay=0.5, max_rows=None):
    """Simulate stream ingestion by emitting a DataFrame one row at a time.

    For each row, in order: print the row as a dict, run the
    high-temperature alert check on it, then pause for ``delay`` seconds.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to replay; rows are visited via ``iterrows()``.
    delay : float, default 0.5
        Seconds passed to ``time.sleep`` after every row.
    max_rows : int or None, default None
        When given, only the first ``max_rows`` rows are streamed
        (``data.head(max_rows)``). ``None`` streams every row, which is
        the original behaviour. Expected to be non-negative.

    Returns
    -------
    None
    """
    # Bound the replay up front so a long frame does not have to be
    # interrupted by hand.
    rows_to_stream = data if max_rows is None else data.head(max_rows)

    for idx, row in rows_to_stream.iterrows():
        print(f"Streaming row {idx} → {row.to_dict()}")
        alert_high_temp(row)
        time.sleep(delay)


In [None]:
# Stream only a small sample. Replaying the whole frame at the default
# 0.5 s per row did not finish in the saved run (it was stopped with a
# KeyboardInterrupt), which also prevents a clean "Run All".
stream_with_alert(df.head(10))

Streaming row 0 → {'Date_Time': Timestamp('2012-01-01 00:00:00'), 'Temp_C': -1.8, 'Dew_Point_Temp_C': -3.9, 'Rel_Hum_%': 86, 'Wind_Speed_km_h': 4, 'Visibility_km': 8.0, 'Press_kPa': 101.24, 'Weather': 'Fog'}
Streaming row 1 → {'Date_Time': Timestamp('2012-01-01 01:00:00'), 'Temp_C': -1.8, 'Dew_Point_Temp_C': -3.7, 'Rel_Hum_%': 87, 'Wind_Speed_km_h': 4, 'Visibility_km': 8.0, 'Press_kPa': 101.24, 'Weather': 'Fog'}
Streaming row 2 → {'Date_Time': Timestamp('2012-01-01 02:00:00'), 'Temp_C': -1.8, 'Dew_Point_Temp_C': -3.4, 'Rel_Hum_%': 89, 'Wind_Speed_km_h': 7, 'Visibility_km': 4.0, 'Press_kPa': 101.26, 'Weather': 'Freezing Drizzle,Fog'}
Streaming row 3 → {'Date_Time': Timestamp('2012-01-01 03:00:00'), 'Temp_C': -1.5, 'Dew_Point_Temp_C': -3.2, 'Rel_Hum_%': 88, 'Wind_Speed_km_h': 6, 'Visibility_km': 4.0, 'Press_kPa': 101.27, 'Weather': 'Freezing Drizzle,Fog'}
Streaming row 4 → {'Date_Time': Timestamp('2012-01-01 04:00:00'), 'Temp_C': -1.5, 'Dew_Point_Temp_C': -3.3, 'Rel_Hum_%': 88, 'Wind_Spe

KeyboardInterrupt: 