Web scripting

In [1]:
import requests
import pandas as pd
import numpy as np
import time
import os
from datetime import datetime, timedelta

# Configuration
TARGET_ROWS = 10000000
CSV_FILENAME = "tamil_nadu_rain_prediction_rows.csv"
SELECTED_FEATURES = [
    "temperature_2m", "relative_humidity_2m", "dew_point_2m", "precipitation",
    "rain", "surface_pressure", "cloud_cover", "cloud_cover_low",
    "wind_speed_10m", "wind_direction_10m"
]

TAMIL_NADU_CITIES = [
    ("Ariyalur", 11.1375, 79.0758),
    ("Chennai", 13.0827, 80.2707),
    ("Coimbatore", 11.0168, 76.9558),
    ("Cuddalore", 11.7447, 79.7680),
    ("Dharmapuri", 12.1211, 78.1582),
    ("Dindigul", 10.3621, 77.9765),
    ("Erode", 11.3410, 77.7172),
    ("Kallakurichi", 11.7400, 78.9600),
    ("Kanchipuram", 12.8397, 79.7000),
    ("Kanyakumari", 8.0883, 77.5385),
    ("Karur", 10.9574, 78.0809),
    ("Krishnagiri", 12.5186, 78.2137),
    ("Madurai", 9.9252, 78.1198),
    ("Nagapattinam", 10.7667, 79.8417),
    ("Namakkal", 11.2212, 78.1652),
    ("Nilgiris", 11.4090, 76.6935),
    ("Perambalur", 11.2340, 78.8822),
    ("Pudukkottai", 10.3800, 78.8200),
    ("Ramanathapuram", 9.3716, 78.8307),
    ("Ranipet", 12.9254, 79.3323),
    ("Salem", 11.6643, 78.1460),
    ("Sivaganga", 9.8432, 78.4809),
    ("Tenkasi", 8.9601, 77.3153),
    ("Thanjavur", 10.7869, 79.1378),
    ("Theni", 10.0104, 77.4768),
    ("Thoothukudi", 8.7642, 78.1348),
    ("Tiruchirappalli", 10.7905, 78.7047),
    ("Tirunelveli", 8.7139, 77.7567),
    ("Tirupathur", 12.4959, 78.5679),
    ("Tiruppur", 11.1085, 77.3411),
    ("Tiruvallur", 13.1449, 79.9087),
    ("Tiruvannamalai", 12.2262, 79.0746),
    ("Tiruvarur", 10.7726, 79.6368),
    ("Vellore", 12.9165, 79.1325),
    ("Viluppuram", 11.9427, 79.4973),
    ("Virudhunagar", 9.5827, 77.9807),
    ("Chengalpattu", 12.6821, 79.9769),
    ("Mayiladuthurai", 11.1035, 79.6550)
]

BASE_URL = "https://archive-api.open-meteo.com/v1/archive"
params_template = {
    "hourly": ",".join(SELECTED_FEATURES),
    "timezone": "Asia/Kolkata",
    "models": "best_match"
}

def fetch_weather_data(lat, lon, start_date, end_date):
    params = params_template.copy()
    params.update({
        "latitude": lat,
        "longitude": lon,
        "start_date": start_date,
        "end_date": end_date
    })
    try:
        response = requests.get(BASE_URL, params=params, timeout=45)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data for {lat},{lon}: {e}")
        return None

def process_and_label_data(data, city_name):
    hourly_data = data.get("hourly", {})
    df = pd.DataFrame(hourly_data)
    if df.empty:
        return df

    df["time"] = pd.to_datetime(df["time"])
    df["date"] = df["time"].dt.date
    df["city"] = city_name

    daily_rain = df.groupby('date')['precipitation'].sum().reset_index()
    daily_rain['rain_tomorrow'] = (daily_rain['precipitation'].shift(-1) > 0).astype(int)

    df = df.merge(daily_rain[['date', 'rain_tomorrow']], on='date', how='left')
    df.drop(columns=['date'], inplace=True)

    for col in SELECTED_FEATURES:
        if col not in df.columns:
            df[col] = np.nan

    df = df[SELECTED_FEATURES + ['time', 'city', 'rain_tomorrow']]
    df = df.dropna(subset=['rain_tomorrow'])
    return df

def get_last_date_for_city(city_name, existing_df):
    """Get the last available date for a specific city"""
    if existing_df.empty:
        return datetime(2020, 1, 1).date()

    city_data = existing_df[existing_df['city'] == city_name]
    if city_data.empty:
        return datetime(2020, 1, 1).date()

    max_date = pd.to_datetime(city_data['time']).max().date()
    return max_date + timedelta(days=1)  # Start from next day

def check_duplicates(existing_df, new_df):
    """Check for duplicates between existing and new data"""
    if existing_df.empty or new_df.empty:
        return new_df

    # Create unique identifiers for both dataframes
    existing_df['unique_id'] = existing_df['time'].astype(str) + '_' + existing_df['city']
    new_df['unique_id'] = new_df['time'].astype(str) + '_' + new_df['city']

    # Filter out duplicates
    mask = ~new_df['unique_id'].isin(existing_df['unique_id'])
    result = new_df[mask].copy()

    # Clean up temporary columns
    result.drop(columns=['unique_id'], inplace=True)
    existing_df.drop(columns=['unique_id'], inplace=True)

    return result

def main():
    # Load existing data or create empty dataframe
    if os.path.exists(CSV_FILENAME):
        existing_df = pd.read_csv(CSV_FILENAME)
        existing_df['time'] = pd.to_datetime(existing_df['time'])
        total_rows_written = len(existing_df)
        print(f"Resuming from existing file with {total_rows_written} rows...")
    else:
        existing_df = pd.DataFrame()
        total_rows_written = 0
        print("Starting new data collection...")

    # Process each city in order
    city_index = 0

    while total_rows_written < TARGET_ROWS:
        city_name, lat, lon = TAMIL_NADU_CITIES[city_index]

        # Get the last date we have data for this city
        last_date = get_last_date_for_city(city_name, existing_df)
        current_date = datetime.now().date()

        # If we already have data up to today, skip this city
        if last_date >= current_date:
            print(f"Skipping {city_name} - data already up to date (last date: {last_date})")
            city_index = (city_index + 1) % len(TAMIL_NADU_CITIES)
            continue

        # Set date range (max 30 days per API call)
        start_date = last_date
        end_date = min(start_date + timedelta(days=30), current_date)

        print(f"Fetching data for {city_name} from {start_date} to {end_date}")
        data = fetch_weather_data(lat, lon, str(start_date), str(end_date))

        if data is None:
            print(f"API call failed for {city_name}. Moving to next city...")
            city_index = (city_index + 1) % len(TAMIL_NADU_CITIES)
            time.sleep(2)
            continue

        df_chunk = process_and_label_data(data, city_name)
        if df_chunk.empty:
            print(f"No data retrieved for {city_name} in this period. Moving to next city...")
            city_index = (city_index + 1) % len(TAMIL_NADU_CITIES)
            time.sleep(1)
            continue

        # Check for duplicates using the fixed function
        df_chunk = check_duplicates(existing_df, df_chunk)

        if df_chunk.empty:
            print(f"All fetched data for {city_name} are duplicates. Moving to next city...")
            city_index = (city_index + 1) % len(TAMIL_NADU_CITIES)
            continue

        # Append to CSV
        write_header = not os.path.exists(CSV_FILENAME)
        df_chunk.to_csv(CSV_FILENAME, mode='a', header=write_header, index=False)

        # Update existing_df
        existing_df = pd.concat([existing_df, df_chunk], ignore_index=True)

        rows_added = len(df_chunk)
        total_rows_written += rows_added
        print(f"Added {rows_added} rows for {city_name}. Total: {total_rows_written}/{TARGET_ROWS}")

        # Move to next city
        city_index = (city_index + 1) % len(TAMIL_NADU_CITIES)
        time.sleep(1.5)

if __name__ == "__main__":
    main()

Resuming from existing file with 1891488 rows...
Skipping Ariyalur - data already up to date (last date: 2025-09-05)
Skipping Chennai - data already up to date (last date: 2025-09-05)
Skipping Coimbatore - data already up to date (last date: 2025-09-05)
Skipping Cuddalore - data already up to date (last date: 2025-09-05)
Skipping Dharmapuri - data already up to date (last date: 2025-09-05)
Skipping Dindigul - data already up to date (last date: 2025-09-05)
Skipping Erode - data already up to date (last date: 2025-09-05)
Skipping Kallakurichi - data already up to date (last date: 2025-09-05)
Skipping Kanchipuram - data already up to date (last date: 2025-09-05)
Skipping Kanyakumari - data already up to date (last date: 2025-09-05)
Skipping Karur - data already up to date (last date: 2025-09-05)
Skipping Krishnagiri - data already up to date (last date: 2025-09-05)
Skipping Madurai - data already up to date (last date: 2025-09-05)
Skipping Nagapattinam - data already up to date (last dat

KeyboardInterrupt: 

Preprocessing data

In [2]:
#imports
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
import pyarrow

#mounting drive
csv = pd.read_csv("tamil_nadu_rain_prediction_rows.csv")
# Drop NaN
csv.dropna(subset=csv.columns, inplace=True)

# Drop last 1470 rows due to nan value
csv = csv.iloc[:-1470].reset_index(drop=True)

#encoding object dt
encoder = OneHotEncoder(sparse_output=False)
encoded = encoder.fit_transform(csv[["city"]])
encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(["city"]))

#making numeric column and raw_df
numeric_col = csv.select_dtypes(include=['number'])
Raw_df = pd.concat([numeric_col,encoded_df ,csv["time"]], axis=1)

#saving MinMaxScaler in scaler variable
scaler = MinMaxScaler()

# Scale all except last column
scaled_values = scaler.fit_transform(Raw_df.iloc[:, :-1])
scaled_df = pd.DataFrame(scaled_values, columns=Raw_df.columns[:-1])

# Add back last column
scaled_df[Raw_df.columns[-1]] = Raw_df.iloc[:, -1].values

#finding duplicates
df_no_duplicates = scaled_df.drop_duplicates()

Final_df_preprocessing = df_no_duplicates.reset_index(drop=True)

#saving it
Final_df_preprocessing.to_parquet("TamilNaduWeather_AfterPreprocessing(NoDup,properindex).parquet")

Model training and testing

In [3]:
#imports
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

#save dataframe(parquet)
df_model = pd.read_parquet("TamilNaduWeather_AfterPreprocessing(NoDup,properindex).parquet")

#seperate input and target columns
input_col = df_model.drop(["time", "rain_tomorrow"], axis=1)
target_col = df_model["rain_tomorrow"]

#seperate input and target columns for train and test
input_train, input_test, target_train, target_test = train_test_split(
    input_col, target_col, test_size=0.2, random_state=72)

#Train model
model = LogisticRegression(random_state=42)
model.fit(input_train,target_train)

#Model prediction
prediction_train = model.predict(input_train)
prediction_test = model.predict(input_test)

#Model prediction accuracy
accuracy_score(target_train,prediction_train)
accuracy_score(target_test,prediction_test)

#Dumb ai
majority_class = target_train.mode()[0]

#Dumb ai train
y_pred_dumb_train = np.full_like(target_train, fill_value=majority_class)
accuracy_score(target_train,y_pred_dumb_train)

#Dumb ai test
y_pred_dumb_test = np.full_like(target_test, fill_value=majority_class)
accuracy_score(target_test,y_pred_dumb_test)

#How much was Model prediction of test higher than Dumb ai test accuracy
How_better = accuracy_score(target_test,prediction_test) -accuracy_score(target_test,y_pred_dumb_test)
print(f"This model was better than dumb ai by {How_better * 100 :.2f}%")
print(f"{accuracy_score(target_test,prediction_test)* 100 :.2f}%")


This model was better than dumb ai by 9.70%
73.75%


Saving all data and model with joblib

In [4]:
#imports
import joblib

#making dictionary
TN_rainPredicton_data = {
"model" : model,
"Final_df_preprocessing" : Final_df_preprocessing}

#dumping it in drive
joblib.dump(TN_rainPredicton_data,"TN_rainPredicton_data.joblib")

['TN_rainPredicton_data.joblib']

Predicting Real-world data

In [5]:
import requests
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

CITY_COORDS = {
    "Ariyalur": (11.1375, 79.0758),
    "Chengalpattu": (12.6821, 79.9769),
    "Chennai": (13.0827, 80.2707),
    "Coimbatore": (11.0168, 76.9558),
    "Cuddalore": (11.7447, 79.7680),
    "Dharmapuri": (12.1211, 78.1582),
    "Dindigul": (10.3621, 77.9765),
    "Erode": (11.3410, 77.7172),
    "Kallakurichi": (11.7400, 78.9600),
    "Kanchipuram": (12.8397, 79.7000),
    "Kanyakumari": (8.0883, 77.5385),
    "Karur": (10.9574, 78.0809),
    "Krishnagiri": (12.5186, 78.2137),
    "Madurai": (9.9252, 78.1198),
    "Mayiladuthurai": (11.1035, 79.6550),
    "Nagapattinam": (10.7667, 79.8417),
    "Namakkal": (11.2212, 78.1652),
    "Nilgiris": (11.4090, 76.6935),
    "Perambalur": (11.2340, 78.8822),
    "Pudukkottai": (10.3800, 78.8200),
    "Ramanathapuram": (9.3716, 78.8307),
    "Ranipet": (12.9254, 79.3323),
    "Salem": (11.6643, 78.1460),
    "Sivaganga": (9.8432, 78.4809),
    "Tenkasi": (8.9601, 77.3153),
    "Thanjavur": (10.7869, 79.1378),
    "Theni": (10.0104, 77.4768),
    "Thoothukudi": (8.7642, 78.1348),
    "Tiruchirappalli": (10.7905, 78.7047),
    "Tirunelveli": (8.7139, 77.7567),
    "Tirupathur": (12.4959, 78.5679),
    "Tiruppur": (11.1085, 77.3411),
    "Tiruvallur": (13.1449, 79.9087),
    "Tiruvannamalai": (12.2262, 79.0746),
    "Tiruvarur": (10.7726, 79.6368),
    "Vellore": (12.9165, 79.1325),
    "Viluppuram": (11.9427, 79.4973),
    "Virudhunagar": (9.5827, 77.9807),
}

def get_weather(city):
    if city not in CITY_COORDS:
        city = "Chennai"
        print("City not found. Defaulting to Chennai.")

    lat, lon = CITY_COORDS[city]
    url = f"https://api.open-meteo.com/v1/forecast?latitude={lat}&longitude={lon}&current=temperature_2m,relative_humidity_2m,dew_point_2m,precipitation,rain,surface_pressure,cloud_cover,cloud_cover_low,wind_speed_10m,wind_direction_10m&timezone=auto"

    data = requests.get(url).json()['current']
    data['city'] = city
    return pd.DataFrame([data])

def preprocess_data(df):
    # Remove unwanted columns
    df = df.drop(columns=['time', 'interval'], errors='ignore')

    # Exact city list from training dataset (38 cities)
    expected_cities = [
        "Ariyalur","Chengalpattu","Chennai","Coimbatore","Cuddalore",
        "Dharmapuri","Dindigul","Erode","Kallakurichi","Kanchipuram",
        "Kanyakumari","Karur","Krishnagiri","Madurai","Mayiladuthurai",
        "Nagapattinam","Namakkal","Nilgiris","Perambalur","Pudukkottai",
        "Ramanathapuram","Ranipet","Salem","Sivaganga","Tenkasi",
        "Thanjavur","Theni","Thoothukudi","Tiruchirappalli","Tirunelveli",
        "Tirupathur","Tiruppur","Tiruvallur","Tiruvannamalai","Tiruvarur",
        "Vellore","Viluppuram","Virudhunagar"
    ]

    # One-hot encode city
    encoder = OneHotEncoder(sparse_output=False, categories=[expected_cities], handle_unknown='ignore')
    city_encoded = encoder.fit_transform(df[['city']])
    city_cols = [f"city_{c}" for c in expected_cities]
    city_df = pd.DataFrame(city_encoded, columns=city_cols)

    # Numeric features
    numeric_cols = [
        'temperature_2m', 'relative_humidity_2m', 'dew_point_2m',
        'precipitation', 'rain', 'surface_pressure', 'cloud_cover',
        'cloud_cover_low', 'wind_speed_10m', 'wind_direction_10m'
    ]

    existing_numeric_cols = [col for col in numeric_cols if col in df.columns]
    scaler = MinMaxScaler()
    scaled_numeric = scaler.fit_transform(df[existing_numeric_cols])
    numeric_df = pd.DataFrame(scaled_numeric, columns=existing_numeric_cols)

    # Add missing numeric cols with 0
    for col in numeric_cols:
        if col not in numeric_df.columns:
            numeric_df[col] = 0

    numeric_df = numeric_df[numeric_cols]  # reorder

    # Final dataset in training order
    return pd.concat([numeric_df, city_df], axis=1)

if __name__ == "__main__":
    city = input("Enter Tamil Nadu district: ").strip().title()
    weather_df = get_weather(city)
    processed_data = preprocess_data(weather_df)

    prediction = model.predict(processed_data)
    print("Rain tomorrow" if prediction[0] == 1 else "No rain tomorrow")

Enter Tamil Nadu district:  Chennai


Rain tomorrow
