# Install Required Libraries
Install the necessary libraries, including Streamlit and scikit-learn.

In [1]:
# Install Required Libraries
%pip install streamlit scikit-learn

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


# Import Libraries
Import the necessary libraries, including pandas, numpy, Streamlit, and scikit-learn.

In [2]:
# Import Libraries
import pandas as pd
import numpy as np
import streamlit as st
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import mean_squared_error, accuracy_score

# Load and Preprocess Data
Load the weather data and preprocess it, including handling missing values and encoding categorical features.

In [3]:
# Load and Preprocess Data

# Load the weather data
df = pd.read_csv('rajbhavan_combined.csv')

# Drop columns with more than 50% missing values
threshold = len(df) * 0.5
df = df.dropna(thresh=threshold, axis=1)

# Fill missing values
for column in df.columns:
    if df[column].dtype == 'object':
        df[column] = df[column].fillna(df[column].mode()[0])
    else:
        df[column] = df[column].fillna(df[column].mean())

# Drop duplicates
df = df.drop_duplicates()

# Handle outliers using IQR
numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns
Q1 = df[numerical_columns].quantile(0.25, numeric_only=True)
Q3 = df[numerical_columns].quantile(0.75, numeric_only=True)
IQR = Q3 - Q1
df = df.loc[~((df[numerical_columns] < (Q1 - 1.5 * IQR)) | 
              (df[numerical_columns] > (Q3 + 1.5 * IQR))).any(axis=1)]

# Normalize numerical columns
scaler = StandardScaler()
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

# Convert Date & Time into useful features
df['Date & Time'] = pd.to_datetime(df['Date & Time'])
df['Year'] = df['Date & Time'].dt.year
df['Month'] = df['Date & Time'].dt.month
df['Day'] = df['Date & Time'].dt.day
df['Hour'] = df['Date & Time'].dt.hour

# Define categorical columns
categorical_columns = df.select_dtypes(include=['object']).columns

# Convert all values in categorical columns to string type
df[categorical_columns] = df[categorical_columns].astype(str)

# Encode categorical features
label_encoders = {}
for column in categorical_columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

# Define target columns
target_columns = ['Cooling Degree Days']

# Split the dataset into train and test sets
X = df.drop(target_columns, axis=1)
Y = df[target_columns]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Drop the 'Date & Time' column
X_train = X_train.drop(columns=['Date & Time'])
X_test = X_test.drop(columns=['Date & Time'])

  df = pd.read_csv('rajbhavan_combined.csv')


# Define Models
Define the models for prediction, including Random Forest, MLP, Decision Tree + Linear Regression, Random Forest + MLP, and Gaussian Naïve + MLP.

In [5]:
# Define Models

# Initialize the models
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
mlp_regressor = MLPClassifier(random_state=42)
decision_tree_regressor = DecisionTreeRegressor(random_state=42)
linear_regressor = LinearRegression()
gaussian_nb = GaussianNB()

# Define the stacking models
from sklearn.ensemble import StackingRegressor, StackingClassifier
from sklearn.linear_model import LogisticRegression

# Decision Tree + Linear Regression
stacking_model_1 = StackingRegressor(
    estimators=[
        ('decision_tree', decision_tree_regressor),
        ('linear_regression', linear_regressor)
    ],
    final_estimator=LinearRegression()
)

# Random Forest + MLP Classifier
stacking_model_2 = StackingClassifier(
    estimators=[
        ('random_forest', RandomForestClassifier(random_state=42)),
        ('mlp', MLPClassifier(random_state=42))
    ],
    final_estimator=LogisticRegression()
)

# Gaussian Naïve Bayes + MLP Classifier
stacking_model_3 = StackingClassifier(
    estimators=[
        ('gaussian_nb', gaussian_nb),
        ('mlp', MLPClassifier(random_state=42))
    ],
    final_estimator=LogisticRegression()
)

# Build Streamlit Interface
Use Streamlit to build the frontend interface where users can select a model, enter input data, and get predictions.

In [8]:
# Build Streamlit Interface

# Define a function to get user input
def get_user_input():
    year = st.number_input('Year', min_value=2023, max_value=2023, value=2023)
    month = st.number_input('Month', min_value=1, max_value=12, value=1)
    day = st.number_input('Day', min_value=1, max_value=31, value=1)
    hour = st.number_input('Hour', min_value=0, max_value=23, value=0)
    temp = st.number_input('Temperature (°C)', min_value=-50.0, max_value=50.0, value=25.0)
    humidity = st.number_input('Humidity (%)', min_value=0.0, max_value=100.0, value=50.0)
    wind_speed = st.number_input('Wind Speed (km/h)', min_value=0.0, max_value=200.0, value=10.0)
    return pd.DataFrame({
        'Year': [year],
        'Month': [month],
        'Day': [day],
        'Hour': [hour],
        'Inside Temp -  C': [temp],
        'Inside Hum - %': [humidity],
        'Wind Speed': [wind_speed],
        'High Inside Temp -  C': [temp],  # Placeholder values
        'Low Inside Temp -  C': [temp],   # Placeholder values
        'High Inside Hum - %': [humidity],  # Placeholder values
        'Low Inside Hum - %': [humidity],   # Placeholder values
        'Inside Dew Point -  C': [temp],  # Placeholder values
        'Inside Heat Index -  C': [temp],  # Placeholder values
        'Barometer - mb': [1012.0],  # Placeholder values
        'High Bar - mb': [1012.0],   # Placeholder values
        'Low Bar - mb': [1012.0],    # Placeholder values
        'Absolute Pressure - mb': [1010.0]  # Placeholder values
    })

# Streamlit app
st.title('Weather Prediction')

# Model selection
model_option = st.selectbox(
    'Select a model',
    ('Random Forest', 'MLP', 'Decision Tree + Linear Regression', 'Random Forest + MLP Classifier', 'Gaussian Naïve Bayes + MLP Classifier')
)

# Get user input
user_input = get_user_input()

# Preprocess user input
user_input[numerical_columns] = scaler.transform(user_input[numerical_columns])

# Make prediction
if model_option == 'Random Forest':
    rf_regressor.fit(X_train, Y_train.values.ravel())
    prediction = rf_regressor.predict(user_input)
elif model_option == 'MLP':
    mlp_regressor.fit(X_train, Y_train.values.ravel())
    prediction = mlp_regressor.predict(user_input)
elif model_option == 'Decision Tree + Linear Regression':
    stacking_model_1.fit(X_train, Y_train.values.ravel())
    prediction = stacking_model_1.predict(user_input)
elif model_option == 'Random Forest + MLP Classifier':
    stacking_model_2.fit(X_train, Y_train.values.ravel())
    prediction = stacking_model_2.predict(user_input)
elif model_option == 'Gaussian Naïve Bayes + MLP Classifier':
    stacking_model_3.fit(X_train, Y_train.values.ravel())
    prediction = stacking_model_3.predict(user_input)

# Display prediction
st.write('Prediction:', prediction)





ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- Wind Speed
Feature names seen at fit time, yet now missing:
- Avg Wind Speed - km/h
- Dew Point -  C
- ET - in
- Heat Index -  C
- Heating Degree Days
- ...


# Make Predictions
Use the selected model to make predictions based on user input and display the results on the screen.

In [9]:
# Make Predictions
# Use the selected model to make predictions based on user input and display the results on the screen.

# Streamlit app
st.title('Weather Prediction')

# Model selection
model_option = st.selectbox(
    'Select a model',
    ('Random Forest', 'MLP', 'Decision Tree + Linear Regression', 'Random Forest + MLP Classifier', 'Gaussian Naïve Bayes + MLP Classifier')
)

# Get user input
user_input = get_user_input()

# Preprocess user input
user_input[numerical_columns] = scaler.transform(user_input[numerical_columns])

# Make prediction
if model_option == 'Random Forest':
    rf_regressor.fit(X_train, Y_train.values.ravel())
    prediction = rf_regressor.predict(user_input)
elif model_option == 'MLP':
    mlp_regressor.fit(X_train, Y_train.values.ravel())
    prediction = mlp_regressor.predict(user_input)
elif model_option == 'Decision Tree + Linear Regression':
    stacking_model_1.fit(X_train, Y_train.values.ravel())
    prediction = stacking_model_1.predict(user_input)
elif model_option == 'Random Forest + MLP Classifier':
    stacking_model_2.fit(X_train, Y_train.values.ravel())
    prediction = stacking_model_2.predict(user_input)
elif model_option == 'Gaussian Naïve Bayes + MLP Classifier':
    stacking_model_3.fit(X_train, Y_train.values.ravel())
    prediction = stacking_model_3.predict(user_input)

# Display prediction
st.write('Prediction:', prediction)



ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- Wind Speed
Feature names seen at fit time, yet now missing:
- Avg Wind Speed - km/h
- Dew Point -  C
- ET - in
- Heat Index -  C
- Heating Degree Days
- ...
