<a href="https://colab.research.google.com/github/Seb85vickz/CMP7005-Programming-for-Data-Analysis/blob/main/app658.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
!pip install streamlit pandas numpy plotly seaborn matplotlib xgboost scikit-learn



In [12]:
%%writefile app.py
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import warnings

# Silence every warning category for the whole process so library deprecation
# notices do not clutter the app's console output.
warnings.filterwarnings('ignore')

# -------------------------------------------------------------------------------------------------
# 1. PAGE CONFIGURATION & CUSTOM CSS (design requirements)
# -------------------------------------------------------------------------------------------------
# The page icon is the "fog" emoji; it was previously stored as a mis-decoded
# (mojibake) byte sequence instead of the emoji itself.
st.set_page_config(
    page_title="Air Quality Analysis & Prediction",
    page_icon="🌫️",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Theme stylesheet: light-maroon page background, light sea-blue sidebar with a
# maroon border, maroon text/metric values, and maroon buttons with white text.
# NOTE(review): `.css-17lntkn` looks like an auto-generated class name, so that
# rule may not match across Streamlit versions - confirm against the deployed version.
_CUSTOM_CSS = """
    <style>
        /* Main Background - Light Maroon */
        .stApp {
            background-color: #F3E5E8;
        }

        /* Sidebar Background - Light Sea Blue */
        [data-testid="stSidebar"] {
            background-color: #E0FFFF;
            border-right: 2px solid #800000;
        }

        /* Text Colors & Headers - Maroon */
        h1, h2, h3, h4, h5, h6, .stMarkdown, p, label {
            color: #800000 !important;
            font-family: 'Helvetica', sans-serif;
        }

        /* Buttons - Maroon Background, White Text */
        .stButton > button {
            background-color: #800000;
            color: white !important;
            border-radius: 10px;
            border: none;
            font-weight: bold;
        }
        .stButton > button:hover {
            background-color: #A52A2A;
            color: white !important;
        }

        /* Metric Cards */
        [data-testid="stMetricValue"] {
            color: #800000;
        }

        /* Navigation Radio Buttons */
        .stRadio > div {
            background-color: transparent;
        }

        /* Floating effect for sidebar content */
        .css-17lntkn {
            color: #800000;
        }
    </style>
"""

# Raw HTML must be explicitly allowed for the <style> tag to be injected.
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)

# -------------------------------------------------------------------------------------------------
# 2. DATA PROCESSING FUNCTIONS (Logic from Notebook)
# -------------------------------------------------------------------------------------------------
@st.cache_data
def load_and_clean_data(uploaded_file):
    """
    Load an air-quality CSV and return a cleaned, feature-enriched DataFrame.

    Processing steps, in order:
      1. Parse the 'Date' column (unparseable values become NaT).
      2. Sort by 'City' then 'Date' and drop exact duplicate rows.
      3. Fill gaps in every numeric column per city: forward fill, then
         backward fill, then 0 for anything still missing.
      4. Add 'Year', 'Month' (month name) and 'Season' columns derived
         from 'Date'.

    Args:
        uploaded_file: Anything ``pd.read_csv`` accepts; in this app it is
            the object returned by ``st.file_uploader``.

    Returns:
        The cleaned DataFrame, or ``None`` if any step raised. In that case
        the exception text is shown to the user through ``st.error``.
    """
    # Month number -> season label. Rows whose date could not be parsed have
    # no month and therefore get a missing Season instead of silently falling
    # into the last bucket.
    season_by_month = {
        12: 'Winter', 1: 'Winter', 2: 'Winter',
        3: 'Summer', 4: 'Summer', 5: 'Summer',
        6: 'Monsoon', 7: 'Monsoon', 8: 'Monsoon',
        9: 'Post-Monsoon', 10: 'Post-Monsoon', 11: 'Post-Monsoon',
    }

    try:
        df = pd.read_csv(uploaded_file)

        # Date Conversion
        df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
        df.sort_values(by=['City', 'Date'], inplace=True)

        # Remove Duplicates
        df.drop_duplicates(inplace=True)

        # Identify Numeric Columns (captured before 'Year' is added below, so
        # only columns that came from the CSV are gap-filled)
        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

        # Handling Missing Values (Per City Strategy)
        # 1. Forward Fill within City
        df[numeric_cols] = df.groupby('City')[numeric_cols].ffill()
        # 2. Backward Fill within City
        df[numeric_cols] = df.groupby('City')[numeric_cols].bfill()
        # 3. Fill remaining with 0
        df[numeric_cols] = df[numeric_cols].fillna(0)

        # Feature Engineering (Seasonality)
        df['Year'] = df['Date'].dt.year
        df['Month'] = df['Date'].dt.month_name()
        df['Season'] = df['Date'].dt.month.map(season_by_month)

        return df
    except Exception as e:
        # UI boundary: report the problem in the page instead of crashing the
        # app; callers must handle the None return.
        st.error(f"Error processing data: {e}")
        return None

def train_model(df):
    """
    Train an XGBoost regressor that predicts 'AQI' from pollutant columns.

    Only the candidate pollutant columns that are actually present in ``df``
    are used as features. The data is split 80/20 (fixed random seed) and the
    model is scored on the held-out 20%.

    Args:
        df: DataFrame expected to contain an 'AQI' column and at least one of
            the candidate pollutant columns listed below.

    Returns:
        A 4-tuple ``(model, features, score, error)``.
        On success: the fitted ``XGBRegressor``, the list of feature column
        names used, the value of ``model.score`` on the test split, and
        ``None``.
        On failure: ``(None, None, None, message)`` where ``message`` is a
        human-readable explanation.
    """
    features = ['PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3', 'CO', 'SO2', 'O3', 'Benzene', 'Toluene', 'Xylene']
    target = 'AQI'

    if target not in df.columns:
        return None, None, None, "Target column 'AQI' not found."

    # Filter only columns present in the uploaded dataset
    available_features = [col for col in features if col in df.columns]

    # Without any feature column the fit would fail deep inside the library;
    # report it through the same error channel as the missing-target case.
    if not available_features:
        return None, None, None, "No pollutant feature columns found in the dataset."

    # A train/test split needs at least one row on each side.
    if len(df) < 2:
        return None, None, None, "Not enough rows to train and evaluate a model."

    X = df[available_features]
    y = df[target]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = XGBRegressor(n_estimators=100, learning_rate=0.05, max_depth=8, n_jobs=-1, random_state=42)
    model.fit(X_train, y_train)

    score = model.score(X_test, y_test)
    return model, available_features, score, None

# -------------------------------------------------------------------------------------------------
# 3. SIDEBAR NAVIGATION
# -------------------------------------------------------------------------------------------------
# Sidebar: logo, page selector and CSV uploader. `page` and `uploaded_file`
# are module-level names read by the page-rendering code further down.
# Emoji in the labels were previously stored as mis-decoded (mojibake) text.
with st.sidebar:
    st.image("https://cdn-icons-png.flaticon.com/512/2965/2965879.png", width=100)
    st.title("Navigation")

    # The selected label decides which page section is rendered.
    page = st.radio("Go to", ["Data Overview", "Exploratory Data Analysis", "Modelling & Prediction"])

    st.markdown("---")
    st.markdown("### 📂 Upload Data")
    # None until the user picks a file; only .csv files are accepted.
    uploaded_file = st.file_uploader("Upload 'all_cities_combined.csv' or similar", type=['csv'])

    st.markdown("---")
    st.info("💡 **Tip:** Ensure your CSV has columns like City, Date, PM2.5, AQI, etc.")

# -------------------------------------------------------------------------------------------------
# 4. PAGE: DATA OVERVIEW
# -------------------------------------------------------------------------------------------------
# Page router: render the section chosen in the sidebar. Every page needs an
# uploaded CSV; load_and_clean_data returns None (after showing an error) when
# the file cannot be processed, so each page checks for that before using df.
if page == "Data Overview":
    st.title("📊 Data Overview")
    st.markdown("### Dataset Summary & Health Check")

    if uploaded_file is not None:
        df = load_and_clean_data(uploaded_file)

        if df is not None:
            # Metrics Row
            col1, col2, col3, col4 = st.columns(4)
            col1.metric("Total Rows", df.shape[0])
            col2.metric("Total Columns", df.shape[1])
            col3.metric("Cities Covered", df['City'].nunique())
            # Read the year off the min/max timestamps so it stays an integer
            # even when some dates are NaT (the per-row year column would be
            # float in that case and render as e.g. "2015.0").
            col4.metric("Date Range", f"{df['Date'].min().year} - {df['Date'].max().year}")

            # Data Preview
            st.subheader("Data Preview")
            st.dataframe(df.head(), use_container_width=True)

            # Statistical Summary
            st.subheader("Statistical Statistics")
            st.dataframe(df.describe(), use_container_width=True)

            # Missing Values Visual (Although we cleaned them, good to show 0 if clean)
            st.subheader("Missing Values Heatmap (Post-Cleaning)")
            fig_missing = px.imshow(df.isnull(), title="Missing Data Matrix (Cleaned)", color_continuous_scale='Viridis')
            st.plotly_chart(fig_missing, use_container_width=True)

    else:
        st.warning("⚠️ Please upload a dataset in the sidebar to proceed.")

# -------------------------------------------------------------------------------------------------
# 5. PAGE: EXPLORATORY DATA ANALYSIS (EDA)
# -------------------------------------------------------------------------------------------------
elif page == "Exploratory Data Analysis":
    st.title("🔍 Exploratory Data Analysis")

    if uploaded_file is not None:
        df = load_and_clean_data(uploaded_file)

        # Skip the charts when loading failed; the loader already reported why.
        if df is not None:
            # Filters for EDA (rendered in the main page area)
            selected_city = st.selectbox("Select a City to Analyze", df['City'].unique())
            city_df = df[df['City'] == selected_city]

            # 1. Time Series Trend
            st.subheader(f"📈 Pollution Trend: {selected_city}")
            pollutant = st.selectbox("Select Pollutant", ['PM2.5', 'PM10', 'NO2', 'CO', 'SO2', 'O3', 'AQI'])

            fig_line = px.line(city_df, x='Date', y=pollutant, title=f'{pollutant} Levels in {selected_city} Over Time',
                               template='plotly_white', color_discrete_sequence=['#800000'])
            fig_line.update_xaxes(rangeslider_visible=True)
            st.plotly_chart(fig_line, use_container_width=True)

            col_eda_1, col_eda_2 = st.columns(2)

            # 2. Correlation Matrix (all cities, every numeric column)
            with col_eda_1:
                st.subheader("🔗 Correlation Matrix")
                numeric_df = df.select_dtypes(include=[np.number])
                corr_matrix = numeric_df.corr()
                fig_corr = px.imshow(corr_matrix, text_auto=True, aspect="auto",
                                     title="Correlation of Pollutants",
                                     color_continuous_scale='RdBu_r')
                st.plotly_chart(fig_corr, use_container_width=True)

            # 3. Seasonal Analysis (all cities)
            with col_eda_2:
                st.subheader("🍂 Seasonal Analysis")
                # Aggregate the pollutant that is actually selected. A fixed
                # four-column subset used to be averaged here, so choosing
                # CO, SO2 or O3 asked the chart for a column that was not in
                # the aggregated frame.
                seasonal_avg = df.groupby('Season')[[pollutant]].mean().reset_index()
                fig_season = px.bar(seasonal_avg, x='Season', y=pollutant, color='Season',
                                    title=f"Average {pollutant} by Season",
                                    color_discrete_sequence=px.colors.qualitative.Bold)
                st.plotly_chart(fig_season, use_container_width=True)

            # 4. Box Plot Distribution
            st.subheader("📦 Distribution of Key Pollutants")
            fig_box = px.box(df, x='City', y='AQI', color='City', title="AQI Distribution across Cities")
            st.plotly_chart(fig_box, use_container_width=True)

    else:
        st.warning("⚠️ Please upload a dataset in the sidebar to visualize data.")

# -------------------------------------------------------------------------------------------------
# 6. PAGE: MODELLING AND PREDICTION
# -------------------------------------------------------------------------------------------------
elif page == "Modelling & Prediction":
    st.title("🤖 Modelling & AQI Prediction")
    st.markdown("### Predict Air Quality Index (AQI) using Machine Learning (XGBoost)")

    if uploaded_file is not None:
        df = load_and_clean_data(uploaded_file)

        # Only offer training when the data loaded; short-circuiting keeps the
        # button from being rendered (and train_model from receiving None).
        if df is not None and st.button("🚀 Train Model"):
            with st.spinner("Training XGBoost Regressor... Please wait."):
                model, features, score, error = train_model(df)

                if error:
                    st.error(error)
                else:
                    # Keep the fitted model across Streamlit reruns so the
                    # prediction form below survives widget interactions.
                    st.session_state['model'] = model
                    st.session_state['features'] = features
                    st.success(f"Model Trained Successfully! R² Score: {score:.4f}")

        # Prediction Interface
        if 'model' in st.session_state:
            st.markdown("### 🎛️ Enter Pollutant Values")

            # Create input fields dynamically based on features
            input_data = {}
            cols = st.columns(3)
            features = st.session_state['features']

            # Lay the inputs out round-robin across three columns.
            for i, feature in enumerate(features):
                with cols[i % 3]:
                    val = st.number_input(f"{feature}", min_value=0.0, value=0.0)
                    input_data[feature] = val

            if st.button("🔮 Predict AQI"):
                # Single-row frame; dict insertion order keeps the columns in
                # the same order as the training features.
                input_df = pd.DataFrame([input_data])
                prediction = st.session_state['model'].predict(input_df)[0]

                # Determine AQI Bucket
                bucket = ""
                color = ""
                if prediction <= 50: bucket, color = "Good", "green"
                elif prediction <= 100: bucket, color = "Satisfactory", "lightgreen"
                elif prediction <= 200: bucket, color = "Moderate", "yellow"
                elif prediction <= 300: bucket, color = "Poor", "orange"
                elif prediction <= 400: bucket, color = "Very Poor", "red"
                else: bucket, color = "Severe", "darkred"

                st.markdown(f"""
                    <div style="background-color: {color}; padding: 20px; border-radius: 10px; text-align: center;">
                        <h2 style="color: white; margin:0;">Predicted AQI: {prediction:.2f}</h2>
                        <h3 style="color: white; margin:0;">Category: {bucket}</h3>
                    </div>
                """, unsafe_allow_html=True)

    else:
        st.warning("⚠️ Please upload a dataset in the sidebar to train the model.")

# -------------------------------------------------------------------------------------------------
# Footer
# -------------------------------------------------------------------------------------------------
# Footer shown on every page: a horizontal rule followed by a centred maroon
# credit line (raw HTML, hence unsafe_allow_html).
st.markdown("---")
footer_html = (
    "<div style='text-align: center; color: #800000;'>"
    "Designed for CMP7005 Project | Advanced Streamlit Application"
    "</div>"
)
st.markdown(footer_html, unsafe_allow_html=True)

Overwriting app.py


In [8]:
!wget -q -O - ipv4.icanhazip.com

35.185.40.233


In [9]:
!streamlit run app.py


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.


  You can now view your Streamlit app in your browser.

  Local URL: http://localhost:8501
  Network URL: http://172.28.0.12:8501
  External URL: http://35.185.40.233:8501

y
  Stopping...
