In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler
import pandas as pd

CSV_FILE_PATH = '/Users/phongporter/Documents/GITHUB/cos30049/FaR_ver2/backend/data/air_quality_health_2.csv'
def train_linear_regression(dataset, country_name, pollutant, exposure_value):

    Q1_exposure = dataset['Exposure Mean'].quantile(0.25)
    Q3_exposure = dataset['Exposure Mean'].quantile(0.75)
    IQR_exposure = Q3_exposure - Q1_exposure
    lower_bound_exposure = Q1_exposure - 1.5 * IQR_exposure
    upper_bound_exposure = Q3_exposure + 1.5 * IQR_exposure

    Q1_burden = dataset['Burden Mean'].quantile(0.25)
    Q3_burden = dataset['Burden Mean'].quantile(0.75)
    IQR_burden = Q3_burden - Q1_burden
    lower_bound_burden = Q1_burden - 1.5 * IQR_burden
    upper_bound_burden = Q3_burden + 1.5 * IQR_burden

    dataset_cleaned = dataset[
        (dataset['Exposure Mean'] >= lower_bound_exposure) & 
        (dataset['Exposure Mean'] <= upper_bound_exposure) & 
        (dataset['Burden Mean'] >= lower_bound_burden) & 
        (dataset['Burden Mean'] <= upper_bound_burden)
    ]
    filtered_df = dataset_cleaned[(dataset_cleaned['Cause_Name'] == 'All causes')]
    filtered_df1 = filtered_df[filtered_df['Country'] == country_name]    
    filtered_df2 = filtered_df1[filtered_df1['Pollutant'] == pollutant]    

    if len(filtered_df2) == 0:
        return f"Filtered Data Points : {len(filtered_df2)}"
    
    X = filtered_df2[['Exposure Mean']].values
    y = filtered_df2['Burden Mean'].values

    scaler_X = MinMaxScaler()
    scaler_y = MinMaxScaler()
    X_scaled = scaler_X.fit_transform(X)
    y_scaled = scaler_y.fit_transform(y.reshape(-1, 1))

    # Linear Regression Model
    model1 = LinearRegression()
    model1.fit(X_scaled, y_scaled)
    burden_prediction1 = model1.predict(scaler_X.transform([[exposure_value]]))[0]
    burden_prediction_rescaled1 = scaler_y.inverse_transform([burden_prediction1])[0, 0]

    # Polynomial Regression Model (degree 4)
    poly = PolynomialFeatures(degree=4)
    X_poly = poly.fit_transform(X_scaled)
    model2 = LinearRegression()
    model2.fit(X_poly, y_scaled)
    exposure_value_poly = poly.transform(scaler_X.transform([[exposure_value]]))
    burden_prediction2 = model2.predict(exposure_value_poly)[0]
    burden_prediction_rescaled2 = scaler_y.inverse_transform([burden_prediction2])[0, 0] 

    prediction = max(burden_prediction_rescaled1, burden_prediction_rescaled2)
    
    return {
        "Predicted Burden 1": float(burden_prediction_rescaled1),
        "Predicted Burden 2": float(burden_prediction_rescaled2),
        "Predicted Burden Mean": float(prediction),
        "Filtered Data Points": len(filtered_df2)
    }


In [12]:
df = pd.read_csv(CSV_FILE_PATH)
train_linear_regression(df, "Viet Nam", "no2", 13.2)

{'Predicted Burden 1': 3.3042144317399025,
 'Predicted Burden 2': 3.15851324109461,
 'Predicted Burden Mean': 3.3042144317399025,
 'Filtered Data Points': 31}

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
import pandas as pd

CSV_FILE_PATH = '/Users/phongporter/Documents/GITHUB/cos30049/FaR_ver2/backend/data/air_quality_health_2.csv'

def train_linear_regression(dataset, country_name, pollutant, exposure_value):
    # Outlier removal for 'Exposure Mean'
    Q1_exposure = dataset['Exposure Mean'].quantile(0.25)
    Q3_exposure = dataset['Exposure Mean'].quantile(0.75)
    IQR_exposure = Q3_exposure - Q1_exposure
    lower_bound_exposure = Q1_exposure - 1.5 * IQR_exposure
    upper_bound_exposure = Q3_exposure + 1.5 * IQR_exposure

    # Outlier removal for 'Burden Mean'
    Q1_burden = dataset['Burden Mean'].quantile(0.25)
    Q3_burden = dataset['Burden Mean'].quantile(0.75)
    IQR_burden = Q3_burden - Q1_burden
    lower_bound_burden = Q1_burden - 1.5 * IQR_burden
    upper_bound_burden = Q3_burden + 1.5 * IQR_burden

    # Clean dataset by removing outliers
    dataset_cleaned = dataset[
        (dataset['Exposure Mean'] >= lower_bound_exposure) & 
        (dataset['Exposure Mean'] <= upper_bound_exposure) & 
        (dataset['Burden Mean'] >= lower_bound_burden) & 
        (dataset['Burden Mean'] <= upper_bound_burden)
    ]

    # Filter dataset based on specific criteria
    filtered_df = dataset_cleaned[
        (dataset_cleaned['Cause_Name'] == 'All causes') &
        (dataset_cleaned['Country'] == country_name) &
        (dataset_cleaned['Pollutant'] == pollutant)
    ]

    if len(filtered_df) == 0:
        return f"Filtered Data Points : {len(filtered_df)}"

    # Feature and target variables
    X = filtered_df[['Exposure Mean']].values
    y = filtered_df['Burden Mean'].values

    # Scale features and target
    scaler_X = MinMaxScaler()
    scaler_y = MinMaxScaler()
    X_scaled = scaler_X.fit_transform(X)
    y_scaled = scaler_y.fit_transform(y.reshape(-1, 1))

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y_scaled, test_size=0.2, random_state=42
    )

    # ----- Linear Regression Model -----
    model_linear = LinearRegression()
    model_linear.fit(X_train, y_train)
    y_pred_linear = model_linear.predict(X_test)

    # Evaluation Metrics for Linear Regression
    mse_linear = mean_squared_error(y_test, y_pred_linear)
    r2_linear = r2_score(y_test, y_pred_linear)

    # ----- Polynomial Regression Model (degree 4) -----
    poly = PolynomialFeatures(degree=4)
    X_train_poly = poly.fit_transform(X_train)
    X_test_poly = poly.transform(X_test)

    model_poly = LinearRegression()
    model_poly.fit(X_train_poly, y_train)
    y_pred_poly = model_poly.predict(X_test_poly)

    # Evaluation Metrics for Polynomial Regression
    mse_poly = mean_squared_error(y_test, y_pred_poly)
    r2_poly = r2_score(y_test, y_pred_poly)

    # ----- Make Prediction for the Given Exposure Value -----
    # For Linear Regression
    burden_prediction_linear = model_linear.predict(scaler_X.transform([[exposure_value]]))[0]
    burden_prediction_rescaled_linear = scaler_y.inverse_transform([burden_prediction_linear])[0, 0]

    # For Polynomial Regression
    exposure_value_scaled = scaler_X.transform([[exposure_value]])
    exposure_value_poly = poly.transform(exposure_value_scaled)
    burden_prediction_poly = model_poly.predict(exposure_value_poly)[0]
    burden_prediction_rescaled_poly = scaler_y.inverse_transform([burden_prediction_poly])[0, 0]

    # Determine which model is better
    if mse_linear < mse_poly and r2_linear > r2_poly:
        better_model = "Linear Regression"
    elif mse_poly < mse_linear and r2_poly > r2_linear:
        better_model = "Polynomial Regression"
    else:
        better_model = "Both models have trade-offs"

    prediction_mean = max(burden_prediction_rescaled_linear, burden_prediction_rescaled_poly)

    return {
        "Predicted Burden Linear": float(burden_prediction_rescaled_linear),
        "Predicted Burden Polynomial": float(burden_prediction_rescaled_poly),
        "Predicted Burden Mean": float(prediction_mean),
        "Mean Squared Error Linear": float(mse_linear),
        "R2 Score Linear": float(r2_linear),
        "Mean Squared Error Polynomial": float(mse_poly),
        "R2 Score Polynomial": float(r2_poly),
        "Better Model": better_model,
        "Filtered Data Points": len(filtered_df)
    }


In [41]:
# Load your dataset
dataset = pd.read_csv(CSV_FILE_PATH)

# Define parameters
country_name = "Russian Federation"
pollutant = "pm25"
exposure_value = 13.2

# Train models and get evaluation metrics
results = train_linear_regression(dataset, country_name, pollutant, exposure_value)

# Display the results
print(results)


Filtered Data Points : 0


In [39]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
import pandas as pd

CSV_FILE_PATH = '/Users/phongporter/Documents/GITHUB/cos30049/FaR_ver2/backend/data/air_quality_health_2.csv'

def train_linear_regression(dataset, country_name, pollutant, exposure_value):
    # Outlier removal for 'Exposure Mean'
    Q1_exposure = dataset['Exposure Mean'].quantile(0.25)
    Q3_exposure = dataset['Exposure Mean'].quantile(0.75)
    IQR_exposure = Q3_exposure - Q1_exposure
    lower_bound_exposure = Q1_exposure - 1.5 * IQR_exposure
    upper_bound_exposure = Q3_exposure + 1.5 * IQR_exposure

    # Outlier removal for 'Burden Mean'
    Q1_burden = dataset['Burden Mean'].quantile(0.25)
    Q3_burden = dataset['Burden Mean'].quantile(0.75)
    IQR_burden = Q3_burden - Q1_burden
    lower_bound_burden = Q1_burden - 1.5 * IQR_burden
    upper_bound_burden = Q3_burden + 1.5 * IQR_burden

    # Clean dataset by removing outliers
    dataset_cleaned = dataset[
        (dataset['Exposure Mean'] >= lower_bound_exposure) & 
        (dataset['Exposure Mean'] <= upper_bound_exposure) & 
        (dataset['Burden Mean'] >= lower_bound_burden) & 
        (dataset['Burden Mean'] <= upper_bound_burden)
    ]

    # Filter dataset based on specific criteria
    filtered_df = dataset_cleaned[
        (dataset_cleaned['Cause_Name'] == 'All causes') &
        (dataset_cleaned['Country'] == country_name) &
        (dataset_cleaned['Pollutant'] == pollutant)
    ]

    if len(filtered_df) == 0:
        return f"Filtered Data Points : {len(filtered_df)}"

    # Feature and target variables
    X = filtered_df[['Exposure Mean']].values
    y = filtered_df['Burden Mean'].values

    # Scale features and target
    scaler_X = MinMaxScaler()
    scaler_y = MinMaxScaler()
    X_scaled = scaler_X.fit_transform(X)
    y_scaled = scaler_y.fit_transform(y.reshape(-1, 1))

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y_scaled, test_size=0.2, random_state=42
    )

    # ----- Linear Regression Model -----
    model_linear = LinearRegression()
    model_linear.fit(X_train, y_train)
    y_pred_linear = model_linear.predict(X_test)

    # Evaluation Metrics for Linear Regression
    mse_linear = mean_squared_error(y_test, y_pred_linear)
    r2_linear = r2_score(y_test, y_pred_linear)

    # ----- Polynomial Regression Model (degree 4) -----
    poly = PolynomialFeatures(degree=4)
    X_train_poly = poly.fit_transform(X_train)
    X_test_poly = poly.transform(X_test)

    model_poly = LinearRegression()
    model_poly.fit(X_train_poly, y_train)
    y_pred_poly = model_poly.predict(X_test_poly)

    # Evaluation Metrics for Polynomial Regression
    mse_poly = mean_squared_error(y_test, y_pred_poly)
    r2_poly = r2_score(y_test, y_pred_poly)

    # ----- Make Prediction for the Given Exposure Value -----
    # For Linear Regression
    burden_prediction_linear = model_linear.predict(scaler_X.transform([[exposure_value]]))[0]
    burden_prediction_rescaled_linear = scaler_y.inverse_transform([burden_prediction_linear])[0, 0]

    # For Polynomial Regression
    exposure_value_scaled = scaler_X.transform([[exposure_value]])
    exposure_value_poly = poly.transform(exposure_value_scaled)
    burden_prediction_poly = model_poly.predict(exposure_value_poly)[0]
    burden_prediction_rescaled_poly = scaler_y.inverse_transform([burden_prediction_poly])[0, 0]

    # Determine which model is better
    if mse_linear < mse_poly and r2_linear > r2_poly:
        better_model = "Linear Regression"
        prediction = burden_prediction_rescaled_linear
    elif mse_poly < mse_linear and r2_poly > r2_linear:
        better_model = "Polynomial Regression"
        prediction = burden_prediction_rescaled_poly
    else:
        return {
            "Message": "Both models have similar performance; no significant difference in results."
        }

    return {
        "Better Model": better_model,
        "Predicted Burden": float(prediction),
        "Filtered Data Points": len(filtered_df)
    }


In [38]:
# Load your dataset
dataset = pd.read_csv(CSV_FILE_PATH)

# Define parameters
country_name = "Russian Federation"
pollutant = "pm25"
exposure_value = 25.9

# Train models and get evaluation metrics
results = train_linear_regression(dataset, country_name, pollutant, exposure_value)

# Display the results
print(results)


Filtered Data Points : 0


In [None]:
def train_linear_regression_filtered_first(dataset, country_name, pollutant, exposure_value):
    # Filter dataset based on country and pollutant first
    filtered_df = dataset[
        (dataset['Country'] == country_name) &
        (dataset['Pollutant'] == pollutant)
    ]

    # Check if there are any data points left after initial filtering
    if len(filtered_df) == 0:
        return f"Filtered Data Points : {len(filtered_df)}"
    
    # Outlier removal for 'Exposure Mean'
    Q1_exposure = dataset['Exposure Mean'].quantile(0.25)
    Q3_exposure = dataset['Exposure Mean'].quantile(0.75)
    IQR_exposure = Q3_exposure - Q1_exposure
    lower_bound_exposure = Q1_exposure - 1.5 * IQR_exposure
    upper_bound_exposure = Q3_exposure + 1.5 * IQR_exposure

    # Outlier removal for 'Burden Mean'
    Q1_burden = dataset['Burden Mean'].quantile(0.25)
    Q3_burden = dataset['Burden Mean'].quantile(0.75)
    IQR_burden = Q3_burden - Q1_burden
    lower_bound_burden = Q1_burden - 1.5 * IQR_burden
    upper_bound_burden = Q3_burden + 1.5 * IQR_burden

    # Clean dataset by removing outliers after initial filtering
    dataset_cleaned = filtered_df[
        (filtered_df['Exposure Mean'] >= lower_bound_exposure) & 
        (filtered_df['Exposure Mean'] <= upper_bound_exposure) & 
        (filtered_df['Burden Mean'] >= lower_bound_burden) & 
        (filtered_df['Burden Mean'] <= upper_bound_burden)
    ]

    # Check if data is available after removing outliers
    if len(dataset_cleaned) == 0:
        return f"Filtered Data Points : {len(dataset_cleaned)}"

    # Feature and target variables
    X = dataset_cleaned[['Exposure Mean']].values
    y = dataset_cleaned['Burden Mean'].values

    # Scale features and target
    scaler_X = MinMaxScaler()
    scaler_y = MinMaxScaler()
    X_scaled = scaler_X.fit_transform(X)
    y_scaled = scaler_y.fit_transform(y.reshape(-1, 1))

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y_scaled, test_size=0.2, random_state=42
    )

    # ----- Linear Regression Model -----
    model_linear = LinearRegression()
    model_linear.fit(X_train, y_train)
    y_pred_linear = model_linear.predict(X_test)

    # Evaluation Metrics for Linear Regression
    mse_linear = mean_squared_error(y_test, y_pred_linear)
    r2_linear = r2_score(y_test, y_pred_linear)

    # ----- Polynomial Regression Model (degree 4) -----
    poly = PolynomialFeatures(degree=4)
    X_train_poly = poly.fit_transform(X_train)
    X_test_poly = poly.transform(X_test)

    model_poly = LinearRegression()
    model_poly.fit(X_train_poly, y_train)
    y_pred_poly = model_poly.predict(X_test_poly)

    # Evaluation Metrics for Polynomial Regression
    mse_poly = mean_squared_error(y_test, y_pred_poly)
    r2_poly = r2_score(y_test, y_pred_poly)

    # ----- Make Prediction for the Given Exposure Value -----
    # For Linear Regression
    burden_prediction_linear = model_linear.predict(scaler_X.transform([[exposure_value]]))[0]
    burden_prediction_rescaled_linear = scaler_y.inverse_transform([burden_prediction_linear])[0, 0]

    # For Polynomial Regression
    exposure_value_scaled = scaler_X.transform([[exposure_value]])
    exposure_value_poly = poly.transform(exposure_value_scaled)
    burden_prediction_poly = model_poly.predict(exposure_value_poly)[0]
    burden_prediction_rescaled_poly = scaler_y.inverse_transform([burden_prediction_poly])[0, 0]

    # Determine which model is better
    if mse_linear < mse_poly and r2_linear > r2_poly:
        better_model = "Linear Regression"
        prediction = burden_prediction_rescaled_linear
    elif mse_poly < mse_linear and r2_poly > r2_linear:
        better_model = "Polynomial Regression"
        prediction = burden_prediction_rescaled_poly
    else:
        return {
            "Message": "Both models have similar performance; no significant difference in results."
        }

    return {
        "Better Model": better_model,
        "Predicted Burden": float(prediction),
        "Filtered Data Points": len(dataset_cleaned)
    }

In [45]:
# Testing with RUS data for pollutant 'pm25' and exposure value 25.5 with initial filtering
result_filtered_first = train_linear_regression_filtered_first(dataset, 'Russian Federation', 'pm25', 25.5)
result_filtered_first

{'Better Model': 'Linear Regression',
 'Predicted Burden': 127.87585191871547,
 'Filtered Data Points': 145}

In [47]:
# Testing with RUS data for pollutant 'pm25' and exposure value 25.5 with initial filtering
result_filtered_first = train_linear_regression_filtered_first(dataset, 'Viet Nam', 'no2', 12.3)
result_filtered_first

{'Better Model': 'Polynomial Regression',
 'Predicted Burden': 2.9586221460632105,
 'Filtered Data Points': 62}

In [33]:
df1 = df[df['Country'] == 'Russian Federation']
df2 = df1[df1['Cause_Name'] == 'All causes']
df3 = df2[df2['Pollutant'] == 'pm25']
df3

Unnamed: 0.1,Unnamed: 0,Country,ISO3,Year,Pollutant,Exposure Mean,Units,Cause_Name,Burden Mean,Measure,Metric
45137,45137,Russian Federation,RUS,1990,pm25,25.9,µg/m3,All causes,2806.644047,DALYs (Disability-Adjusted Life Years),Rate
45146,45146,Russian Federation,RUS,1991,pm25,25.5,µg/m3,All causes,2762.350876,DALYs (Disability-Adjusted Life Years),Rate
45151,45151,Russian Federation,RUS,1992,pm25,25.0,µg/m3,All causes,2881.860079,DALYs (Disability-Adjusted Life Years),Rate
45159,45159,Russian Federation,RUS,1993,pm25,24.5,µg/m3,All causes,3355.88092,DALYs (Disability-Adjusted Life Years),Rate
45167,45167,Russian Federation,RUS,1994,pm25,24.0,µg/m3,All causes,3582.484863,DALYs (Disability-Adjusted Life Years),Rate
45171,45171,Russian Federation,RUS,1995,pm25,23.5,µg/m3,All causes,3333.626676,DALYs (Disability-Adjusted Life Years),Rate
45181,45181,Russian Federation,RUS,1996,pm25,23.0,µg/m3,All causes,3045.893917,DALYs (Disability-Adjusted Life Years),Rate
45187,45187,Russian Federation,RUS,1997,pm25,22.5,µg/m3,All causes,2807.051,DALYs (Disability-Adjusted Life Years),Rate
45192,45192,Russian Federation,RUS,1998,pm25,22.0,µg/m3,All causes,2698.452049,DALYs (Disability-Adjusted Life Years),Rate
45203,45203,Russian Federation,RUS,1999,pm25,21.4,µg/m3,All causes,2836.689666,DALYs (Disability-Adjusted Life Years),Rate
