In [1]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler
import pandas as pd

CSV_FILE_PATH = '/Users/phongporter/Documents/GITHUB/cos30049/FaR_ver2/backend/data/air_quality_health_2.csv'
def train_linear_regression(dataset, country_name, pollutant, exposure_value):

    Q1_exposure = dataset['Exposure Mean'].quantile(0.25)
    Q3_exposure = dataset['Exposure Mean'].quantile(0.75)
    IQR_exposure = Q3_exposure - Q1_exposure
    lower_bound_exposure = Q1_exposure - 1.5 * IQR_exposure
    upper_bound_exposure = Q3_exposure + 1.5 * IQR_exposure

    Q1_burden = dataset['Burden Mean'].quantile(0.25)
    Q3_burden = dataset['Burden Mean'].quantile(0.75)
    IQR_burden = Q3_burden - Q1_burden
    lower_bound_burden = Q1_burden - 1.5 * IQR_burden
    upper_bound_burden = Q3_burden + 1.5 * IQR_burden

    dataset_cleaned = dataset[
        (dataset['Exposure Mean'] >= lower_bound_exposure) & 
        (dataset['Exposure Mean'] <= upper_bound_exposure) & 
        (dataset['Burden Mean'] >= lower_bound_burden) & 
        (dataset['Burden Mean'] <= upper_bound_burden)
    ]
    filtered_df = dataset_cleaned[(dataset_cleaned['Cause_Name'] == 'All causes')]
    filtered_df1 = filtered_df[filtered_df['Country'] == country_name]    
    filtered_df2 = filtered_df1[filtered_df1['Pollutant'] == pollutant]    

    if len(filtered_df2) == 0:
        return f"Filtered Data Points : {len(filtered_df2)}"
    
    X = filtered_df2[['Exposure Mean']].values
    y = filtered_df2['Burden Mean'].values

    scaler_X = MinMaxScaler()
    scaler_y = MinMaxScaler()
    X_scaled = scaler_X.fit_transform(X)
    y_scaled = scaler_y.fit_transform(y.reshape(-1, 1))

    # Linear Regression Model
    model1 = LinearRegression()
    model1.fit(X_scaled, y_scaled)
    burden_prediction1 = model1.predict(scaler_X.transform([[exposure_value]]))[0]
    burden_prediction_rescaled1 = scaler_y.inverse_transform([burden_prediction1])[0, 0]

    # Polynomial Regression Model (degree 4)
    poly = PolynomialFeatures(degree=4)
    X_poly = poly.fit_transform(X_scaled)
    model2 = LinearRegression()
    model2.fit(X_poly, y_scaled)
    exposure_value_poly = poly.transform(scaler_X.transform([[exposure_value]]))
    burden_prediction2 = model2.predict(exposure_value_poly)[0]
    burden_prediction_rescaled2 = scaler_y.inverse_transform([burden_prediction2])[0, 0] 

    prediction = max(burden_prediction_rescaled1, burden_prediction_rescaled2)
    
    return {
        "Predicted Burden 1": float(burden_prediction_rescaled1),
        "Predicted Burden 2": float(burden_prediction_rescaled2),
        "Predicted Burden Mean": float(prediction),
        "Filtered Data Points": len(filtered_df2)
    }


In [3]:
df = pd.read_csv(CSV_FILE_PATH)
train_linear_regression(df, "Viet Nam", "no2", 2.3)

{'Predicted Burden 1': 0.08502207458563701,
 'Predicted Burden 2': -7.836101319745463,
 'Predicted Burden Mean': 0.08502207458563701,
 'Filtered Data Points': 31}

In [17]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
import pandas as pd

CSV_FILE_PATH = '/Users/phongporter/Documents/GITHUB/cos30049/FaR_ver2/backend/data/air_quality_health_2.csv'

def train_linear_regression(dataset, country_name, pollutant, exposure_value):
    # Outlier removal for 'Exposure Mean'
    Q1_exposure = dataset['Exposure Mean'].quantile(0.25)
    Q3_exposure = dataset['Exposure Mean'].quantile(0.75)
    IQR_exposure = Q3_exposure - Q1_exposure
    lower_bound_exposure = Q1_exposure - 1.5 * IQR_exposure
    upper_bound_exposure = Q3_exposure + 1.5 * IQR_exposure

    # Outlier removal for 'Burden Mean'
    Q1_burden = dataset['Burden Mean'].quantile(0.25)
    Q3_burden = dataset['Burden Mean'].quantile(0.75)
    IQR_burden = Q3_burden - Q1_burden
    lower_bound_burden = Q1_burden - 1.5 * IQR_burden
    upper_bound_burden = Q3_burden + 1.5 * IQR_burden

    # Clean dataset by removing outliers
    dataset_cleaned = dataset[
        (dataset['Exposure Mean'] >= lower_bound_exposure) & 
        (dataset['Exposure Mean'] <= upper_bound_exposure) & 
        (dataset['Burden Mean'] >= lower_bound_burden) & 
        (dataset['Burden Mean'] <= upper_bound_burden)
    ]

    # Filter dataset based on specific criteria
    filtered_df = dataset_cleaned[
        (dataset_cleaned['Cause_Name'] == 'All causes') &
        (dataset_cleaned['Country'] == country_name) &
        (dataset_cleaned['Pollutant'] == pollutant)
    ]

    if len(filtered_df) == 0:
        return f"Filtered Data Points : {len(filtered_df)}"

    # Feature and target variables
    X = filtered_df[['Exposure Mean']].values
    y = filtered_df['Burden Mean'].values

    # Scale features and target
    scaler_X = MinMaxScaler()
    scaler_y = MinMaxScaler()
    X_scaled = scaler_X.fit_transform(X)
    y_scaled = scaler_y.fit_transform(y.reshape(-1, 1))

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y_scaled, test_size=0.2, random_state=42
    )

    # ----- Linear Regression Model -----
    model_linear = LinearRegression()
    model_linear.fit(X_train, y_train)
    y_pred_linear = model_linear.predict(X_test)

    # Evaluation Metrics for Linear Regression
    mse_linear = mean_squared_error(y_test, y_pred_linear)
    r2_linear = r2_score(y_test, y_pred_linear)

    # ----- Polynomial Regression Model (degree 1) -----
    poly = PolynomialFeatures(degree=1)
    X_train_poly = poly.fit_transform(X_train)
    X_test_poly = poly.transform(X_test)

    model_poly = LinearRegression()
    model_poly.fit(X_train_poly, y_train)
    y_pred_poly = model_poly.predict(X_test_poly)

    # Evaluation Metrics for Polynomial Regression
    mse_poly = mean_squared_error(y_test, y_pred_poly)
    r2_poly = r2_score(y_test, y_pred_poly)

    # ----- Make Prediction for the Given Exposure Value -----
    # For Linear Regression
    burden_prediction_linear = model_linear.predict(scaler_X.transform([[exposure_value]]))[0]
    burden_prediction_rescaled_linear = scaler_y.inverse_transform([burden_prediction_linear])[0, 0]

    # For Polynomial Regression
    exposure_value_scaled = scaler_X.transform([[exposure_value]])
    exposure_value_poly = poly.transform(exposure_value_scaled)
    burden_prediction_poly = model_poly.predict(exposure_value_poly)[0]
    burden_prediction_rescaled_poly = scaler_y.inverse_transform([burden_prediction_poly])[0, 0]

    # Determine which model is better
    if mse_linear < mse_poly and r2_linear > r2_poly:
        better_model = "Linear Regression"
    elif mse_poly < mse_linear and r2_poly > r2_linear:
        better_model = "Polynomial Regression"
    else:
        better_model = "Both models have trade-offs"

    prediction_mean = max(burden_prediction_rescaled_linear, burden_prediction_rescaled_poly)

    return {
        "Predicted Burden Linear": float(burden_prediction_rescaled_linear),
        "Predicted Burden Polynomial": float(burden_prediction_rescaled_poly),
        "Predicted Burden Mean": float(prediction_mean),
        "Mean Squared Error Linear": float(mse_linear),
        "R2 Score Linear": float(r2_linear),
        "Mean Squared Error Polynomial": float(mse_poly),
        "R2 Score Polynomial": float(r2_poly),
        "Better Model": better_model,
        "Filtered Data Points": len(filtered_df)
    }


In [19]:
# Load your dataset
dataset = pd.read_csv(CSV_FILE_PATH)

# Define parameters
country_name = "Viet Nam"
pollutant = "no2"
exposure_value = 0.3

# Train models and get evaluation metrics
results = train_linear_regression(dataset, country_name, pollutant, exposure_value)

# Display the results
print(results)


{'Predicted Burden Linear': -0.5868769750262884, 'Predicted Burden Polynomial': -0.5868769750262877, 'Predicted Burden Mean': -0.5868769750262877, 'Mean Squared Error Linear': 0.005351492384951429, 'R2 Score Linear': 0.9258783628790088, 'Mean Squared Error Polynomial': 0.00535149238495143, 'R2 Score Polynomial': 0.9258783628790087, 'Better Model': 'Linear Regression', 'Filtered Data Points': 31}


In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
import pandas as pd

CSV_FILE_PATH = '/Users/phongporter/Documents/GITHUB/cos30049/FaR_ver2/backend/data/air_quality_health_2.csv'

def train_linear_regression(dataset, country_name, pollutant, exposure_value):
    # Outlier removal for 'Exposure Mean'
    Q1_exposure = dataset['Exposure Mean'].quantile(0.25)
    Q3_exposure = dataset['Exposure Mean'].quantile(0.75)
    IQR_exposure = Q3_exposure - Q1_exposure
    lower_bound_exposure = Q1_exposure - 1.5 * IQR_exposure
    upper_bound_exposure = Q3_exposure + 1.5 * IQR_exposure

    # Outlier removal for 'Burden Mean'
    Q1_burden = dataset['Burden Mean'].quantile(0.25)
    Q3_burden = dataset['Burden Mean'].quantile(0.75)
    IQR_burden = Q3_burden - Q1_burden
    lower_bound_burden = Q1_burden - 1.5 * IQR_burden
    upper_bound_burden = Q3_burden + 1.5 * IQR_burden

    # Clean dataset by removing outliers
    dataset_cleaned = dataset[
        (dataset['Exposure Mean'] >= lower_bound_exposure) & 
        (dataset['Exposure Mean'] <= upper_bound_exposure) & 
        (dataset['Burden Mean'] >= lower_bound_burden) & 
        (dataset['Burden Mean'] <= upper_bound_burden)
    ]

    # Filter dataset based on specific criteria
    filtered_df = dataset_cleaned[
        (dataset_cleaned['Cause_Name'] == 'All causes') &
        (dataset_cleaned['Country'] == country_name) &
        (dataset_cleaned['Pollutant'] == pollutant)
    ]

    if len(filtered_df) == 0:
        return f"Filtered Data Points : {len(filtered_df)}"

    # Feature and target variables
    X = filtered_df[['Exposure Mean']].values
    y = filtered_df['Burden Mean'].values

    # Scale features and target
    scaler_X = MinMaxScaler()
    scaler_y = MinMaxScaler()
    X_scaled = scaler_X.fit_transform(X)
    y_scaled = scaler_y.fit_transform(y.reshape(-1, 1))

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y_scaled, test_size=0.2, random_state=42
    )

    # ----- Linear Regression Model -----
    model_linear = LinearRegression()
    model_linear.fit(X_train, y_train)
    y_pred_linear = model_linear.predict(X_test)

    # Evaluation Metrics for Linear Regression
    mse_linear = mean_squared_error(y_test, y_pred_linear)
    r2_linear = r2_score(y_test, y_pred_linear)

    # ----- Polynomial Regression Model (degree 4) -----
    poly = PolynomialFeatures(degree=4)
    X_train_poly = poly.fit_transform(X_train)
    X_test_poly = poly.transform(X_test)

    model_poly = LinearRegression()
    model_poly.fit(X_train_poly, y_train)
    y_pred_poly = model_poly.predict(X_test_poly)

    # Evaluation Metrics for Polynomial Regression
    mse_poly = mean_squared_error(y_test, y_pred_poly)
    r2_poly = r2_score(y_test, y_pred_poly)

    # ----- Make Prediction for the Given Exposure Value -----
    # For Linear Regression
    burden_prediction_linear = model_linear.predict(scaler_X.transform([[exposure_value]]))[0]
    burden_prediction_rescaled_linear = scaler_y.inverse_transform([burden_prediction_linear])[0, 0]

    # For Polynomial Regression
    exposure_value_scaled = scaler_X.transform([[exposure_value]])
    exposure_value_poly = poly.transform(exposure_value_scaled)
    burden_prediction_poly = model_poly.predict(exposure_value_poly)[0]
    burden_prediction_rescaled_poly = scaler_y.inverse_transform([burden_prediction_poly])[0, 0]

    # Determine which model is better
    if mse_linear < mse_poly and r2_linear > r2_poly:
        better_model = "Linear Regression"
        prediction = burden_prediction_rescaled_linear
    elif mse_poly < mse_linear and r2_poly > r2_linear:
        better_model = "Polynomial Regression"
        prediction = burden_prediction_rescaled_poly
    else:
        return {
            "Message": "Both models have similar performance; no significant difference in results."
        }

    return {
        "Better Model": better_model,
        "Predicted Burden": float(prediction),
        "Filtered Data Points": len(filtered_df)
    }


In [9]:
# Load your dataset
dataset = pd.read_csv(CSV_FILE_PATH)

# Define parameters
country_name = "Viet Nam"
pollutant = "no2"
exposure_value = 2.3

# Train models and get evaluation metrics
results = train_linear_regression(dataset, country_name, pollutant, exposure_value)

# Display the results
print(results)


{'Better Model': 'Polynomial Regression', 'Predicted Burden': -6.453835497476689, 'Filtered Data Points': 31}


In [13]:
df1 = df[df['Country'] == 'Viet Nam']
df2 = df1[df1['Cause_Name'] == 'All causes']
df3 = df2[df2['Pollutant'] == 'no2']
df3

Unnamed: 0.1,Unnamed: 0,Country,ISO3,Year,Pollutant,Exposure Mean,Units,Cause_Name,Burden Mean,Measure,Metric
12339,12339,Viet Nam,VNM,1990,no2,7.5,µg/m3,All causes,1.429577,DALYs (Disability-Adjusted Life Years),Rate
12341,12341,Viet Nam,VNM,1991,no2,7.64,µg/m3,All causes,1.457137,DALYs (Disability-Adjusted Life Years),Rate
12343,12343,Viet Nam,VNM,1992,no2,7.78,µg/m3,All causes,1.486966,DALYs (Disability-Adjusted Life Years),Rate
12345,12345,Viet Nam,VNM,1993,no2,7.93,µg/m3,All causes,1.525534,DALYs (Disability-Adjusted Life Years),Rate
12346,12346,Viet Nam,VNM,1994,no2,8.08,µg/m3,All causes,1.57298,DALYs (Disability-Adjusted Life Years),Rate
12349,12349,Viet Nam,VNM,1995,no2,8.23,µg/m3,All causes,1.617861,DALYs (Disability-Adjusted Life Years),Rate
12351,12351,Viet Nam,VNM,1996,no2,8.38,µg/m3,All causes,1.70582,DALYs (Disability-Adjusted Life Years),Rate
12352,12352,Viet Nam,VNM,1997,no2,8.55,µg/m3,All causes,1.846707,DALYs (Disability-Adjusted Life Years),Rate
12354,12354,Viet Nam,VNM,1998,no2,8.71,µg/m3,All causes,2.005714,DALYs (Disability-Adjusted Life Years),Rate
12357,12357,Viet Nam,VNM,1999,no2,8.88,µg/m3,All causes,2.151654,DALYs (Disability-Adjusted Life Years),Rate
