In [1]:
import pandas as pd
import pickle
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# 0. load (and sort) data frame
seed = 42
main = pd.read_csv('data/df.csv').sample(frac=1, random_state= seed)

# 1. remove instances with imputed target variable
with open('data/timesteps_missing_data.pkl', 'rb') as pickle_file:
    timestep_missing_data = pickle.load(pickle_file)


# exclude values 
main_no_impute = main.copy()

for id in timestep_missing_data.keys():  
    for timestep in timestep_missing_data.get(id):
        #exclude row which match indexing (id and timestep of imputed value)
        main_no_impute = main_no_impute[~((main_no_impute['id'] == id) & (main_no_impute['time_step'] == timestep))]

print(f'Imputed Instances excluded: {main.shape[0] - main_no_impute.shape[0]}')

# 2. add column of weighted mean pollution
# Function to calculate weighted mean pollution based on similarity in distance to the city center
def weighted_mean_pollution(row):

    # Filter for other stations at the same timestamp
    same_time = main[main['time_step'] == row['time_step']]
    # Exclude the current station
    other_stations = same_time[same_time['id'] != row['id']]
    # similarity weights based on the inverse of the absolute difference in distances
    weights = 1 / (1 + np.abs(other_stations['distance_city'] - row['distance_city']))
    # Calculate the weighted mean pollution
    weighted_mean = np.average(other_stations['NO2'], weights=weights)
    return weighted_mean


# add weighted mean pollution for each time step
main_no_impute['weighted_mean_pollution'] = main_no_impute.apply(weighted_mean_pollution, axis=1)    

# 3. separate test from training station based on id 
# TEST STATIONS: mc117, mc018, mc145
main = main_no_impute.copy()
test_data_mc117 = main[main['id'] == 'mc117'].sort_values('time_step')
test_data_mc018 = main[main['id'] == 'mc018'].sort_values('time_step')
test_data_mc145 = main[main['id'] == 'mc145'].sort_values('time_step')

# 4. exclude test stations and recompute weighted mean based on training stations only
train_df = main[~main['id'].isin(['mc117', 'mc018', 'mc145'])]
train_df['weighted_mean_pollution'] = train_df.apply(weighted_mean_pollution, axis=1) 

# 5. model training: fit all training data with selected features and hyperparameter
features = ['prec_mm', 'temp', 'wind_degree', 'wind_speed',
       'free_wind', 'prop_intercept_200', 'GVI_25', 'tvi_200', 'prop_main_', 
       'nearest_in', 'pop_500', 'lai_factor', 'weighted_mean_pollution']


parameter = {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 5}

       
X_train = train_df[features]
y_train = train_df['NO2']

model = RandomForestRegressor(**parameter, n_jobs=-1) # initiate model with defined parameter
model.fit(X_train, y_train) 


# 6. prediction & model performance
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

test_stations = ['mc018', 'mc145', 'mc117']

data_frames = {
    'mc117': test_data_mc117,
    'mc018': test_data_mc018,
    'mc145': test_data_mc145,
}

for test_station in test_stations:
    current_df = data_frames[test_station].copy().reset_index()
    X_test = current_df[features]
    y_test = current_df['NO2']
    
    y_pred = model.predict(X_test) # evaluate model
    data_frames[test_station]['y_pred'] = y_pred
    print(test_station)
    print(f"r2: {r2_score(y_test, y_pred)}")
    print(f"MSE: {mean_squared_error(y_test, y_pred, squared=True)} \n")
    y_pred_df = pd.DataFrame(y_pred, columns= ['y_pred'])
    current_df = pd.concat([y_pred_df, current_df], axis = 1).drop(['index', 'Unnamed: 0'], axis = 1)
    current_df

# export predictions
combined_df = pd.concat(data_frames.values(), ignore_index=True)
combined_df.to_csv('results/datasets/results_rf.csv', index_label= False)

Imputed Instances excluded: 722


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['weighted_mean_pollution'] = train_df.apply(weighted_mean_pollution, axis=1)


mc018
r2: 0.7465317499471764
MSE: 32.237126676679196 

mc145
r2: 0.44847564434200815
MSE: 18.44051265365198 

mc117
r2: 0.6191664816749738
MSE: 66.43857690514186 



