In [17]:
import torch
import pandas as pd
import xgboost as xgb

In [18]:
POLLUTION_DATASET = pd.read_csv("/content/drive/MyDrive/city_day.csv")

POLLUTION_DATASET = POLLUTION_DATASET.drop("Unnamed: 15", axis=1)
POLLUTION_DATASET = POLLUTION_DATASET.drop("City", axis=1)
POLLUTION_DATASET = POLLUTION_DATASET.drop("Date", axis=1)
POLLUTION_DATASET.head()

Unnamed: 0,PM2.5,PM10,NO,NO2,Nox,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI
0,27.93,95.68,9.72,4.71,14.19,0.88,0.79,9.36,30.18,3.39,0.04,3.82,93.0
1,34.05,90.68,16.53,9.83,25.86,1.68,0.93,4.27,17.23,5.27,0.89,21.98,96.0
2,38.1,61.67,7.61,3.3,10.9,1.37,0.65,14.74,26.14,7.14,1.94,2.15,88.0
3,40.41,67.91,10.52,3.96,12.79,3.76,0.55,9.97,28.6,4.46,2.02,1.48,71.0
4,27.21,62.04,8.6,7.2,10.64,5.9,0.53,9.63,34.72,3.57,2.08,1.32,69.0


In [None]:
IDEAL_VALS = {"PM2.5": 11, "PM10": 16, "NO": 0, "NO2": 0, "Nox": 0, "NH3": 0, "CO": 0, "SO2": 88, "O3": 33, "Benzene": 5, "Toluene": 2.3, "Xylene": 0, "AQI": 16 }
columns = POLLUTION_DATASET.columns.values.tolist()
#Generate scores for each lever - compare iteratively and generate average score on its basis
def generate_score(row):
    #Iterate over each column except for gender
    print(" =================================== ")
    total_score = 0
    for column in columns:
        if column == "SCORE": continue
        print(row[column], IDEAL_VALS[column])
        column_score = abs(row[column] - IDEAL_VALS[column])
        print(column_score)
        total_score += column_score
    #Average out the score
    row["SCORE"] = total_score / len(columns)
    print('END SCORE: ', row["SCORE"])
    return row

POLLUTION_DATASET = POLLUTION_DATASET.apply(generate_score, axis = "columns")

In [30]:
POLLUTION_DATASET.to_pickle('GEOGRAPHY_AIR_QUALITY_DATASET.pkl')

In [38]:
#Reshape tensors for XGBOOST
import tensorflow as tf
from sklearn.model_selection import train_test_split

x_features = POLLUTION_DATASET.iloc[:, :13]
print(x_features)
y_labels = POLLUTION_DATASET["SCORE"]
train_x, valid_x, train_y, valid_y = train_test_split(x_features, y_labels, random_state = 2, shuffle = True)

train_x = tf.convert_to_tensor(train_x)
train_x = tf.reshape(train_x, [len(train_x), 13])

valid_x = tf.convert_to_tensor(valid_x)
valid_x = tf.reshape(valid_x, [len(valid_x), 13])

train_y = tf.convert_to_tensor(train_y)
train_y = tf.reshape(train_y, [len(train_y), 1])

valid_y = tf.convert_to_tensor(valid_y)
valid_y = tf.reshape(valid_y, [len(valid_y), 1])

      PM2.5   PM10     NO    NO2    Nox    NH3    CO    SO2     O3  Benzene  \
0     27.93  95.68   9.72   4.71  14.19   0.88  0.79   9.36  30.18     3.39   
1     34.05  90.68  16.53   9.83  25.86   1.68  0.93   4.27  17.23     5.27   
2     38.10  61.67   7.61   3.30  10.90   1.37  0.65  14.74  26.14     7.14   
3     40.41  67.91  10.52   3.96  12.79   3.76  0.55   9.97  28.60     4.46   
4     27.21  62.04   8.60   7.20  10.64   5.90  0.53   9.63  34.72     3.57   
...     ...    ...    ...    ...    ...    ...   ...    ...    ...      ...   
8112   7.63  32.27   5.91  23.27  17.19  11.15  0.46   6.87  19.90     1.45   
8113  15.02  50.94   7.68  25.06  19.54  12.47  0.47   8.55  23.30     2.24   
8114  24.38  74.09   3.42  26.06  16.53  11.99  0.52  12.72  30.14     0.74   
8115  22.91  65.73   3.45  29.53  18.33  10.71  0.48   8.42  30.96     0.01   
8116  16.64  49.97   4.05  29.26  18.80  10.03  0.52   9.84  28.30     0.00   

      Toluene  Xylene   AQI  
0        0.04    3.82

In [41]:
#TRAIN GRADIENT BOOSTED TREE
from sklearn.metrics import mean_squared_error, mean_absolute_error

#Convert to Dmatrix
TRAIN_DATA = xgb.DMatrix(train_x, train_y, feature_names = columns[:13])
VALID_DATA = xgb.DMatrix(valid_x, valid_y, feature_names = columns[:13])
#Parameters for boosted tree
XGBOOST_PARAMS = {"objective": "reg:squarederror", "subsample": 0.6,
                  "colsample_bytree" : 0.6, "learning_rate" : 0.1, "max_depth" : 100,
                  "alpha": 20, "n_estimators": 12}
#Train the xgboost model
XGB_SLEEP_MODEL = xgb.train(XGBOOST_PARAMS, TRAIN_DATA, evals = [(TRAIN_DATA, "TRAIN_DATA"), (VALID_DATA, "VALID_DATA")],
                            num_boost_round = 200, early_stopping_rounds = 40)
predictions = XGB_SLEEP_MODEL.predict(VALID_DATA)
error = mean_squared_error(valid_y, predictions)
print('ERROR: ', error)

[0]	TRAIN_DATA-rmse:nan	VALID_DATA-rmse:nan
Multiple eval metrics have been passed: 'VALID_DATA-rmse' will be used for early stopping.

Will train until VALID_DATA-rmse hasn't improved in 40 rounds.
[1]	TRAIN_DATA-rmse:nan	VALID_DATA-rmse:nan
[2]	TRAIN_DATA-rmse:nan	VALID_DATA-rmse:nan
[3]	TRAIN_DATA-rmse:nan	VALID_DATA-rmse:nan
[4]	TRAIN_DATA-rmse:nan	VALID_DATA-rmse:nan
[5]	TRAIN_DATA-rmse:nan	VALID_DATA-rmse:nan
[6]	TRAIN_DATA-rmse:nan	VALID_DATA-rmse:nan
[7]	TRAIN_DATA-rmse:nan	VALID_DATA-rmse:nan
[8]	TRAIN_DATA-rmse:nan	VALID_DATA-rmse:nan
[9]	TRAIN_DATA-rmse:nan	VALID_DATA-rmse:nan
[10]	TRAIN_DATA-rmse:nan	VALID_DATA-rmse:nan
[11]	TRAIN_DATA-rmse:nan	VALID_DATA-rmse:nan
[12]	TRAIN_DATA-rmse:nan	VALID_DATA-rmse:nan
[13]	TRAIN_DATA-rmse:nan	VALID_DATA-rmse:nan
[14]	TRAIN_DATA-rmse:nan	VALID_DATA-rmse:nan
[15]	TRAIN_DATA-rmse:nan	VALID_DATA-rmse:nan
[16]	TRAIN_DATA-rmse:nan	VALID_DATA-rmse:nan
[17]	TRAIN_DATA-rmse:nan	VALID_DATA-rmse:nan
[18]	TRAIN_DATA-rmse:nan	VALID_DATA-rmse:nan


KeyError: ignored