## Water Quality Projections in Los Angeles
### Nick Reardon
### 2/14/2022

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
sns.set()


# Make numpy printouts easier to read.
np.set_printoptions(precision=3, suppress=True)
pd.set_option('max_columns', None)

In [2]:
df = pd.read_csv('./LA water quality.csv')

In [3]:
df.shape

(2356, 11)

In [4]:
df.columns

Index(['Policy Area', 'Dataset', 'Variable', 'Year', 'Contaminant Count',
       'Tract', 'Tract Number', 'Neighborhood', 'GEOID', 'Row ID', 'Date'],
      dtype='object')

In [5]:
# Transform text into features

ml_df = pd.get_dummies(df, 'category')

In [6]:
ml_df.shape

(2356, 7310)

In [8]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

# Get our datasets and impute missing values
X = ml_df.copy()
y = pd.DataFrame(KNNImputer().fit_transform(X), columns=ml_df.columns)['Contaminant Count']
del X['Contaminant Count']

# Standardize X
X = pd.DataFrame(StandardScaler().fit_transform(X), columns=X.columns)

In [9]:
y.shape

(2356,)

In [10]:
y.head()

0    570.923693
1    555.052464
2    436.468459
3    436.468459
4    581.035658
Name: Contaminant Count, dtype: float64

In [11]:
print(type(X), X.shape)

<class 'pandas.core.frame.DataFrame'> (2356, 7309)


In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

In [13]:
from sklearn.ensemble import ExtraTreesRegressor

et = ExtraTreesRegressor(n_estimators=1000)
model = et.fit(X_train, y_train)

In [14]:
# Previous run: R2=.718
model.score(X_test, y_test)

0.7194761497303439

In [15]:
# Mean Absolute Percentage Error

def mape(actual, pred): 
    actual, pred = np.array(actual), np.array(pred)
    return np.mean(np.abs((actual - pred) / actual)) * 100

In [17]:
predictions = model.predict(X_test)

In [20]:
mape(y_test, predictions)

11.097344796050072

#### A MAPE that low is a very good sign. Our model is performing well. Let's see if NN's can accomplish something better.

In [21]:
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

print(tf.__version__)

2.0.0


In [39]:
model = keras.Sequential([
    keras.layers.Dense(units=X.shape[-1], activation='relu'),
    keras.layers.Dense(units=X.shape[-1]*1.5, activation='relu'),
    keras.layers.Dense(units=X.shape[-1]*1.5, activation='relu'),
    keras.layers.Dense(units=1, activation='relu')
])

In [45]:
model.compile(optimizer='adam',
              loss='mean_squared_error',
              metrics=['mae'])

history = model.fit(
                    X_train.values,
                    y_train.values,
                    epochs=15,
                    validation_data=(X_test.values, y_test.values),
                    validation_steps=5
                    )

Train on 1578 samples, validate on 778 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [46]:
nn_predictions = model.predict(X_test.values)

In [47]:
# Let's see the MAPE for our neural network now

mape(y_test, nn_predictions)

84.98307206964917

#### In this case we most likely do not want to go with this neural network. ExtraTree's works fine