# ![](https://ga-dash.s3.amazonaws.com/production/assets/logo-9f88ae6c9c3871690e33280fcf557f33.png)  Capstone Project:  "Does Wealth = Health?"
*Predicting health based on indicators of financial wellbeing*


## this notebook covers modeling. . .

In [148]:
# Import basic Python libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

# This will allow us to avoid a FutureWarning when plotting.
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()


## load the dataset of (40) selected features ...

In [149]:
# load data into dataframe from .csv file exported in the previous notebook. . .

#Adjust the path to where you have saved the data
df_feat = pd.read_csv("../data/NFWBS_40_FEATURES_export.csv")
print(df_feat.shape)
df_feat.head()

(6341, 41)


Unnamed: 0,PUF_ID,HEALTH,SWB_1,SWB_2,SWB_3,FWBscore,FWB1_1,FWB1_2,FWB1_4,FWB2_2,...,HHEDUC,PPINCIMP,PPREG4,PPREG9,fpl,agecat,generation,PPGENDER,PPMARIT,PPETHM
0,10350,1,5,5,6,55,3,3,3,3,...,4,7,4,8,3,8,1,1,3,1
1,7740,1,6,6,6,51,2,2,3,2,...,2,6,2,3,3,3,3,1,3,1
2,13699,1,4,3,4,49,3,3,3,3,...,3,6,4,9,3,3,3,1,3,2
3,7375,1,4,4,4,49,3,3,3,3,...,2,7,2,4,3,2,4,1,1,3
4,10910,0,5,7,5,67,5,1,1,5,...,4,7,2,3,3,2,4,1,1,1


In [150]:
# set PUF_ID as index
df_feat.set_index('PUF_ID', inplace=True)
df_feat.head()

Unnamed: 0_level_0,HEALTH,SWB_1,SWB_2,SWB_3,FWBscore,FWB1_1,FWB1_2,FWB1_4,FWB2_2,FSscore,...,HHEDUC,PPINCIMP,PPREG4,PPREG9,fpl,agecat,generation,PPGENDER,PPMARIT,PPETHM
PUF_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10350,1,5,5,6,55,3,3,3,3,44,...,4,7,4,8,3,8,1,1,3,1
7740,1,6,6,6,51,2,2,3,2,43,...,2,6,2,3,3,3,3,1,3,1
13699,1,4,3,4,49,3,3,3,3,42,...,3,6,4,9,3,3,3,1,3,2
7375,1,4,4,4,49,3,3,3,3,42,...,2,7,2,4,3,2,4,1,1,3
10910,0,5,7,5,67,5,1,1,5,57,...,4,7,2,3,3,2,4,1,1,1


## Start modeling (with no preprocessing or rebalancing)...

In [151]:
# Import Python classifcation model libraries

# for Logistic Regression (classification)
from sklearn.linear_model import LogisticRegression

# for Knn (classification)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# for Train/Test K-Fold Cross-Validation scoring... 
from sklearn.model_selection import train_test_split, KFold, cross_val_score

from sklearn.preprocessing import StandardScaler

# compute classification accuracy
from sklearn import metrics

# Confusion matrix measures results, so test data is used
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, recall_score

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso


## Logistic Regression

Standardization is not necessary when variables are already on the same scale

https://towardsdatascience.com/when-to-standardize-your-data-in-4-minutes-f9282190707e
Standardization is not required for Logistic Regression and Tree based algorithms 
such as Decision Tree, Random forest and gradient boosting, because they are not
sensitive to the magnitude of variables.



Yes, per sklearn (Logistic Regression) documentation: "Note that regularization is applied by default."
penalty : str, ‘l1’, ‘l2’, ‘elasticnet’ or ‘none’, optional (default=’l2’)


### logistic regression overfitting
Models are overfit when the test score is worse than the train score.


### In order to address overfitting with Logistic Regression, we can try to:
### 1. adjust regularization like Lasso and Ridge (as seen in the models above)
### 2. reduce the number of variables / level of complexity
### 3. increase the amount of data (if possible)


In [None]:
# 1st method:

In [152]:
# Separate input features and target
y = df_feat.HEALTH
X = df_feat.drop('HEALTH', axis=1)

# setting up testing and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print('------------------------')
print('TRAIN TEST SPLIT (80/20)')
print('------------------------')
print('X_train data: ', X_train.shape)
print('X_test data: ', X_test.shape)

# Train model
lr = LogisticRegression(solver='liblinear').fit(X_train, y_train)
 
# get predicted target from model
# Predict on training set
lr_pred = lr.predict(X_test)

print('------------------------')
print('TEST SCORES')
print('------------------------')

# checking accuracy
print('logreg Test accuracy score: ', accuracy_score(y_test, lr_pred))

# checking f1
print('logreg Test f1 score: ', f1_score(y_test, lr_pred))

# checking recall
print('logreg Test recall score: ', recall_score(y_test, lr_pred))


# ----------------------------------------
print('------------------------')
print('CONFUSION MATRIX')
print('------------------------')
# generate confusion matrix
cm = confusion_matrix(y_test, lr_pred)
tn, fp, fn, tp = cm.ravel()

print('TOTAL negative (0: "optimal health") = ', tn+fn)
print('true negative = ', tn)
print('false negative = ', fn)

print('TOTAL positive (1: "sub-optimal health") = ', fp+tp)
print('false positive = ', fp)
print('true positive = ', tp)

# Checking unique values
predictions = pd.DataFrame(lr_pred)
predictions[0].value_counts()


print(cm)


------------------------
TRAIN TEST SPLIT (80/20)
------------------------
X_train data:  (5072, 39)
X_test data:  (1269, 39)
------------------------
TEST SCORES
------------------------
logreg Test accuracy score:  0.6832151300236406
logreg Test f1 score:  0.6649999999999999
logreg Test recall score:  0.6456310679611651
------------------------
CONFUSION MATRIX
------------------------
TOTAL negative (0: "optimal health") =  687
true negative =  468
false negative =  219
TOTAL positive (1: "sub-optimal health") =  582
false positive =  183
true positive =  399
[[468 183]
 [219 399]]


#### Try to oversample the minority class (1: "sub-optimal" health)

In [154]:
from sklearn.utils import resample

# Separate input features and target
y = df_feat.HEALTH
X = df_feat.drop('HEALTH', axis=1)

# setting up testing and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print('------------------------')
print('TRAIN TEST SPLIT (80/20)')
print('------------------------')
print('X_train data: ', X_train.shape)
print('X_test data: ', X_test.shape)

# concatenate our training data back together
X = pd.concat([X_train, y_train], axis=1)

# separate minority and majority classes
optimal = X[X.HEALTH==0]
sub_optimal = X[X.HEALTH==1]

# upsample minority
sub_upsampled = resample(sub_optimal,
                          replace=True, # sample with replacement
                          n_samples=len(optimal), # match number in majority class
                          random_state=42) # reproducible results

# combine majority and upsampled minority
upsampled = pd.concat([optimal, sub_upsampled])

# check new class counts
upsampled.HEALTH.value_counts()



------------------------
TRAIN TEST SPLIT (80/20)
------------------------
X_train data:  (5072, 39)
X_test data:  (1269, 39)


1    2616
0    2616
Name: HEALTH, dtype: int64

#### Retry Logistic Regression (using upsampled data). . .

In [156]:
# Separate input features and target
y_train = upsampled.HEALTH
X_train = upsampled.drop('HEALTH', axis=1)

# # setting up testing and training sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print('------------------------')
# print('TRAIN TEST SPLIT (80/20)')
print('------------------------')
print('X_train data: ', X_train.shape)
print('X_test data: ', X_test.shape)

# Train model
upsampled = LogisticRegression(solver='liblinear').fit(X_train, y_train)
 
# get predicted target from model
# Predict on training set
upsampled_pred = upsampled.predict(X_test)

print('------------------------')
print('TEST SCORES')
print('------------------------')

# checking accuracy
print('upsampled logreg Test accuracy score: ', accuracy_score(y_test, upsampled_pred))

# checking f1
print('upsampled logreg Test f1 score: ', f1_score(y_test, upsampled_pred))

# checking recall
print('upsampled logreg Test recall score: ', recall_score(y_test, upsampled_pred))


# ----------------------------------------
print('------------------------')
print('CONFUSION MATRIX')
print('------------------------')
# generate confusion matrix
cm = confusion_matrix(y_test, upsampled_pred)
tn, fp, fn, tp = cm.ravel()

print('TOTAL negative (0: "optimal health") = ', tn+fn)
print('true negative = ', tn)
print('false negative = ', fn)

print('TOTAL positive (1: "sub-optimal health") = ', fp+tp)
print('false positive = ', fp)
print('true positive = ', tp)

# Checking unique values
predictions = pd.DataFrame(upsampled_pred)
predictions[0].value_counts()


print(cm)



------------------------
------------------------
X_train data:  (5232, 39)
X_test data:  (1269, 39)
------------------------
TEST SCORES
------------------------
upsampled logreg Test accuracy score:  0.6769109535066982
upsampled logreg Test f1 score:  0.6617161716171618
upsampled logreg Test recall score:  0.6488673139158576
------------------------
CONFUSION MATRIX
------------------------
TOTAL negative (0: "optimal health") =  675
true negative =  458
false negative =  217
TOTAL positive (1: "sub-optimal health") =  594
false positive =  193
true positive =  401
[[458 193]
 [217 401]]


#### Try to undersample the majority class (0: "optimal" health)

In [157]:
# still using our separated classes fraud and not_fraud from above

# downsample majority
optimal_downsampled = resample(optimal,
                                replace = False, # sample without replacement
                                n_samples = len(sub_optimal), # match minority n
                                random_state = 42) # reproducible results

# combine minority and downsampled majority
downsampled = pd.concat([optimal_downsampled, sub_optimal])

# checking counts
downsampled.HEALTH.value_counts()

1    2456
0    2456
Name: HEALTH, dtype: int64

#### Retry Logistic Regression (using downsampled data). . .

In [158]:
# Separate input features and target
y_train = downsampled.HEALTH
X_train = downsampled.drop('HEALTH', axis=1)

# # setting up testing and training sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print('------------------------')
# print('TRAIN TEST SPLIT (80/20)')
print('------------------------')
print('X_train data: ', X_train.shape)
print('X_test data: ', X_test.shape)

# Train model
undersampled = LogisticRegression(solver='liblinear').fit(X_train, y_train)
 
# get predicted target from model
# Predict on training set
undersampled_pred = undersampled.predict(X_test)

print('------------------------')
print('TEST SCORES')
print('------------------------')

# checking accuracy
print('undersampled logreg Test accuracy score: ', accuracy_score(y_test, undersampled_pred))

# checking f1
print('undersampled logreg Test f1 score: ', f1_score(y_test, undersampled_pred))

# checking recall
print('undersampled logreg Test recall score: ', recall_score(y_test, undersampled_pred))


# ----------------------------------------
print('------------------------')
print('CONFUSION MATRIX')
print('------------------------')
# generate confusion matrix
cm = confusion_matrix(y_test, undersampled_pred)
tn, fp, fn, tp = cm.ravel()

print('TOTAL negative (0: "optimal health") = ', tn+fn)
print('true negative = ', tn)
print('false negative = ', fn)

print('TOTAL positive (1: "sub-optimal health") = ', fp+tp)
print('false positive = ', fp)
print('true positive = ', tp)

# Checking unique values
predictions = pd.DataFrame(undersampled_pred)
predictions[0].value_counts()


print(cm)




------------------------
------------------------
X_train data:  (4912, 39)
X_test data:  (1269, 39)
------------------------
TEST SCORES
------------------------
undersampled logreg Test accuracy score:  0.6729708431836091
undersampled logreg Test f1 score:  0.660670482420278
undersampled logreg Test recall score:  0.6537216828478964
------------------------
CONFUSION MATRIX
------------------------
TOTAL negative (0: "optimal health") =  664
true negative =  450
false negative =  214
TOTAL positive (1: "sub-optimal health") =  605
false positive =  201
true positive =  404
[[450 201]
 [214 404]]


In [None]:
# 2nd method:

In [103]:
# Separate input features and target
y = df_feat.HEALTH
X = df_feat.drop('HEALTH', axis=1)

# setting up testing and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print('X_train data: ', X_train.shape)
print('X_test data: ', X_test.shape)


X_train data:  (5072, 39)
X_test data:  (1269, 39)


In [104]:
# standardize train and test data
# ss = StandardScaler()
# X_train_sc = ss.fit_transform(X_train)
# X_test_sc = ss.transform(X_test)

In [107]:
# instantiate logistic regression model
lr = LogisticRegression()

# fit model
lr.fit(X_train, y_train)
print('logreg train score: ', lr.score(X_train, y_train))
print('logreg test score: ', lr.score(X_test, y_test))


logreg train score:  0.699723974763407
logreg test score:  0.6832151300236406


In [None]:
# 3rd method:

In [138]:
# Separate input features and target
y = df_feat.HEALTH
X = df_feat.drop('HEALTH', axis=1)

# setting up testing and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print('------------------------')
print('TRAIN TEST SPLIT (80/20)')
print('------------------------')
print('X_train data: ', X_train.shape)
print('X_test data: ', X_test.shape)



# get predicted target from model
# Predict on training set
L1_pred = logreg_L1.predict(X_test)
L10_pred = logreg_L10.predict(X_test)


print('------------------------')
print('logreg_L1 TEST SCORES')
print('------------------------')
# checking accuracy
print('logreg_L1 Test accuracy score: ', accuracy_score(y_test, L1_pred))
# checking f1
print('logreg_L1 Test f1 score: ', f1_score(y_test, L1_pred))
# checking recall
print('logreg_L1 Test recall score: ', recall_score(y_test, L1_pred))
print('------------------------')
print('logreg_L10 TEST SCORES')
print('------------------------')
# checking accuracy
print('logreg_L10 Test accuracy score: ', accuracy_score(y_test, L1_pred))
# checking f1
print('logreg_L10 Test f1 score: ', f1_score(y_test, L1_pred))
# checking recall
print('logreg_L10 Test recall score: ', recall_score(y_test, L1_pred))



# instantiate four separate models, 
# one with LASSO and  𝛼=1, one with LASSO and  𝛼=10,  one with Ridge and  𝛼=1, 
# and one with Ridge and  𝛼=10
# (Hint: Be careful with how you specify  𝛼  in your model!)

# "regularization"  (adding a penalty):
# Lasso = l1, Ridge = l2

logreg_L1 = LogisticRegression(penalty = 'l1', C = 1.0)
logreg_L10 = LogisticRegression(penalty = 'l1', C = 0.10)
logreg_R1 = LogisticRegression(penalty = 'l2', C = 1.0)
logreg_R10 = LogisticRegression(penalty = 'l2', C = 0.10)

# fit each model
logreg_L1.fit(X_train, y_train)
print('logreg L1 train score: ', logreg_L1.score(X_train, y_train))
print('logreg L1 test score: ', logreg_L1.score(X_test, y_test))

logreg_L10.fit(X_train, y_train)
print('logreg L10 train score: ', logreg_L10.score(X_train, y_train))
print('logreg L10 test score: ', logreg_L10.score(X_test, y_test))

logreg_R1.fit(X_train, y_train)
print('logreg R1 train score: ', logreg_R1.score(X_train, y_train))
print('logreg R1 test score: ', logreg_R1.score(X_test, y_test))

logreg_R10.fit(X_train, y_train)
print('logreg R10 train score: ', logreg_R10.score(X_train, y_train))
print('logreg R10 test score: ', logreg_R10.score(X_test, y_test))

# Using accuracy as your metric, evaluate all eight of your models on both the training and testing sets.


------------------------
TRAIN TEST SPLIT (80/20)
------------------------
X_train data:  (5072, 39)
X_test data:  (1269, 39)
------------------------
logreg_L1 TEST SCORES
------------------------
logreg_L1 Test accuracy score:  0.6840031520882585
logreg_L1 Test f1 score:  0.6661115736885929
logreg_L1 Test recall score:  0.6472491909385113
------------------------
logreg_L10 TEST SCORES
------------------------
logreg_L10 Test accuracy score:  0.6840031520882585
logreg_L10 Test f1 score:  0.6661115736885929
logreg_L10 Test recall score:  0.6472491909385113
logreg L1 train score:  0.7003154574132492
logreg L1 test score:  0.6840031520882585
logreg L10 train score:  0.698935331230284
logreg L10 test score:  0.6769109535066982
logreg R1 train score:  0.699723974763407
logreg R1 test score:  0.6832151300236406
logreg R10 train score:  0.701301261829653
logreg R10 test score:  0.681639085894405


In [139]:
# instantiate models, changing value for C (𝛼=2)
# "regularization"  (adding a penalty):
# Lasso = l1, Ridge = l2

logreg_L2 = LogisticRegression(penalty = 'l1', C = 2.0)
logreg_R2 = LogisticRegression(penalty = 'l2', C = 2.0)

# fit each model
logreg_L2.fit(X_train, y_train)
print('logreg L2 train score: ', logreg_L2.score(X_train, y_train))
print('logreg L2 test score: ', logreg_L2.score(X_test, y_test))

logreg_R2.fit(X_train, y_train)
print('logreg R2 train score: ', logreg_R2.score(X_train, y_train))
print('logreg R2 test score: ', logreg_R2.score(X_test, y_test))


# The scores are the same as when 𝛼=1. 
# It looks like the balancing effect with regularization is maintaining the same results.



logreg L2 train score:  0.7009069400630915
logreg L2 test score:  0.6840031520882585
logreg R2 train score:  0.6991324921135647
logreg R2 test score:  0.6840031520882585


## Random Forest

In [140]:
# Separate input features and target
y = df_feat.HEALTH
X = df_feat.drop('HEALTH', axis=1)

# setting up testing and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print('X_train data: ', X_train.shape)
print('X_test data: ', X_test.shape)

# train model
rfc = RandomForestClassifier(n_estimators=10).fit(X_train, y_train)

# predict on test set
rfc_pred = rfc.predict(X_test)

# checking accuracy
print('Test accuracy score: ', accuracy_score(y_test, rfc_pred))

# checking f1
print('Test f1 score: ', f1_score(y_test, rfc_pred))

# checking recall
print('Test recall score: ', recall_score(y_test, rfc_pred))

# Checking unique values
predictions = pd.DataFrame(rfc_pred)
predictions[0].value_counts()


X_train data:  (5072, 39)
X_test data:  (1269, 39)
Test accuracy score:  0.6493301812450749
Test f1 score:  0.6160483175150991
Test recall score:  0.5776699029126213


0    728
1    541
Name: 0, dtype: int64

In [141]:
# standardize train and test data
ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

In [142]:

# train model
rfc = RandomForestClassifier(n_estimators=10).fit(X_train_sc, y_train)

# predict on test set
rfc_pred = rfc.predict(X_test_sc)

# checking accuracy
print('Test accuracy score: ', accuracy_score(y_test, rfc_pred))

# checking f1
print('Test f1 score: ', f1_score(y_test, rfc_pred))

# checking recall
print('Test recall score: ', recall_score(y_test, rfc_pred))

# Checking unique values
predictions = pd.DataFrame(rfc_pred)
predictions[0].value_counts()



Test accuracy score:  0.6761229314420804
Test f1 score:  0.6313901345291479
Test recall score:  0.56957928802589


0    772
1    497
Name: 0, dtype: int64

## KNN

KNN is a distance-based model (it calculates the distance between neighbors) 
which is highly sensitive to the magnitude of features.
Housing data with features like square footage should be scaled ("standardized")

https://towardsdatascience.com/when-to-standardize-your-data-in-4-minutes-f9282190707e
Standardization makes all variables to contribute equally to the similarity measures

k should be odd; it is a hyperparameter we choose and the best number is found through experimentation

As k increases, variance decreases (and bias increases).
The test score is better than the train score for both KNN models with higher k values (k=15 and k=25)

### knn overfitting
Models are overfit when the test score is worse than the train score.


### In order to address overfitting with KNN, we can try to:
### 1. increase the value of k (as seen in the models above)
### 2. reduce the number of variables / level of complexity
### 3. use a different model (as seen above, Linear Regression performs better...)



In [143]:
# Separate input features and target
y = df_feat.HEALTH
X = df_feat.drop('HEALTH', axis=1)

# setting up testing and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print('X_train data: ', X_train.shape)
print('X_test data: ', X_test.shape)

# Train model  
# training a KNN classifier
knn = KNeighborsClassifier(n_neighbors = 71).fit(X_train, y_train)
  
# accuracy on X_test
accuracy = knn.score(X_test, y_test)
print('Test accuracy score: ', accuracy)

# creating a confusion matrix
knn_predictions = knn.predict(X_test) 
cm = confusion_matrix(y_test, knn_predictions)
# print(cm)

X_train data:  (5072, 39)
X_test data:  (1269, 39)
Test accuracy score:  0.6784869976359338


In [145]:
# Separate input features and target
y = df_feat.HEALTH
X = df_feat.drop('HEALTH', axis=1)

# setting up testing and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print('X_train data: ', X_train.shape)
print('X_test data: ', X_test.shape)

# standardize train and test data
ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)


# Train model  
# training a KNN classifier
knn = KNeighborsClassifier(n_neighbors = 71).fit(X_train_sc, y_train)
  
# accuracy on X_test
accuracy = knn.score(X_test_sc, y_test)
print('Test accuracy score: ', accuracy)

# creating a confusion matrix
knn_predictions = knn.predict(X_test_sc) 
cm = confusion_matrix(y_test, knn_predictions)
# print(cm)

X_train data:  (5072, 39)
X_test data:  (1269, 39)
Test accuracy score:  0.6784869976359338
[[482 169]
 [239 379]]


In [None]:
# instantiate four separate models, 
# one with  𝑘=3, one with  𝑘=5, one with  𝑘=15, and one with  𝑘=25

knn_k3 = KNeighborsClassifier(n_neighbors=3)
knn_k5 = KNeighborsClassifier(n_neighbors=5)
knn_k15 = KNeighborsClassifier(n_neighbors=15)
knn_k25 = KNeighborsClassifier(n_neighbors=25)

# fit each model
knn_k3.fit(X_train, y_train)
print('KNN 3 train score: ', knn_k3.score(X_train, y_train))
print('KNN 3 test score: ', knn_k3.score(X_test, y_test))



In [147]:
# generate confusion matrix
confusion_matrix(y_test, y_pred)

tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print('true negative = ', tn)
print('false positive = ', fp)
print('false negative = ', fn)
print('true positive = ', tp)



NameError: name 'y_pred' is not defined