In [17]:
import pandas as pd
from configparser import ConfigParser
import psycopg2
import pandas.io.sql as sqlio
from sqlalchemy import create_engine
import numpy as np
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
from config import password

param_dic={
    "host": "localhost",
    "database": "Energy_Analysis",
    "user" : "postgres",
    "password" : "AngieDB374!"
}

def connect(params_dic):
    """ Connect to the PostgreSQL database server """
    conn = None
    try:
        # connect to the PostgreSQL server
        print('Connecting to the PostgreSQL database...')
        conn = psycopg2.connect(**params_dic)
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        sys.exit(1) 
    print("Connection successful")
    return conn

conn = connect(param_dic)

Connecting to the PostgreSQL database...
Connection successful


In [3]:
query="select * from ml_input"

ml_input_df=sqlio.read_sql_query(query,conn)

ml_input_df.head()

Unnamed: 0,sampno,gender,employment,student,education,drive_freq,transit_freq,tnc_freq,job_type,work_mode,...,autonomous_hhveh,autonomous_rideshare,autonomous_pooled,autonomous_pref,housing,housing_1_8_x,solar,solar_future,income,ev_flag
0,190001,1,4,4,6,3,0,0,,,...,2,2,1,1,2,,2,2,6,0
1,190003,2,4,4,8,1,0,0,,,...,3,1,4,1,1,,2,2,5,0
2,190003,2,4,4,8,1,0,0,,,...,3,1,4,1,1,,2,2,5,0
3,190005,1,4,4,6,2,0,0,,,...,2,2,3,2,1,,2,2,7,0
4,190005,1,4,4,6,2,0,0,,,...,2,2,3,2,1,,2,2,7,0


In [4]:
ml_input_df.select_dtypes(exclude='int64').columns

Index(['drive_freq', 'job_type', 'work_mode', 'work_distance', 'work_days',
       'school_mode', 'school_distance', 'hybrid_experience', 'past_hybrid',
       'phev_experience', 'past_phev', 'bev_experience', 'past_bev',
       'fcv_experience', 'past_fcv', 'charge_spots', 'charge_work',
       'home_parking_1', 'home_parking_2', 'home_parking_3', 'home_parking_4',
       'home_parking_5', 'home_parking_6', 'home_parking_7', 'home_parking_8',
       'home_parking_9', 'home_parking_8_x', 'home_electricity_access',
       'hsa_distance_1_1_x', 'hsa_distance_1_2_x', 'future_purchase',
       'next_purchase', 'purchase_timing', 'mode_freq_1', 'mode_freq_2',
       'mode_freq_3', 'mode_freq_4', 'mode_freq_5', 'mode_freq_6',
       'mode_freq_7', 'mode_freq_8', 'mode_freq_9', 'mode_freq_10',
       'mode_freq_11', 'mode_freq_12', 'autonomous_rideshare', 'housing_1_8_x',
       'solar_future'],
      dtype='object')

In [5]:
# ml_input_df.columns

## Clean Data

## Split Data into Training and Testing

In [6]:
# check the datatypes
ml_input_df.dtypes

sampno            int64
gender            int64
employment        int64
student           int64
education         int64
                  ...  
housing_1_8_x    object
solar             int64
solar_future     object
income            int64
ev_flag           int64
Length: 95, dtype: object

In [7]:
# Generate our categorical variable lists
ml_input_categorical = ml_input_df.dtypes[ml_input_df.dtypes == "object"].index.tolist()
ml_input_categorical

['drive_freq',
 'job_type',
 'work_mode',
 'work_distance',
 'work_days',
 'school_mode',
 'school_distance',
 'hybrid_experience',
 'past_hybrid',
 'phev_experience',
 'past_phev',
 'bev_experience',
 'past_bev',
 'fcv_experience',
 'past_fcv',
 'charge_spots',
 'charge_work',
 'home_parking_1',
 'home_parking_2',
 'home_parking_3',
 'home_parking_4',
 'home_parking_5',
 'home_parking_6',
 'home_parking_7',
 'home_parking_8',
 'home_parking_9',
 'home_parking_8_x',
 'home_electricity_access',
 'hsa_distance_1_1_x',
 'hsa_distance_1_2_x',
 'future_purchase',
 'next_purchase',
 'purchase_timing',
 'mode_freq_1',
 'mode_freq_2',
 'mode_freq_3',
 'mode_freq_4',
 'mode_freq_5',
 'mode_freq_6',
 'mode_freq_7',
 'mode_freq_8',
 'mode_freq_9',
 'mode_freq_10',
 'mode_freq_11',
 'mode_freq_12',
 'autonomous_rideshare',
 'housing_1_8_x',
 'solar_future']

In [8]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(ml_input_df[ml_input_categorical]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(ml_input_categorical)
encode_df.head()

Unnamed: 0,drive_freq_,drive_freq_1,drive_freq_2,drive_freq_3,drive_freq_4,job_type_,job_type_1,job_type_2,job_type_3,job_type_4,...,housing_1_8_x_attached single family house,housing_1_8_x_casa,housing_1_8_x_extra room,housing_1_8_x_family home,housing_1_8_x_granny flat,housing_1_8_x_guest house,housing_1_8_x_manufactured sfr,solar_future_,solar_future_1,solar_future_2
0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [9]:
# Merge one-hot encoded features and drop the originals
ml_input_final = ml_input_df.merge(encode_df,left_index=True,right_index=True).drop(ml_input_categorical,1)
ml_input_final

Unnamed: 0,sampno,gender,employment,student,education,transit_freq,tnc_freq,age_grp,california,county,...,housing_1_8_x_attached single family house,housing_1_8_x_casa,housing_1_8_x_extra room,housing_1_8_x_family home,housing_1_8_x_granny flat,housing_1_8_x_guest house,housing_1_8_x_manufactured sfr,solar_future_,solar_future_1,solar_future_2
0,190001,1,4,4,6,0,0,4,1,38,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,190003,2,4,4,8,0,0,4,1,30,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,190003,2,4,4,8,0,0,4,1,30,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,190005,1,4,4,6,0,0,4,1,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,190005,1,4,4,6,0,0,4,1,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17835,190803,2,1,4,8,0,0,2,1,19,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
17836,190803,2,1,4,8,0,0,2,1,19,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
17837,190803,2,1,4,8,0,0,2,1,19,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
17838,190806,1,1,4,2,0,0,2,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


## Build our Model

### Predict EV Ownership Based on Personal Demographics

In [10]:
# Split our preprocessed data into our features and target arrays
X=ml_input_final.drop(columns='ev_flag', axis='columns')
y=ml_input_final['ev_flag']

In [11]:
print(X.shape)

(17840, 558)


In [12]:
X[:10]

Unnamed: 0,sampno,gender,employment,student,education,transit_freq,tnc_freq,age_grp,california,county,...,housing_1_8_x_attached single family house,housing_1_8_x_casa,housing_1_8_x_extra room,housing_1_8_x_family home,housing_1_8_x_granny flat,housing_1_8_x_guest house,housing_1_8_x_manufactured sfr,solar_future_,solar_future_1,solar_future_2
0,190001,1,4,4,6,0,0,4,1,38,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,190003,2,4,4,8,0,0,4,1,30,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,190003,2,4,4,8,0,0,4,1,30,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,190005,1,4,4,6,0,0,4,1,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,190005,1,4,4,6,0,0,4,1,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5,190005,2,4,4,6,0,0,4,1,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6,190005,2,4,4,6,0,0,4,1,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7,190004,2,4,1,4,0,2,2,1,19,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
8,190004,1,1,4,3,0,0,2,1,19,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9,190004,2,4,4,1,1,0,2,1,19,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [13]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=1)

In [14]:
# Determine the shape of our training and testing sets.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(13380, 558)
(4460, 558)
(13380,)
(4460,)


In [15]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

### Decision Tree Model

#### Fit the Decision Tree Model

In [18]:
# Creating the decision tree classifier instance.
model = tree.DecisionTreeClassifier()
# Fitting the model.
model = model.fit(X_train_scaled, y_train)

#### Make Predictions with the Decision Tree Model

In [19]:
# Making predictions using the testing data.
predictions = model.predict(X_test_scaled)

#### Evaluate the Model

In [20]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,3989,137
Actual 1,252,82


In [22]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)
acc_score

0.912780269058296

In [23]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,3989,137
Actual 1,252,82


Accuracy Score : 0.912780269058296
Classification Report
              precision    recall  f1-score   support

           0       0.94      0.97      0.95      4126
           1       0.37      0.25      0.30       334

    accuracy                           0.91      4460
   macro avg       0.66      0.61      0.63      4460
weighted avg       0.90      0.91      0.90      4460



### Random Forest Model

#### Fit the Random Forest Model

In [24]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 

In [25]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

#### Make Predictions Random Forest Model

In [26]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

#### Evaluate the Accuracy

In [35]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,3943,183
Actual 1,232,102


In [36]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)
acc_score

0.9069506726457399

In [37]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,3943,183
Actual 1,232,102


Accuracy Score : 0.9069506726457399
Classification Report
              precision    recall  f1-score   support

           0       0.94      0.96      0.95      4126
           1       0.36      0.31      0.33       334

    accuracy                           0.91      4460
   macro avg       0.65      0.63      0.64      4460
weighted avg       0.90      0.91      0.90      4460



#### Rank the Importance of Features

In [31]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([1.53038117e-02, 1.72252743e-02, 7.42199064e-03, 2.36154189e-03,
       1.69484124e-02, 4.23789397e-03, 4.13196616e-03, 3.04377731e-03,
       0.00000000e+00, 8.23779616e-03, 5.18656090e-03, 5.50303062e-03,
       1.73441184e-02, 1.84045626e-03, 2.47239453e-03, 2.11237406e-03,
       7.33457258e-03, 6.01498706e-03, 3.47570330e-03, 4.27046589e-03,
       2.99576485e-03, 4.02536694e-03, 3.70083733e-03, 2.85384719e-03,
       3.35050246e-03, 3.22813729e-03, 4.40078018e-03, 3.79016252e-03,
       4.45058738e-03, 4.13768058e-03, 4.26959348e-03, 4.66825808e-03,
       4.59178838e-03, 4.61301434e-03, 4.53493253e-03, 4.84824107e-03,
       4.63488514e-03, 1.95909160e-03, 5.18083730e-03, 4.59794150e-03,
       4.15108631e-03, 5.09784548e-03, 4.55072675e-03, 3.95311227e-03,
       3.28957705e-03, 7.30760402e-03, 1.82667676e-03, 5.66227373e-03,
       5.04313074e-03, 1.22117449e-03, 5.74860442e-04, 3.36599174e-03,
       4.34735918e-03, 2.61512226e-03, 2.42017627e-03, 7.40214976e-04,
      

In [32]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.05648149779611265, 'charge_spots_ '),
 (0.04596634881376079, 'home_parking_5_ '),
 (0.03653628188820696, 'home_parking_2_ '),
 (0.03193194202717949, 'home_parking_9_ '),
 (0.03067229241696548, 'home_parking_8_0'),
 (0.030004856253298, 'home_parking_4_ '),
 (0.029923921714037557, 'home_parking_8_ '),
 (0.025302345115768534, 'home_parking_1_ '),
 (0.022445058053730788, 'home_parking_3_ '),
 (0.02128348417424149, 'home_parking_3_0'),
 (0.020323069327305907, 'home_parking_7_0'),
 (0.019383589990842134, 'home_parking_6_ '),
 (0.018847381364838257, 'home_parking_9_0'),
 (0.018147463040477485, 'home_parking_7_ '),
 (0.01734411841351623, 'num_hh_vehicles'),
 (0.017225274272103637, 'gender'),
 (0.016948412441148055, 'education'),
 (0.01626847487809799, 'bev_experience_ '),
 (0.015843093067939233, 'home_electricity_access_ '),
 (0.015303811684654155, 'sampno'),
 (0.012626774953283734, 'phev_experience_2'),
 (0.012559784346439227, 'home_parking_2_0'),
 (0.010050059327967945, 'home_parking_6_0