In [1]:
!pip install psycopg2

Collecting psycopg2
  Downloading psycopg2-2.8.6-cp37-cp37m-win_amd64.whl (1.1 MB)
Installing collected packages: psycopg2
Successfully installed psycopg2-2.8.6


In [1]:
import pandas as pd
from configparser import ConfigParser
import psycopg2
import pandas.io.sql as sqlio
from sqlalchemy import create_engine
import numpy as np
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# CONNECT TO AWS DATABASE

In [2]:
##Note: uncomment this if you are using a config.py file, and change the "password"
# from config import password
password = "postgres"

In [3]:

param_dic={
    "host": "energy-analysis.cfcrgd6zjkoj.us-east-2.rds.amazonaws.com",
    "database": "Energy_Analysis",
    "user" : "postgres",
    "password" : password
}

def connect(params_dic):
    """ Connect to the PostgreSQL database server """
    conn = None
    try:
        # connect to the PostgreSQL server
        print('Connecting to the PostgreSQL database...')
        conn = psycopg2.connect(**params_dic)
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        sys.exit(1) 
    print("Connection successful")
    return conn

conn = connect(param_dic)

Connecting to the PostgreSQL database...
Connection successful


In [4]:
query="select * from ml_input"

ml_input_df=sqlio.read_sql_query(query,conn)

ml_input_df.head()

Unnamed: 0,sampno,gender,employment,student,education,drive_freq,transit_freq,tnc_freq,job_type,work_mode,...,autonomous_hhveh,autonomous_rideshare,autonomous_pooled,autonomous_pref,housing,housing_1_8_x,solar,solar_future,income,ev_flag
0,190001,1,4,4,6,3,0,0,,,...,2,2,1,1,2,,2,2,6,0
1,190003,2,4,4,8,1,0,0,,,...,3,1,4,1,1,,2,2,5,0
2,190003,2,4,4,8,1,0,0,,,...,3,1,4,1,1,,2,2,5,0
3,190005,1,4,4,6,2,0,0,,,...,2,2,3,2,1,,2,2,7,0
4,190005,1,4,4,6,2,0,0,,,...,2,2,3,2,1,,2,2,7,0


In [5]:
ml_input_df.select_dtypes(exclude='int64').columns

Index(['drive_freq', 'job_type', 'work_mode', 'work_distance', 'work_days',
       'school_mode', 'school_distance', 'hybrid_experience', 'past_hybrid',
       'phev_experience', 'past_phev', 'bev_experience', 'past_bev',
       'fcv_experience', 'past_fcv', 'charge_spots', 'charge_work',
       'home_parking_1', 'home_parking_2', 'home_parking_3', 'home_parking_4',
       'home_parking_5', 'home_parking_6', 'home_parking_7', 'home_parking_8',
       'home_parking_9', 'home_parking_8_x', 'home_electricity_access',
       'hsa_distance_1_1_x', 'hsa_distance_1_2_x', 'future_purchase',
       'next_purchase', 'purchase_timing', 'mode_freq_1', 'mode_freq_2',
       'mode_freq_3', 'mode_freq_4', 'mode_freq_5', 'mode_freq_6',
       'mode_freq_7', 'mode_freq_8', 'mode_freq_9', 'mode_freq_10',
       'mode_freq_11', 'mode_freq_12', 'autonomous_rideshare', 'housing_1_8_x',
       'solar_future'],
      dtype='object')

In [8]:
# ml_input_df.columns

## Clean Data

In [6]:
#columns to drop.
ml_input_df =ml_input_df.drop(['charge_spots','home_parking_1', 'home_parking_2', 'home_parking_3', 'home_parking_4',
       'home_parking_5', 'home_parking_6', 'home_parking_7', 'home_parking_8',
       'home_parking_9', 'home_parking_8_x','home_electricity_access','sampno','charge_work'],axis=1)
# maybe these are okay.
ml_input_df=ml_input_df.drop(['bev_experience', 'phev_experience','past_bev',
                              'past_phev'],axis=1)
list(ml_input_df)

['gender',
 'employment',
 'student',
 'education',
 'drive_freq',
 'transit_freq',
 'tnc_freq',
 'job_type',
 'work_mode',
 'work_distance',
 'work_days',
 'school_mode',
 'school_distance',
 'age_grp',
 'california',
 'county',
 'region',
 'future_decision_role',
 'num_hh_vehicles',
 'household_members_1',
 'household_members_2',
 'household_members_3',
 'household_members_4',
 'tot_hh_members',
 'hybrid_experience',
 'past_hybrid',
 'fcv_experience',
 'past_fcv',
 'hydrogen_station_awareness',
 'hsa_distance_1_1_x',
 'hsa_distance_1_2_x',
 'future_purchase',
 'next_purchase',
 'purchase_timing',
 'modes_used_1',
 'modes_used_2',
 'modes_used_3',
 'modes_used_4',
 'modes_used_5',
 'modes_used_6',
 'modes_used_7',
 'modes_used_8',
 'modes_used_9',
 'modes_used_10',
 'modes_used_11',
 'modes_used_12',
 'mode_freq_1',
 'mode_freq_2',
 'mode_freq_3',
 'mode_freq_4',
 'mode_freq_5',
 'mode_freq_6',
 'mode_freq_7',
 'mode_freq_8',
 'mode_freq_9',
 'mode_freq_10',
 'mode_freq_11',
 'mode_fr

## Split Data into Training and Testing

In [7]:
# check the datatypes
ml_input_df.dtypes

gender            int64
employment        int64
student           int64
education         int64
drive_freq       object
                  ...  
housing_1_8_x    object
solar             int64
solar_future     object
income            int64
ev_flag           int64
Length: 77, dtype: object

In [8]:
# Generate our categorical variable lists
ml_input_categorical = ml_input_df.dtypes[ml_input_df.dtypes == "object"].index.tolist()
ml_input_categorical

['drive_freq',
 'job_type',
 'work_mode',
 'work_distance',
 'work_days',
 'school_mode',
 'school_distance',
 'hybrid_experience',
 'past_hybrid',
 'fcv_experience',
 'past_fcv',
 'hsa_distance_1_1_x',
 'hsa_distance_1_2_x',
 'future_purchase',
 'next_purchase',
 'purchase_timing',
 'mode_freq_1',
 'mode_freq_2',
 'mode_freq_3',
 'mode_freq_4',
 'mode_freq_5',
 'mode_freq_6',
 'mode_freq_7',
 'mode_freq_8',
 'mode_freq_9',
 'mode_freq_10',
 'mode_freq_11',
 'mode_freq_12',
 'autonomous_rideshare',
 'housing_1_8_x',
 'solar_future']

In [9]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(ml_input_df[ml_input_categorical]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(ml_input_categorical)
encode_df.head()

Unnamed: 0,drive_freq_,drive_freq_1,drive_freq_2,drive_freq_3,drive_freq_4,job_type_,job_type_1,job_type_2,job_type_3,job_type_4,...,housing_1_8_x_attached single family house,housing_1_8_x_casa,housing_1_8_x_extra room,housing_1_8_x_family home,housing_1_8_x_granny flat,housing_1_8_x_guest house,housing_1_8_x_manufactured sfr,solar_future_,solar_future_1,solar_future_2
0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [10]:
# Merge one-hot encoded features and drop the originals
ml_input_final = ml_input_df.merge(encode_df,left_index=True,right_index=True).drop(ml_input_categorical,1)
ml_input_final

Unnamed: 0,gender,employment,student,education,transit_freq,tnc_freq,age_grp,california,county,region,...,housing_1_8_x_attached single family house,housing_1_8_x_casa,housing_1_8_x_extra room,housing_1_8_x_family home,housing_1_8_x_granny flat,housing_1_8_x_guest house,housing_1_8_x_manufactured sfr,solar_future_,solar_future_1,solar_future_2
0,1,4,4,6,0,0,4,1,38,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2,4,4,8,0,0,4,1,30,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2,4,4,8,0,0,4,1,30,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1,4,4,6,0,0,4,1,7,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1,4,4,6,0,0,4,1,7,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17835,2,1,4,8,0,0,2,1,19,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
17836,2,1,4,8,0,0,2,1,19,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
17837,2,1,4,8,0,0,2,1,19,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
17838,1,1,4,2,0,0,2,1,1,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


## Build our Model

### Predict EV Ownership Based on Personal Demographics

In [11]:
# Split our preprocessed data into our features and target arrays
X=ml_input_final.drop(columns='ev_flag', axis='columns')
y=ml_input_final['ev_flag']

In [12]:
print(X.shape)

(17840, 470)


In [13]:
X[:10]

Unnamed: 0,gender,employment,student,education,transit_freq,tnc_freq,age_grp,california,county,region,...,housing_1_8_x_attached single family house,housing_1_8_x_casa,housing_1_8_x_extra room,housing_1_8_x_family home,housing_1_8_x_granny flat,housing_1_8_x_guest house,housing_1_8_x_manufactured sfr,solar_future_,solar_future_1,solar_future_2
0,1,4,4,6,0,0,4,1,38,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2,4,4,8,0,0,4,1,30,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2,4,4,8,0,0,4,1,30,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1,4,4,6,0,0,4,1,7,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1,4,4,6,0,0,4,1,7,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5,2,4,4,6,0,0,4,1,7,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6,2,4,4,6,0,0,4,1,7,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7,2,4,1,4,0,2,2,1,19,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
8,1,1,4,3,0,0,2,1,19,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9,2,4,4,1,1,0,2,1,19,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [14]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=1)

In [15]:
# Determine the shape of our training and testing sets.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(13380, 470)
(4460, 470)
(13380,)
(4460,)


In [16]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

### Decision Tree Model

#### Fit the Decision Tree Model

In [17]:
# Creating the decision tree classifier instance.
model = tree.DecisionTreeClassifier()
# Fitting the model.
model = model.fit(X_train_scaled, y_train)

#### Make Predictions with the Decision Tree Model

In [18]:
# Making predictions using the testing data.
predictions = model.predict(X_test_scaled)

#### Evaluate the Model

In [19]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,3977,149
Actual 1,265,69


In [21]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)
acc_score

0.9071748878923767

In [22]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,3977,149
Actual 1,265,69


Accuracy Score : 0.9071748878923767
Classification Report
              precision    recall  f1-score   support

           0       0.94      0.96      0.95      4126
           1       0.32      0.21      0.25       334

    accuracy                           0.91      4460
   macro avg       0.63      0.59      0.60      4460
weighted avg       0.89      0.91      0.90      4460



### Random Forest Model

#### Fit the Random Forest Model

In [23]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 

In [24]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

#### Make Predictions Random Forest Model

In [25]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

#### Evaluate the Random Forest Model

In [26]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,3961,165
Actual 1,266,68


In [27]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)
acc_score

0.9033632286995515

In [28]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,3961,165
Actual 1,266,68


Accuracy Score : 0.9033632286995515
Classification Report
              precision    recall  f1-score   support

           0       0.94      0.96      0.95      4126
           1       0.29      0.20      0.24       334

    accuracy                           0.90      4460
   macro avg       0.61      0.58      0.59      4460
weighted avg       0.89      0.90      0.90      4460



#### Rank the Importance of Features

In [29]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([2.01112927e-02, 1.17122302e-02, 3.66911423e-03, 2.44623456e-02,
       7.99700609e-03, 7.16857648e-03, 7.86801609e-03, 0.00000000e+00,
       2.19482276e-02, 1.49732787e-02, 1.04900732e-02, 1.30008896e-02,
       6.28709228e-03, 7.62766407e-03, 6.43589903e-03, 9.83021957e-03,
       1.37219300e-02, 1.15062816e-02, 1.12116341e-02, 8.82637984e-03,
       1.16428811e-02, 1.08191104e-02, 7.98245968e-03, 8.61994814e-03,
       9.10085020e-03, 9.94299845e-03, 1.10257933e-02, 1.05737163e-02,
       1.08052843e-02, 1.09369259e-02, 1.89808609e-02, 1.34179332e-02,
       1.57884131e-02, 1.29907102e-02, 1.51460988e-02, 1.25873483e-02,
       5.13758998e-03, 1.44559115e-02, 1.33147808e-02, 2.08538405e-02,
       1.34736711e-02, 1.58381953e-02, 6.09902918e-03, 1.05780769e-02,
       2.19680336e-02, 2.39335321e-03, 8.08851198e-03, 6.82746732e-03,
       1.79558524e-03, 1.09217014e-03, 5.22828934e-03, 6.87359598e-03,
       3.68908055e-03, 3.96557236e-03, 1.62835697e-03, 5.02502282e-03,
      

In [30]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.024462345598569905, 'education'),
 (0.021968033634811472, 'income'),
 (0.021948227603931447, 'county'),
 (0.020853840522602918, 'autonomous_hhveh'),
 (0.020111292731042126, 'gender'),
 (0.019382339696566, 'hybrid_experience_1'),
 (0.018980860852422156, 'autonomous_aware'),
 (0.015838195301964605, 'autonomous_pref'),
 (0.015788413091257104, 'autonomous_att_2'),
 (0.015146098789360449, 'autonomous_att_4'),
 (0.01497327869787158, 'region'),
 (0.014455911452808293, 'autonomous_att_7'),
 (0.013721930033291477, 'tot_hh_members'),
 (0.013473671140496456, 'autonomous_pooled'),
 (0.013417933178378078, 'autonomous_att_1'),
 (0.013314780780236481, 'autonomous_att_8'),
 (0.013000889621273229, 'num_hh_vehicles'),
 (0.012990710166828448, 'autonomous_att_3'),
 (0.012587348329694657, 'autonomous_att_5'),
 (0.011712230159112593, 'employment'),
 (0.011642881133427692, 'modes_used_3'),
 (0.011506281607745961, 'hydrogen_station_awareness'),
 (0.011211634089255637, 'modes_used_1'),
 (0.0110257932988001