In [None]:
!ls

ACME-HappinessSurvey2020.csv  drive  sample_data


In [1]:
#loading in libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import itertools #using this for the combinatorial feature selection process

In [2]:
#importing dataset
data = pd.read_csv('ACME-HappinessSurvey2020.csv')

print(data.head(5))

   Y  X1  X2  X3  X4  X5  X6
0  0   3   3   3   4   2   4
1  0   3   2   3   5   4   3
2  1   5   3   3   3   3   5
3  0   5   4   3   3   3   5
4  0   5   4   3   3   3   5


In [3]:
#splitting X and y
X_columns = data.iloc[:, 1:].columns
print(X_columns)

X = pd.DataFrame(data, columns=X_columns)
y = data.iloc[:, 0].values

# splitting into train / test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape)
print(y_train.shape)

Index(['X1', 'X2', 'X3', 'X4', 'X5', 'X6'], dtype='object')
(100, 6)
(100,)


In [4]:
print(X.head(5))

print(data.head(5))

print(data.shape)

print(X.shape)

print(y.shape)


   X1  X2  X3  X4  X5  X6
0   3   3   3   4   2   4
1   3   2   3   5   4   3
2   5   3   3   3   3   5
3   5   4   3   3   3   5
4   5   4   3   3   3   5
   Y  X1  X2  X3  X4  X5  X6
0  0   3   3   3   4   2   4
1  0   3   2   3   5   4   3
2  1   5   3   3   3   3   5
3  0   5   4   3   3   3   5
4  0   5   4   3   3   3   5
(126, 7)
(126, 6)
(126,)


**Setting up Combinatorial Feature Selection next!!**  
the hope is to go through every combination of features available in the dataset for all 3 of the models

I'm not sure if I should do hyperparameter tuning first, but I will do the feature selection on relatively baseline models

In [5]:
sgd_results = []
logreg_results = []
rfc_results = []
# mm_sgd_results = []

#looping over feature combinations
for f in range (0, len(X.columns)+1):
  for subset in itertools.combinations(X.columns, f):
    if len(subset) == 0:
      continue #avoiding empty feature set

    # setting up my models
    SGD = SGDClassifier(loss='hinge')
    logreg = LogisticRegression()
    rfc = RandomForestClassifier(n_estimators=100, random_state=224)

    #perform cross-val
    score_sgd = cross_val_score(SGD, X[list(subset)], y, cv=6).mean()
    score_logreg = cross_val_score(logreg, X[list(subset)], y, cv=6).mean()
    score_rfc = cross_val_score(rfc, X[list(subset)], y, cv=6).mean()
   # mm_score_sgd = cross_val_score(SGD, X_scale_minmax[list(subset)], y, cv=6).mean()

    #save feature combination and mean cross-val score
    sgd_results.append({
        'features': subset,
        'score': score_sgd})

    logreg_results.append({
        'features': subset,
        'score': score_logreg})

    rfc_results.append({
        'features': subset,
        'score': score_rfc})

print("Itertools loop done!")

Itertools loop done!


In [None]:
sgd_results

[{'features': ('X1',), 'score': 0.5158730158730157},
 {'features': ('X2',), 'score': 0.49999999999999994},
 {'features': ('X3',), 'score': 0.5714285714285714},
 {'features': ('X4',), 'score': 0.48412698412698413},
 {'features': ('X5',), 'score': 0.4682539682539682},
 {'features': ('X6',), 'score': 0.49206349206349204},
 {'features': ('X1', 'X2'), 'score': 0.5396825396825397},
 {'features': ('X1', 'X3'), 'score': 0.5158730158730158},
 {'features': ('X1', 'X4'), 'score': 0.5476190476190476},
 {'features': ('X1', 'X5'), 'score': 0.5555555555555555},
 {'features': ('X1', 'X6'), 'score': 0.5317460317460316},
 {'features': ('X2', 'X3'), 'score': 0.5317460317460316},
 {'features': ('X2', 'X4'), 'score': 0.45238095238095233},
 {'features': ('X2', 'X5'), 'score': 0.5396825396825397},
 {'features': ('X2', 'X6'), 'score': 0.484126984126984},
 {'features': ('X3', 'X4'), 'score': 0.5},
 {'features': ('X3', 'X5'), 'score': 0.5476190476190476},
 {'features': ('X3', 'X6'), 'score': 0.5476190476190476}

In [None]:
logreg_results

[{'features': ('X1',), 'score': 0.6349206349206349},
 {'features': ('X2',), 'score': 0.5317460317460317},
 {'features': ('X3',), 'score': 0.5555555555555555},
 {'features': ('X4',), 'score': 0.49999999999999994},
 {'features': ('X5',), 'score': 0.5952380952380952},
 {'features': ('X6',), 'score': 0.6031746031746031},
 {'features': ('X1', 'X2'), 'score': 0.626984126984127},
 {'features': ('X1', 'X3'), 'score': 0.5873015873015873},
 {'features': ('X1', 'X4'), 'score': 0.6349206349206349},
 {'features': ('X1', 'X5'), 'score': 0.5793650793650794},
 {'features': ('X1', 'X6'), 'score': 0.6031746031746031},
 {'features': ('X2', 'X3'), 'score': 0.5714285714285714},
 {'features': ('X2', 'X4'), 'score': 0.5079365079365079},
 {'features': ('X2', 'X5'), 'score': 0.5714285714285715},
 {'features': ('X2', 'X6'), 'score': 0.5873015873015873},
 {'features': ('X3', 'X4'), 'score': 0.5634920634920635},
 {'features': ('X3', 'X5'), 'score': 0.5873015873015873},
 {'features': ('X3', 'X6'), 'score': 0.57936

In [None]:
rfc_results

[{'features': ('X1',), 'score': 0.6507936507936507},
 {'features': ('X2',), 'score': 0.5555555555555555},
 {'features': ('X3',), 'score': 0.5238095238095237},
 {'features': ('X4',), 'score': 0.5158730158730159},
 {'features': ('X5',), 'score': 0.5873015873015873},
 {'features': ('X6',), 'score': 0.6031746031746031},
 {'features': ('X1', 'X2'), 'score': 0.5714285714285713},
 {'features': ('X1', 'X3'), 'score': 0.6428571428571428},
 {'features': ('X1', 'X4'), 'score': 0.5873015873015873},
 {'features': ('X1', 'X5'), 'score': 0.6428571428571428},
 {'features': ('X1', 'X6'), 'score': 0.6428571428571428},
 {'features': ('X2', 'X3'), 'score': 0.4603174603174603},
 {'features': ('X2', 'X4'), 'score': 0.5476190476190476},
 {'features': ('X2', 'X5'), 'score': 0.611111111111111},
 {'features': ('X2', 'X6'), 'score': 0.48412698412698413},
 {'features': ('X3', 'X4'), 'score': 0.5317460317460317},
 {'features': ('X3', 'X5'), 'score': 0.5317460317460316},
 {'features': ('X3', 'X6'), 'score': 0.53968

In [6]:
# printing the highest score for each model dictionary
print('SGD: ', max(sgd_results, key=lambda x: x['score']))
print('LogReg: ', max(logreg_results, key=lambda x: x['score']))
print('RandForest: ', max(rfc_results, key=lambda x: x['score']))


SGD:  {'features': ('X2', 'X4', 'X5'), 'score': 0.5714285714285715}
LogReg:  {'features': ('X1',), 'score': 0.6349206349206349}
RandForest:  {'features': ('X1', 'X3', 'X6'), 'score': 0.7063492063492064}


I want to get the SGD model with MinMax features to work now!

In [None]:
# creating the dataframe of features with MinMax entry values

X = data.drop(columns=['Y']) #drop the Y column

mm_scale = MinMaxScaler() #initialize MinMax Scaler

mm_X_scaled = mm_scale.fit_transform(X)

mm_X_df = pd.DataFrame(mm_X_scaled, columns=X.columns)

print(mm_X_df.head(10))



     X1    X2    X3    X4    X5    X6
0  0.50  0.50  0.50  0.75  0.25  0.75
1  0.50  0.25  0.50  1.00  0.75  0.50
2  1.00  0.50  0.50  0.50  0.50  1.00
3  1.00  0.75  0.50  0.50  0.50  1.00
4  1.00  0.75  0.50  0.50  0.50  1.00
5  1.00  1.00  0.50  1.00  1.00  1.00
6  0.50  0.00  0.25  0.25  0.00  0.50
7  1.00  0.75  0.75  0.75  0.75  1.00
8  0.75  0.00  0.75  0.75  0.75  0.75
9  0.75  0.75  0.75  0.25  1.00  1.00


In [None]:
mm_sgd_results = [] #dictionary to save feature combination results

#looping over feature combinations
for f in range (0, len(mm_X_df.columns)+1):
  for subset in itertools.combinations(mm_X_df.columns, f):
    if len(subset) == 0:
      continue #avoiding empty feature set

    # setting up my models
    SGD_mm = SGDClassifier(loss='hinge')

    #perform cross-val
    score_sgd_mm = cross_val_score(SGD_mm, mm_X_df[list(subset)], y, cv=6).mean()
    # mm_score_sgd = cross_val_score(SGD, X_scale_minmax[list(subset)], y, cv=6).mean()

    #save feature combination and mean cross-val score
    mm_sgd_results.append({
        'features': subset,
        'score': score_sgd})


printing the highest accurary and its associated feature combination for the MinMax Scaled data and SDG model

In [None]:
mm_sgd_results

print('MinMax SGD: ', max(mm_sgd_results, key=lambda x: x['score']))

MinMax SGD:  {'features': ('X1',), 'score': 0.49206349206349204}


In [None]:
#testing LogReg and RandForest model performance when given the MinMax Scaled data

mm_logreg_results = []
mm_rfc_results = []

#looping over feature combinations
for f in range (0, len(mm_X_df.columns)+1):
  for subset in itertools.combinations(mm_X_df.columns, f):
    if len(subset) == 0:
      continue #avoiding empty feature set

    # setting up my models
    logreg = LogisticRegression()
    rfc = RandomForestClassifier(n_estimators=100, random_state=224)

    #perform cross-val
    mm_score_logreg = cross_val_score(logreg, mm_X_df[list(subset)], y, cv=6).mean()
    mm_score_rfc = cross_val_score(rfc, mm_X_df[list(subset)], y, cv=6).mean()
   # mm_score_sgd = cross_val_score(SGD, X_scale_minmax[list(subset)], y, cv=6).mean()

    #save feature combination and mean cross-val score
    mm_logreg_results.append({
        'features': subset,
        'score': score_logreg})

    mm_rfc_results.append({
        'features': subset,
        'score': score_rfc})

print("Itertools loop done!")

Itertools loop done!


In [None]:
# printing the highest score for each model

print('MinMax SGD: ', max(mm_sgd_results, key=lambda x: x['score']))
print('MinMax LogReg: ', max(mm_logreg_results, key=lambda x: x['score']))
print('MinMax RandForest: ', max(mm_rfc_results, key=lambda x: x['score']))


MinMax SGD:  {'features': ('X1',), 'score': 0.49206349206349204}
MinMax LogReg:  {'features': ('X1',), 'score': 0.5396825396825397}
MinMax RandForest:  {'features': ('X1',), 'score': 0.5555555555555556}


NEXT STEP:  
Combine the highest performing feature set with the highest performing hyperparameter spae for each model.   
the feature set it found above   
the hyperparameter set is found in the hyperopt python file

In [None]:
# Rand Forest hyperparams: 'criterion': ‘entropy’, 'max_depth': 3.0, 'max_features': ‘log2’, 'n_estimators': 70.0

# MinMax SGD hyperparams: 'alpha': 0.10328753530570639, 'loss': ‘squared_error’, 'max_iter': 240000.0, 'penalty': ‘l1’
#### need to increase max_iter for the MinMax SGD model by A LOT
# Reg Data SGD hyperparams: 'alpha': 0.009701678103409387, 'loss': ‘squared_hinge’, 'max_iter': 470000.0, 'penalty': ‘l2’

# no optim hyperparam for LogReg yet

## Rand Forest optim feats: ('X1', 'X3', 'X6')
## SGD optim feats: ('X1', 'X3', 'X5')
## LogReg optim feats:  ('X1')

X_rf = data[['X1', 'X3', 'X6']]
X_sgd = data[['X1', 'X3', 'X5']]
X_logreg = data[['X1']]

rfc_scores = []
sgd_scores = []
logreg_scores = []

for i in range(10):

  #create models
  rfc = RandomForestClassifier(criterion='entropy', max_depth=3, max_features='log2', n_estimators=70)
  sgd = SGDClassifier(alpha=0.1033, loss='squared_error', max_iter=500000, penalty='l1')
  logreg = LogisticRegression()

  #fit models and use cross_val
  rfc_score = cross_val_score(rfc, X_rf, y, cv=6).mean()
  sgd_score = cross_val_score(sgd, X_sgd, y, cv=6).mean()
  logreg_score = cross_val_score(logreg, X_logreg, y, cv=6).mean()

  rfc_scores.append(rfc_score)
  sgd_scores.append(sgd_score)
  logreg_scores.append(logreg_score)

print("Random Forest Scores: ", rfc_scores)
print("SGD Classifier Scores: ", sgd_scores)
print("Logistic Regression scores: ", logreg_scores)





Random Forest Scores:  [0.6031746031746033, 0.611111111111111, 0.5952380952380952, 0.5952380952380952, 0.6111111111111112, 0.6349206349206349, 0.6031746031746031, 0.6031746031746031, 0.5952380952380952, 0.626984126984127]
SGD Classifier Scores:  [0.42857142857142855, 0.5952380952380952, 0.49206349206349204, 0.4603174603174603, 0.48412698412698413, 0.5079365079365079, 0.4047619047619047, 0.6507936507936507, 0.5714285714285714, 0.373015873015873]
Logistic Regression scores:  [0.6349206349206349, 0.6349206349206349, 0.6349206349206349, 0.6349206349206349, 0.6349206349206349, 0.6349206349206349, 0.6349206349206349, 0.6349206349206349, 0.6349206349206349, 0.6349206349206349]




In [None]:
print("Random Forest Scores: ", rfc_scores)
print("SGD Classifier Scores: ", sgd_scores)
print("Logistic Regression scores: ", logreg_scores)

Random Forest Scores:  [0.626984126984127, 0.6190476190476191, 0.6269841269841271, 0.6031746031746031, 0.6031746031746031, 0.6190476190476191, 0.6269841269841269, 0.6269841269841271, 0.6349206349206349, 0.5873015873015873]
SGD Classifier Scores:  [0.3650793650793651, 0.4761904761904761, 0.49206349206349204, 0.4365079365079365, 0.4841269841269842, 0.373015873015873, 0.5476190476190476, 0.34920634920634924, 0.43650793650793646, 0.5476190476190476]
Logistic Regression scores:  [0.6349206349206349, 0.6349206349206349, 0.6349206349206349, 0.6349206349206349, 0.6349206349206349, 0.6349206349206349, 0.6349206349206349, 0.6349206349206349, 0.6349206349206349, 0.6349206349206349]


In [None]:
# print the max of each model array
print('RandForest: ', max(rfc_scores))
print('SGD: ', max(sgd_scores))
print('LogReg: ', max(logreg_scores))

RandForest:  0.6349206349206349
SGD:  0.5476190476190476
LogReg:  0.6349206349206349


**Now running Itertools Feature Combination on the 2
Hyperparameter tuned models**

Using the optimal hyperparameter setup that was found during HyperOpt for SGD Classifier and Random Forest Classifier
and then running through all feature combinations to find the highest accuracy

In [12]:
# using the same loop as the first iterloop

###### optimal hyperparameters for each model:
# Rand Forest hyperparams: 'criterion': ‘entropy’, 'max_depth': 3.0, 'max_features': ‘log2’, 'n_estimators': 70.0
# Reg Data SGD hyperparams: 'alpha': 0.009701678103409387, 'loss': ‘squared_hinge’, 'max_iter': 470000.0, 'penalty': ‘l2’

hyp_sgd_results = []
hyp_rfc_results = []

#looping over feature combinations
for f in range (0, len(X.columns)+1):
  for subset in itertools.combinations(X.columns, f):

    if len(subset) == 0:
      continue #avoiding empty feature set

    # setting up my models
    rfc = RandomForestClassifier(criterion='entropy', max_depth=3, max_features='log2', n_estimators=70)
    #sgd = SGDClassifier(alpha=0.1033, loss='squared_error', max_iter=10000000, penalty='l1')

    #perform cross-val
    #score_sgd = cross_val_score(sgd, X[list(subset)], y, cv=6).mean()
    score_rfc = cross_val_score(rfc, X[list(subset)], y, cv=6).mean()

    #save feature combination and mean cross-val score
    #hyp_sgd_results.append({
    #    'features': subset,
    #    'score': score_sgd})

    hyp_rfc_results.append({
        'features': subset,
        'score': score_rfc})

print("Itertools loop done!")


Itertools loop done!


In [13]:
print(hyp_rfc_results)

[{'features': ('X1',), 'score': 0.619047619047619}, {'features': ('X2',), 'score': 0.5317460317460317}, {'features': ('X3',), 'score': 0.5158730158730158}, {'features': ('X4',), 'score': 0.5158730158730159}, {'features': ('X5',), 'score': 0.5317460317460317}, {'features': ('X6',), 'score': 0.6031746031746031}, {'features': ('X1', 'X2'), 'score': 0.5952380952380952}, {'features': ('X1', 'X3'), 'score': 0.611111111111111}, {'features': ('X1', 'X4'), 'score': 0.6111111111111112}, {'features': ('X1', 'X5'), 'score': 0.6587301587301587}, {'features': ('X1', 'X6'), 'score': 0.5952380952380952}, {'features': ('X2', 'X3'), 'score': 0.4603174603174603}, {'features': ('X2', 'X4'), 'score': 0.5079365079365079}, {'features': ('X2', 'X5'), 'score': 0.5555555555555555}, {'features': ('X2', 'X6'), 'score': 0.5555555555555555}, {'features': ('X3', 'X4'), 'score': 0.5079365079365079}, {'features': ('X3', 'X5'), 'score': 0.5238095238095238}, {'features': ('X3', 'X6'), 'score': 0.5555555555555555}, {'fea

In [15]:
print('RandForest: ', max(hyp_rfc_results, key=lambda x: x['score']))


RandForest:  {'features': ('X1', 'X5'), 'score': 0.6587301587301587}
