In [1]:
%matplotlib inline

import os
from collections import defaultdict
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix, roc_curve, auc, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

In [2]:
# Load the cleaned occupancy data: one training split and two test splits.
# The data directory is defined once instead of being repeated in every path.
# Set the OCCUPANCY_DATA_DIR environment variable to run the notebook from
# another machine without editing the code; when it is unset the original
# location is used, so existing behaviour is unchanged.
DATA_DIR = Path(os.environ.get(
    "OCCUPANCY_DATA_DIR",
    "/home/umair/Desktop/Data Science and BI/data mining/occupancy_data",
))

df = pd.read_csv(DATA_DIR / "DataCleaned.csv")
test = pd.read_csv(DATA_DIR / "DataCleanedTest.csv")
test2 = pd.read_csv(DATA_DIR / "DataCleanedTest2.csv")

In [3]:
# Remove the columns that are not used as features from every split.
# The three frames get exactly the same treatment, so the column list is
# defined once and applied in a loop instead of nine copy-pasted calls.
# The drop is done in place, so on a second execution of this cell the columns
# are already gone; errors='ignore' keeps the cell safe to re-run instead of
# raising KeyError.
# NOTE(review): 'Unnamed: 0' is presumably an index column written by an
# earlier to_csv call - confirm against the cleaning step.
UNUSED_COLUMNS = ['Unnamed: 0', 'date', 'DayName']

for frame in (df, test, test2):
    frame.drop(columns=UNUSED_COLUMNS, inplace=True, errors='ignore')

In [4]:
# Separate the feature columns from the 'Occupancy' target for each split.
# The feature list is computed from the training frame only and then reused to
# index both test frames, so all three feature matrices contain the same
# columns in the same order. A test file that lacks one of the training
# columns now fails here with a KeyError instead of yielding a feature matrix
# that does not line up with the one the models were fitted on.
attributes = [col for col in df.columns if col != 'Occupancy']

X_train = df[attributes]
y_train = df['Occupancy']

X_test = test[attributes]
y_test = test['Occupancy']

X_test2 = test2[attributes]
y_test2 = test2['Occupancy']

In [5]:
#Avoid the warnings
import warnings
warnings.simplefilter("ignore")

In [6]:
# Bagging ensemble of 100 estimators built on the library's default base
# learner (a decision tree), evaluated on the first test split.
# The base learner is left at its default rather than passed as an explicit
# `base_estimator=None`: that keyword was renamed in newer scikit-learn
# releases, while omitting it selects the same default on every version.
clf = BaggingClassifier(n_estimators=100, random_state=0)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# Overall accuracy, per-class F1 (average=None returns one score per class),
# then the full precision/recall/F1 table.
print('Accuracy', accuracy_score(y_test, y_pred))
print('F1-score', f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))

Accuracy 0.9478424015009381
F1-score [0.95953421 0.92664908]
              precision    recall  f1-score   support

           0       0.95      0.97      0.96      1693
           1       0.95      0.90      0.93       972

    accuracy                           0.95      2665
   macro avg       0.95      0.94      0.94      2665
weighted avg       0.95      0.95      0.95      2665



In [7]:
from sklearn.svm import SVC

In [8]:
# Bagging ensemble of 10 support-vector classifiers (C=1000), evaluated on the
# first test split.
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, but it has
# remained the first positional argument, so this form works on both.
clf = BaggingClassifier(SVC(C=1000), n_estimators=10, random_state=0)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# Overall accuracy, per-class F1 (average=None returns one score per class),
# then the full precision/recall/F1 table.
print('Accuracy', accuracy_score(y_test, y_pred))
print('F1-score', f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))

Accuracy 0.9339587242026266
F1-score [0.94554455 0.91611058]
              precision    recall  f1-score   support

           0       0.99      0.90      0.95      1693
           1       0.85      0.99      0.92       972

    accuracy                           0.93      2665
   macro avg       0.92      0.95      0.93      2665
weighted avg       0.94      0.93      0.93      2665



In [9]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC, SVC

# Grid search over bagging ensembles built on several different base learners.
# Every feature subset listed in `features_combs_list` is searched on its own;
# the best configuration found is then scored on the training split and on
# both test splits.

# Feature subsets to evaluate. Only the full set is active; the smaller
# subsets are kept (disabled) as alternative experiments.
features_combs_list = [
    ('IsWorkDay', 'Hour', 'IsWorkHour', 'Temperature', 'Humidity', 'Light', 'CO2'),
    #('IsWorkDay', 'Hour', 'IsWorkHour', 'Temperature', 'Light', 'CO2'),
    #('IsWorkHour', 'Temperature', 'Light', 'CO2'),
    #('Temperature', 'Light', 'CO2'),
    #('Light', 'CO2')
]

# The BaggingClassifier parameter that carries the base learner was renamed
# from `base_estimator` to `estimator` in newer scikit-learn releases. Look
# the name up on the installed version so the grid is valid on either.
_bagging_params = BaggingClassifier().get_params()
base_learner_key = 'estimator' if 'estimator' in _bagging_params else 'base_estimator'

hyper_params_space = [
    {
        # Candidate base learners, each with fixed hyper-parameters.
        base_learner_key: [
            RandomForestClassifier(criterion="gini", max_depth=3, min_samples_split=2, min_samples_leaf=1,random_state=0),
            KNeighborsClassifier(n_neighbors=81, weights='uniform'),
            DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_split=2),
            GaussianNB(),
            #SVC(kernel='linear', C=0.1),
            #SVC(kernel='rbf', gamma=0.1)
        ],
        # Ensemble sizes to try.
        'n_estimators': [5, 10, 15, 30],
        # Single fixed seed so the bootstrap sampling is repeatable.
        'random_state': [0]
    },
]

print('Ready for the cicle')

for features in features_combs_list:
    print(features)
    print('===================================')

    # Restrict every split to the current feature subset. The tuple is turned
    # into a list so that .loc unambiguously treats it as a selection of
    # several columns.
    selected = list(features)
    X = X_train.loc[:, selected]
    X_t = X_test.loc[:, selected]
    X_t2 = X_test2.loc[:, selected]

    print('Ready for the GridSearch')

    # `tree` holds the fitted GridSearchCV object (the name is kept unchanged
    # in case later cells refer to it). Candidates are ranked by
    # cross-validated accuracy, using all available cores.
    # NOTE(review): `cv` is left at the library default, whose number of folds
    # depends on the installed scikit-learn version - confirm if exact
    # reproducibility across versions matters.
    tree = GridSearchCV(
        BaggingClassifier(),
        hyper_params_space,
        scoring='accuracy',
        n_jobs=-1,
    )
    tree.fit(X, y_train)

    print('Best parameters set:')
    print(tree.best_params_)
    print()

    # (predictions, true labels, split name) for each split to report on.
    preds = [
        (tree.predict(X), y_train, 'Train'),
        (tree.predict(X_t), y_test, 'Test1'),
        (tree.predict(X_t2), y_test2, 'Test2')
    ]

    for predicted, actual, split_name in preds:
        print(split_name + ' Classification Report:')
        print()
        print('Accuracy %s' % accuracy_score(actual, predicted))
        print()
        print(classification_report(actual, predicted))
        print()
        print(split_name + ' Confusion Matrix:')
        print(confusion_matrix(actual, predicted))
        print()

Ready for the cicle
('IsWorkDay', 'Hour', 'IsWorkHour', 'Temperature', 'Humidity', 'Light', 'CO2')
Ready for the GridSearch
Best parameters set:
{'base_estimator': GaussianNB(priors=None, var_smoothing=1e-09), 'n_estimators': 5, 'random_state': 0}

Train Classification Report:

Accuracy 0.9882107331450325

              precision    recall  f1-score   support

           0       1.00      0.99      0.99      6414
           1       0.95      0.99      0.97      1729

    accuracy                           0.99      8143
   macro avg       0.98      0.99      0.98      8143
weighted avg       0.99      0.99      0.99      8143


Train Confusion Matrix:
[[6328   86]
 [  10 1719]]

Test1 Classification Report:

Accuracy 0.9786116322701689

              precision    recall  f1-score   support

           0       1.00      0.97      0.98      1693
           1       0.95      1.00      0.97       972

    accuracy                           0.98      2665
   macro avg       0.97      0.98  