In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix, roc_curve, auc, roc_auc_score
from collections import defaultdict
from sklearn.decomposition import PCA
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import GridSearchCV

In [2]:
#Importing the dataset 
df=pd.read_csv("/home/umair/Desktop/Data Science and BI/data mining/occupancy_data/DataCleaned.csv")
test=pd.read_csv("/home/umair/Desktop/Data Science and BI/data mining/occupancy_data/DataCleanedTest.csv")
test2=pd.read_csv("/home/umair/Desktop/Data Science and BI/data mining/occupancy_data/DataCleanedTest2.csv")

In [3]:
#Drop the useless column
df.drop(['Unnamed: 0'], axis=1, inplace =True)
df.drop(['date'], axis=1, inplace =True)
df.drop(['DayName'], axis=1, inplace =True)

test.drop(['Unnamed: 0'], axis=1, inplace =True)
test.drop(['date'], axis=1, inplace =True)
test.drop(['DayName'], axis=1, inplace =True)

test2.drop(['Unnamed: 0'], axis=1, inplace =True)
test2.drop(['date'], axis=1, inplace =True)
test2.drop(['DayName'], axis=1, inplace =True)

In [4]:
#preparing the two datasets splitting the attributes with the class and selecting the right attributes
attributes = [col for col in df.columns if 
              col == 'IsWorkDay' 
              or col == 'Hour' 
              or col == 'Light'
              or col == 'IsWorkHour'
              or col == 'Temperature' 
              or col == 'Humidity'
              or col == 'CO2'
              or col == 'HumidityRatio'
             ]
X_train = df[attributes].values
y_train = df['Occupancy']

attributes = [col for col in df.columns if 
              col == 'IsWorkDay' 
              or col == 'Hour' 
              or col == 'Light'
              or col == 'IsWorkHour'
              or col == 'Temperature' 
              or col == 'Humidity'
              or col == 'CO2'
              or col == 'HumidityRatio'
             ]
X_test = test[attributes].values
y_test = test['Occupancy']

attributes = [col for col in test2.columns if col != 'Occupancy']
X_test2 = test2[attributes].values
y_test2 = test2['Occupancy']

In [5]:
#Avoid the warnings
import warnings
warnings.simplefilter("ignore")

In [6]:
#normalization of the datasets using the StandardScaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

scaler = StandardScaler()
X_test = scaler.fit_transform(X_test)

scaler = StandardScaler()
X_test2 = scaler.fit_transform(X_test2)

In [7]:
#HyperTuning parameters with GridSearch of the kernel=linear crossing between training set, test set1
# and test set 2. Searching for the best C
features_combs_list = [
    ('IsWorkDay', 'Hour', 'IsWorkHour', 'Temperature', 'Humidity', 'Light', 'CO2')
]

hyper_params_space = [
    {
        'kernel': ['linear'],
        'C': np.arange(0.1, 5, 0.1)
    },
]

for features in features_combs_list:
    print(features)
    print('===================================')
    X = X_train
    X_t1 = X_test
    X_t2 = X_test2

    svc = GridSearchCV(SVC(), hyper_params_space,
                       scoring='accuracy')
    svc.fit(X, y_train)
    
    print('Best parameters set:')
    print(svc.best_params_)
    print()
    
    preds = [
        (svc.predict(X), y_train, 'Train'),
        (svc.predict(X_t1), y_test, 'Test1'),
        (svc.predict(X_t2), y_test2, 'Test2')
    ]
    
    for pred in preds:
        print('Accuracy %s' % accuracy_score(pred[1], pred[0]))
        print()
        print(pred[2] + ' Classification Report:')
        print()
        print(classification_report(pred[1], pred[0]))
        print()
        print(pred[2] + ' Confusion Matrix:')
        print(confusion_matrix(pred[1], pred[0]))
        print()

('IsWorkDay', 'Hour', 'IsWorkHour', 'Temperature', 'Humidity', 'Light', 'CO2')
Best parameters set:
{'C': 0.1, 'kernel': 'linear'}

Accuracy 0.9882107331450325

Train Classification Report:

              precision    recall  f1-score   support

           0       1.00      0.99      0.99      6414
           1       0.95      0.99      0.97      1729

    accuracy                           0.99      8143
   macro avg       0.98      0.99      0.98      8143
weighted avg       0.99      0.99      0.99      8143


Train Confusion Matrix:
[[6328   86]
 [  10 1719]]

Accuracy 0.6352720450281426

Test1 Classification Report:

              precision    recall  f1-score   support

           0       0.64      1.00      0.78      1693
           1       0.00      0.00      0.00       972

    accuracy                           0.64      2665
   macro avg       0.32      0.50      0.39      2665
weighted avg       0.40      0.64      0.49      2665


Test1 Confusion Matrix:
[[1693    0]
 [ 97

In [8]:
#HyperTuning parameters with GridSearch of the kernel=rbf crossing between training set, test set1
# and test set 2. Searching for the best gamma
features_combs_list = [
    ('IsWorkDay', 'Hour', 'IsWorkHour', 'Temperature', 'Humidity', 'Light', 'CO2')
]

hyper_params_space = [
    #{
    #    'kernel': ['linear'],
    #    'C': [0.1, 1, 10, 50, 100]
    #},
    {
        'kernel': ['rbf'],
        'gamma': np.arange(0.1, 5, 0.1)
    },
]

for features in features_combs_list:
    print(features)
    print('===================================')
    X = X_train
    X_t1 = X_test
    X_t2 = X_test2

    svc = GridSearchCV(SVC(), hyper_params_space,
                       scoring='accuracy')
    svc.fit(X, y_train)
    
    print('Best parameters set:')
    print(svc.best_params_)
    print()
    
    preds = [
        (svc.predict(X), y_train, 'Train'),
        (svc.predict(X_t1), y_test, 'Test1'),
        (svc.predict(X_t2), y_test2, 'Test2')
    ]
    
    for pred in preds:
        print('Accuracy %s' % accuracy_score(pred[1], pred[0]))
        print()
        print(pred[2] + ' Classification Report:')
        print()
        print(classification_report(pred[1], pred[0]))
        print()
        print(pred[2] + ' Confusion Matrix:')
        print(confusion_matrix(pred[1], pred[0]))
        print()

('IsWorkDay', 'Hour', 'IsWorkHour', 'Temperature', 'Humidity', 'Light', 'CO2')
Best parameters set:
{'gamma': 0.1, 'kernel': 'rbf'}

Accuracy 0.9885791477342503

Train Classification Report:

              precision    recall  f1-score   support

           0       1.00      0.99      0.99      6414
           1       0.95      0.99      0.97      1729

    accuracy                           0.99      8143
   macro avg       0.98      0.99      0.98      8143
weighted avg       0.99      0.99      0.99      8143


Train Confusion Matrix:
[[6333   81]
 [  12 1717]]

Accuracy 0.9613508442776736

Test1 Classification Report:

              precision    recall  f1-score   support

           0       0.97      0.97      0.97      1693
           1       0.95      0.94      0.95       972

    accuracy                           0.96      2665
   macro avg       0.96      0.96      0.96      2665
weighted avg       0.96      0.96      0.96      2665


Test1 Confusion Matrix:
[[1648   45]
 [  

In [21]:
#HyperTuning parameters with GridSearch of the kernel=poly crossing between training set, test set1
# and test set 2. Searching for the best gamma and degree
features_combs_list = [
    ('IsWorkDay', 'Hour', 'IsWorkHour', 'Temperature', 'Humidity', 'Light', 'CO2')
]

hyper_params_space = [
    {
        'kernel': ['poly'],
        'gamma': np.arange(1, 10),
        'degree': [4]
    },
]

for features in features_combs_list:
    print(features)
    print('===================================')
    X = X_train
    X_t1 = X_test
    X_t2 = X_test2

    svc = GridSearchCV(SVC(), hyper_params_space,
                       scoring='accuracy')
    svc.fit(X, y_train)
    
    print('Best parameters set:')
    print(svc.best_params_)
    print()
    
    preds = [
        (svc.predict(X), y_train, 'Train'),
        (svc.predict(X_t1), y_test, 'Test1'),
        (svc.predict(X_t2), y_test2, 'Test2')
    ]
    
    for pred in preds:
        print('Accuracy %s' % accuracy_score(pred[1], pred[0]))
        print()
        print(pred[2] + ' Classification Report:')
        print()
        print(classification_report(pred[1], pred[0]))
        print()
        print(pred[2] + ' Confusion Matrix:')
        print(confusion_matrix(pred[1], pred[0]))
        print()

('IsWorkDay', 'Hour', 'IsWorkHour', 'Temperature', 'Humidity', 'Light', 'CO2')
Best parameters set:
{'degree': 4, 'gamma': 8, 'kernel': 'poly'}

Accuracy 0.9964386589708952

Train Classification Report:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6414
           1       0.99      1.00      0.99      1729

    accuracy                           1.00      8143
   macro avg       0.99      1.00      0.99      8143
weighted avg       1.00      1.00      1.00      8143


Train Confusion Matrix:
[[6392   22]
 [   7 1722]]

Accuracy 0.8056285178236398

Test1 Classification Report:

              precision    recall  f1-score   support

           0       0.77      1.00      0.87      1693
           1       1.00      0.47      0.64       972

    accuracy                           0.81      2665
   macro avg       0.88      0.73      0.75      2665
weighted avg       0.85      0.81      0.78      2665


Test1 Confusion Matrix:
[[169