In [None]:
import os
import warnings                                  # `do not disturbe` mode
warnings.filterwarnings('ignore')

import numpy as np                               # vectors and matrices
import pandas as pd                              # tables and data manipulations
import matplotlib.pyplot as plt                  # plots
                         # more plots

from dateutil.relativedelta import relativedelta # working with dates with style
from scipy.optimize import minimize              # for function minimization

import statsmodels.formula.api as smf            # statistics and econometrics
import statsmodels.tsa.api as smt
import statsmodels.api as sm
import scipy.stats as scs
from statsmodels.tsa.seasonal import seasonal_decompose

from itertools import product                    # some useful functions
from tqdm import tqdm_notebook

import seaborn as sns
sns.set_style(
    style='whitegrid', 
    rc={'axes.facecolor': '.95', 'grid.color': '.95'}
)
# sns.set_palette(palette='deep')
%matplotlib inline
plt.rcParams['figure.figsize'] = [12, 6]
plt.rcParams['figure.dpi'] = 100

os.environ['PYTHONHASHSEED'] = str(2)
np.random.seed(2)

In [None]:
df = pd.read_csv('values_office.csv')
df.head()

In [None]:
df

In [None]:
# Rows whose label is -1; occupancy for these rows is predicted further down.
# .copy() makes this an independent frame, so the later column assignment and
# re-indexing do not act on a slice of `df`.
estimation_data = df[df.label == -1].copy()

In [None]:
# Every row whose label is not -1; these rows are used to fit the classifiers.
# .copy() makes this an independent frame, so the later column assignment and
# re-indexing do not act on a slice of `df`.
train_data = df[df.label != -1].copy()

In [None]:
import plotly.express as px
fig = px.histogram(train_data, x="label")
fig.show()

In [None]:
train_data

In [None]:
# Parse the `time` column into datetimes. `assign` returns new frames instead
# of writing into frames that were sliced out of `df`, so no chained-assignment
# warning is triggered.
train_data = train_data.assign(time=pd.to_datetime(train_data['time']))
estimation_data = estimation_data.assign(time=pd.to_datetime(estimation_data['time']))

In [None]:
# Use the timestamp as the index so the frames can be resampled by time.
# Rebinding (rather than inplace=True) avoids mutating a frame derived from `df`.
train_data = train_data.set_index('time')
estimation_data = estimation_data.set_index('time')

In [None]:
# Aggregate both frames to hourly means. For the estimation data, hours that
# come out as NaN are filled with the last available hourly value.
# `.ffill()` replaces `.pad()`, which was removed in pandas 2.0, and 'h'
# replaces the deprecated 'H' frequency alias.
train_data = train_data.resample('h').mean()
estimation_data = estimation_data.resample('h').mean().ffill()

In [None]:
train_data.head()

In [None]:
import plotly.express as px
fig = px.histogram(train_data, x="label")
fig.show()

In [None]:
def level(x):
    """Map a numeric value onto one of three integer levels.

    Returns 2 when ``x >= 1.5``, 1 when ``0.5 <= x < 1.5`` and 0 otherwise.
    A value for which every comparison is false (e.g. NaN) also yields 0.
    """
    # Test the highest threshold first so each guard can return immediately.
    if x >= 1.5:
        return 2
    if x >= 0.5:
        return 1
    return 0

In [None]:
# train_data.columns = ['Toffice_reference', 'humidity', 'detected_motions', 'power',
#        'office_CO2_concentration', 'door', 'occup']

In [None]:
# Discretise the hourly mean label into the integer classes returned by level().
train_data['occup'] = train_data['label'].map(level)

In [None]:
import plotly.express as px
fig = px.histogram(train_data, x="occup")
fig.show()

In [None]:
train_data.occup.value_counts()

In [None]:
# train_data.drop(['label'], axis =1, inplace=True)
# estimation_data.drop(['label'], axis =1, inplace=True)

In [None]:
# Separate the target column from the feature matrix.
# NOTE(review): only `occup` is removed here, so X still contains `label`, the
# column `occup` is computed from. That looks like target leakage; confirm
# whether `label` should be dropped too (and from estimation_data before it is
# passed to predict).
target = "occup"
Y = train_data[target]
X = train_data.drop(columns=[target])

In [None]:
sns.boxplot(data=X, orient="v", palette="Set2")

In [None]:
# #normalize data
# X = (X - X.mean())/X.std()

In [None]:
X

In [None]:
sns.boxplot(data=X, orient="v", palette="Set2")

In [None]:
# Pairplot of every feature, coloured by the target class.
# `assign` builds a fresh frame; the previous `X[:]` slice followed by a column
# assignment was a chained assignment on a slice of X.
merged = X.assign(**{target: Y})
sns.pairplot(merged, hue = target)

In [None]:
X.columns

In [None]:
# Pairplot restricted to two features, coloured by the target class and saved
# to disk.
merged = X[['power', 'detected_motions']].assign(**{target: Y})
sns.pairplot(merged, hue = target)
# savefig does not create missing directories, so make sure img/ exists first.
os.makedirs('img', exist_ok=True)
plt.savefig('img/parplot-m.eps')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC 
from sklearn.metrics import classification_report, accuracy_score, recall_score, precision_score, confusion_matrix, roc_curve, auc
from sklearn.model_selection import GridSearchCV

In [None]:
#Split data into 80% training set and 20% testing set
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.20, random_state=5)

In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
#Build and train the model
svc_model = SVC()
svc_model.fit(X_train, y_train)

In [None]:
#predict the unseen test set and draw the confusion matrix
y_predict = svc_model.predict(X_test)
cm = confusion_matrix(y_test, y_predict)

In [None]:
print('Accuracy score: %.2f%%' %(accuracy_score(y_test, y_predict)*100))  
#print('Precision score: %.2f%%' % (precision_score(y_test, y_predict)*100))
#print('Recall score: %.2f%%' % (recall_score(y_test, y_predict)*100))

In [None]:
sns.heatmap(cm, annot=True, fmt='g').set(title='Confusion matrix')

In [None]:
# #Plot the confusion matrix and the roc curve
# fig, [ax1, ax2] = plt.subplots(1,2)
# fpr, tpr, _ = roc_curve(y_test,y_predict)
# ax1.plot(fpr, tpr, lw = 2, label = 'AUC: {:.2f}'.format(auc(fpr, tpr)))
# ax1.plot([0, 1], [0, 1],
#             linestyle = '--',
#             color = (0.6, 0.6, 0.6),
#             label = 'Random guessing')
# ax1.plot([0, 0, 1], [0, 1, 1],
#             linestyle = ':',
#             color = 'black', 
#             label = 'Perfect performance')
# ax1.set_xlim([-0.05, 1.05])
# ax1.set_ylim([-0.05, 1.05])
# ax1.set_xlabel('False Positive Rate (FPR)')
# ax1.set_ylabel('True Positive Rate (TPR)')
# ax1.set_title('Receiver Operator Characteristic (ROC) Curve')
# ax1.legend(loc = "lower right")
# fig.tight_layout() 
# sns.heatmap(cm, annot=True, fmt='g', ax=ax2).set(title='Confusion matrix')
# # -----------------------------------------------------
# plt.show()


In [None]:
### Improving the model
# Candidate values for C and gamma of an RBF-kernel SVC (4 x 4 combinations).
param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001], 'kernel': ['rbf']}

In [None]:
# Improve the model with an exhaustive grid search over param_grid.
# refit=True retrains the best parameter combination on the whole training set.
grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=0)

In [None]:
%%time
grid.fit(X_train,y_train)

In [None]:
# Predict the classes of the test set with the refitted best estimator.
grid_predictions = grid.predict(X_test)

In [None]:
cm = confusion_matrix(y_test, grid_predictions)

In [None]:
print('Accuracy score: %.2f%%' %(accuracy_score(y_test, grid_predictions)*100))  
# print('Precision score: %.2f%%' % (precision_score(y_test, grid_predictions)*100))
# print('Recall score: %.2f%%' % (recall_score(y_test, grid_predictions)*100))

In [None]:
sns.heatmap(cm, annot=True, fmt='g').set(title='Confusion matrix')

In [None]:
from sklearn.metrics import classification_report

print(classification_report(y_test, grid_predictions, digits=4 ))

In [None]:
### Known classification models:

In [None]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV

def run_classifier(clf, param_grid, title, X_train, X_test):
    """Tune ``clf`` with a randomized search and report its test-set performance.

    Ten parameter settings are sampled from ``param_grid`` and scored by
    accuracy with 3-fold stratified cross-validation. The refitted best
    estimator is then evaluated on ``X_test``: the classification report is
    printed, and a confusion matrix and ROC curve(s) are drawn side by side.

    The labels are not parameters: ``y_train`` and ``y_test`` are read from the
    notebook's global namespace and must correspond row-wise to ``X_train``
    and ``X_test``.

    Parameters
    ----------
    clf : estimator
        Unfitted scikit-learn classifier; it must provide ``predict_proba``.
    param_grid : dict
        Passed to ``RandomizedSearchCV`` as ``param_distributions``.
    title : str
        Shown as the figure title.
    X_train, X_test : array-like
        Features used to fit the search and to evaluate the best estimator.
    """
    # -----------------------------------------------------
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=123)
    # Randomized search. The former `iid=False` argument is gone: the parameter
    # was removed from the search classes in scikit-learn 0.24 and passing it
    # raises a TypeError there.
    n_iter_search = 10
    gs = RandomizedSearchCV(clf,
                            param_distributions=param_grid,
                            n_iter=n_iter_search,
                            cv=cv,
                            scoring='accuracy',
                            verbose=1,
                            n_jobs=-1)
    # -----------------------------------------------------
    # Train model
    gs.fit(X_train, y_train)
    print("The best parameters are %s" % (gs.best_params_))
    best = gs.best_estimator_
    # Predict on test set
    y_pred = best.predict(X_test)
    # Probability estimates, one column per entry of best.classes_
    y_prob = best.predict_proba(X_test)
    # -----------------------------------------------------
    print(classification_report(y_test, y_pred, digits=4))
    # -----------------------------------------------------
    fig, [ax1, ax2] = plt.subplots(1, 2, figsize=(21, 7))
    fig.suptitle(title)
    # Plot confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, cbar=False, fmt="d", linewidths=.5, cmap="Blues", ax=ax1)
    ax1.set_title("Confusion Matrix")
    ax1.set_xlabel("Predicted class")
    ax1.set_ylabel("Actual class")
    # -----------------------------------------------------
    # Plot ROC curve(s). roc_curve only accepts binary targets, so with more
    # than two classes each class is scored one-vs-rest against its own
    # probability column.
    classes = best.classes_
    y_true = np.asarray(y_test)
    if len(classes) == 2:
        fpr, tpr, _ = roc_curve(y_true, y_prob[:, 1], pos_label=classes[1])
        ax2.plot(fpr, tpr, lw=2, label='AUC: {:.2f}'.format(auc(fpr, tpr)))
    else:
        for column, cls in enumerate(classes):
            is_cls = (y_true == cls).astype(int)
            fpr, tpr, _ = roc_curve(is_cls, y_prob[:, column])
            ax2.plot(fpr, tpr, lw=2,
                     label='Class {} vs rest, AUC: {:.2f}'.format(cls, auc(fpr, tpr)))
    ax2.plot([0, 1], [0, 1],
             linestyle='--',
             color=(0.6, 0.6, 0.6),
             label='Random guessing')
    ax2.plot([0, 0, 1], [0, 1, 1],
             linestyle=':',
             color='black',
             label='Perfect performance')
    ax2.set_xlim([-0.05, 1.05])
    ax2.set_ylim([-0.05, 1.05])
    ax2.set_xlabel('False Positive Rate (FPR)')
    ax2.set_ylabel('True Positive Rate (TPR)')
    ax2.set_title('Receiver Operator Characteristic (ROC) Curve')
    ax2.legend(loc="lower right")
    fig.tight_layout()
    # -----------------------------------------------------
    plt.show()

#### LogisticRegression

In [None]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()

param_grid = {'penalty': ['l2'],
              'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}

In [None]:
run_classifier(lr, param_grid, 'Logistic Regression', X_train, X_test)

#### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

dtree = DecisionTreeClassifier()

# Search space for the decision tree.
# 'auto' is no longer listed under max_features: for classifiers it was an alias
# of 'sqrt' (so it only duplicated a candidate) and scikit-learn 1.3 removed it.
param_grid = {'criterion': ['gini', 'entropy'],
              'splitter': ['best', 'random'],
              'max_depth': np.arange(1, 20, 2),
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 2, 4, 10],
              'max_features': ['sqrt', 'log2', None]}

In [None]:
run_classifier(dtree, param_grid, "Decision Tree", X_train, X_test)

#### Random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()

# Search space for the random forest.
# 'auto' is no longer listed under max_features: for classifiers it was an alias
# of 'sqrt' (so it only duplicated a candidate) and scikit-learn 1.3 removed it.
param_grid = {'n_estimators': [100, 200],
              'max_depth': [10, 20, 100, None],
              'max_features': ['sqrt', None],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 2, 4, 10],
              'bootstrap': [True, False],
              'criterion': ['gini', 'entropy']}

In [None]:
run_classifier(rf, param_grid, 'Random Forest', X_train, X_test)


### Final 


In [None]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()

# Search space for the final random forest.
# 'auto' is no longer listed under max_features: for classifiers it was an alias
# of 'sqrt' (so it only duplicated a candidate) and scikit-learn 1.3 removed it.
param_grid = {'n_estimators': [100, 200],
              'max_depth': [10, 20, 100, None],
              'max_features': ['sqrt', None],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 2, 4, 10],
              'bootstrap': [True, False],
              'criterion': ['gini', 'entropy']}

In [None]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV

def run_classifier_2(clf, param_grid, title, X_train, X_test):
    """Tune ``clf`` with a randomized search and return the fitted search object.

    Ten parameter settings are sampled from ``param_grid`` and scored by
    accuracy with 3-fold stratified cross-validation. The refitted best
    estimator is evaluated on ``X_test``: its accuracy is printed and the
    confusion matrix is plotted.

    The labels are not parameters: ``y_train`` and ``y_test`` are read from the
    notebook's global namespace and must correspond row-wise to ``X_train``
    and ``X_test``.

    Parameters
    ----------
    clf : estimator
        Unfitted scikit-learn classifier.
    param_grid : dict
        Passed to ``RandomizedSearchCV`` as ``param_distributions``.
    title : str
        Shown as the figure title.
    X_train, X_test : array-like
        Features used to fit the search and to evaluate the best estimator.

    Returns
    -------
    RandomizedSearchCV
        The fitted search; the tuned model is ``best_estimator_``.
    """
    # -----------------------------------------------------
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=123)
    # Randomized search. The former `iid=False` argument is gone: the parameter
    # was removed from the search classes in scikit-learn 0.24 and passing it
    # raises a TypeError there.
    n_iter_search = 10
    gs = RandomizedSearchCV(clf,
                            param_distributions=param_grid,
                            n_iter=n_iter_search,
                            cv=cv,
                            scoring='accuracy',
                            verbose=1,
                            n_jobs=-1)
    # -----------------------------------------------------
    # Train model
    gs.fit(X_train, y_train)
    print("The best parameters are %s" % (gs.best_params_))
    # Predict on test set. The probability estimates that used to be computed
    # here were never used, so that call is dropped.
    y_pred = gs.best_estimator_.predict(X_test)
    # -----------------------------------------------------
    print('Accuracy score: %.2f%%' % (accuracy_score(y_test, y_pred) * 100))
    # -----------------------------------------------------
    # Only the confusion matrix is drawn, so a single axes is created instead
    # of leaving a second, empty panel next to it.
    fig, ax = plt.subplots(figsize=(10, 7))
    fig.suptitle(title)
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, cbar=False, fmt="d", linewidths=.5, cmap="Blues", ax=ax)
    ax.set_title("Confusion Matrix")
    ax.set_xlabel("Predicted class")
    ax.set_ylabel("Actual class")
    fig.tight_layout()
    # -----------------------------------------------------
    plt.show()
    return gs

In [None]:
# Run the randomized search for the random forest and keep the returned search object.
# NOTE(review): this rebinds `rf` from the bare estimator to the search object,
# so re-running this cell alone passes the search object back in as `clf`;
# consider a separate name for the result.
rf = run_classifier_2(rf, param_grid, 'Random Forest', X_train, X_test)


In [None]:
# Estimate occupancy for the estimation rows with the tuned forest.
# Selecting X_train's columns passes exactly the training features in the
# training order, so the cell still works when re-run after `occup` has been
# added to estimation_data.
predict = rf.best_estimator_.predict(estimation_data[X_train.columns])

In [None]:
pred = rf.best_estimator_.predict(X_test)

In [None]:

sns.scatterplot(x=y_test, y=pred)
plt.xlabel('Real Occupancy')
plt.ylabel('Estimated Occupancy')
# plt.savefig('img/estimationn.eps')


In [None]:
from sklearn.metrics import classification_report

# Per-class metrics on the held-out test split. The previous call compared
# `final_test`, which is not defined anywhere in this notebook, with `predict`,
# the estimates for the estimation rows. The test-set predictions in `pred` are
# scored against `y_test` instead.
print(classification_report(y_test, pred, digits=4))

In [None]:
estimation_data['occup'] = predict
# estimation_data['time'] = estim_time

In [None]:
estimation_data.occup.value_counts()

In [None]:
train_data

In [None]:
estimation_data

In [None]:
frames = [train_data, estimation_data]

final_df =  pd.concat(frames)

In [None]:
final_df

In [None]:
final_df.occup.value_counts()

In [None]:
import plotly.express as px
fig = px.histogram(final_df, x="occup")
fig.show()

In [None]:
# final_df.to_csv('ob-occupancy-bin.csv')

In [None]:
final_df