In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from collections import defaultdict

# Data Preparation

In [None]:
# Load the classification dataset from CSV (first row is the header).
# `squeeze=True` was dropped: the option was deprecated in pandas 1.4 and removed in 2.0,
# and it only ever had an effect on single-column files, which this one is not.
# NOTE(review): absolute, machine-specific path — consider a configurable, relative data directory.
df = pd.read_csv(
    r'C:/Users/ricky/Desktop/Scrivania/Data Mining/DM_2/dataframe_classification.csv',
    header=0,
    parse_dates=True,
)

In [None]:
# Discard the 'Unnamed: 0' column (presumably the index saved with the CSV — confirm),
# then preview the first rows.
df = df.drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio,Hours,Office_Hours,Working_day,Time_blocks_0,Time_blocks_1,Time_blocks_2,Time_blocks_3,Occupancy
0,23.7,26.272,585.2,749.2,0.004764,14,1,1,0,0,1,0,1
1,23.718,26.29,578.4,760.4,0.004773,14,1,1,0,0,1,0,1
2,23.73,26.23,572.666667,769.666667,0.004765,14,1,1,0,0,1,0,1
3,23.7225,26.125,493.75,774.75,0.004744,14,1,1,0,0,1,0,1
4,23.754,26.2,488.6,779.0,0.004767,14,1,1,0,0,1,0,1


In [None]:
# Name of the target column.
class_name = 'Occupancy'

# Columns excluded from the analysis.
columns2remove = ['Hours', 'Humidity']

# Rebind instead of mutating in place: `inplace=True` brings no performance benefit and
# silently alters any earlier reference to the same frame.
df = df.drop(columns=columns2remove)

In [None]:
from data_preparation import prepare_dataset

def prepare_dataset(df, class_name):
    """Run the full preparation pipeline on ``df`` for the target ``class_name``.

    Steps, in order: fill missing values, list the numeric columns, keep a deep
    copy of the un-encoded frame, one-hot encode the features and map the class
    to integer codes, then build the lookup between original and encoded
    feature names.

    Returns a 7-tuple:
        (encoded_df, feature_names, class_values, numeric_columns,
         rdf, real_feature_names, features_map)
    where ``rdf`` is the un-encoded copy restricted to the original feature
    columns followed by the class column(s).
    """
    imputed = remove_missing_values(df)
    numeric_columns = get_numeric_columns(imputed)

    # Snapshot taken before encoding so the original columns stay available.
    rdf = imputed.copy(deep=True)

    encoded, feature_names, class_values = one_hot_encoding(imputed, class_name)
    real_feature_names = get_real_feature_names(rdf, numeric_columns, class_name)

    if isinstance(class_name, list):
        target_columns = class_values
    else:
        target_columns = [class_name]
    rdf = rdf[real_feature_names + target_columns]

    features_map = get_features_map(feature_names, real_feature_names)

    return (
        encoded,
        feature_names,
        class_values,
        numeric_columns,
        rdf,
        real_feature_names,
        features_map,
    )

def remove_missing_values(df):
    """Fill the missing values of ``df`` column by column and return the frame.

    Columns with a numeric dtype are filled with the column mean; every other
    column is filled with its first mode (most frequent value). Columns that
    have no missing values are left untouched.

    The frame passed in is updated (columns are assigned back onto it) and the
    same object is returned.

    Raises:
        IndexError: if a non-numeric column has no non-missing value at all,
            because it then has no mode to fill with.
    """
    missing_counts = df.isna().sum()
    for column_name in missing_counts[missing_counts > 0].index:
        column = df[column_name]
        if pd.api.types.is_numeric_dtype(column):
            fill_value = column.mean()
        else:
            fill_value = column.mode().iloc[0]
        # Assign the result back instead of calling fillna(inplace=True) on the
        # selected column: that chained form acts on a temporary object and does
        # not update `df` under pandas Copy-on-Write.
        df[column_name] = column.fillna(fill_value)
    return df

def get_numeric_columns(df):
    """Return the names of the numeric columns of ``df`` as a list, in frame order."""
    numeric_frame = df._get_numeric_data()
    return [*numeric_frame.columns]

def get_real_feature_names(rdf, numeric_columns, class_name):
    """List the feature columns of ``rdf``: numeric ones first, then the rest.

    The column equal to ``class_name`` is excluded. Within each group the
    columns keep the order they have in ``rdf``.
    """
    candidates = [name for name in rdf.columns if name != class_name]
    numeric_part = [name for name in candidates if name in numeric_columns]
    other_part = [name for name in candidates if name not in numeric_columns]
    return numeric_part + other_part

def one_hot_encoding(df, class_name):
    """One-hot encode the feature columns and map the class to integer codes.

    Every column except ``class_name`` goes through ``pd.get_dummies``; encoded
    columns are named ``<column>=<value>``. The class values are sorted and
    replaced by their position in that sorted order.

    Returns:
        (encoded_df, feature_names, class_values) where ``encoded_df`` holds the
        encoded features followed by the integer-coded class column,
        ``feature_names`` lists the encoded feature columns, and
        ``class_values`` is the sorted list of original class values (the list
        index is the integer code).
    """
    feature_columns = [c for c in df.columns if c != class_name]
    # The dummy dtype is pinned: pandas >= 2.0 produces bool indicators by
    # default, while earlier releases produced uint8. Fixing it keeps the
    # encoded matrix numeric and identical across pandas versions.
    dfX = pd.get_dummies(df[feature_columns], prefix_sep='=', dtype='uint8')

    class_values = sorted(df[class_name].unique())
    class_name_map = {value: code for code, value in enumerate(class_values)}
    dfY = df[class_name].map(class_name_map)

    encoded = pd.concat([dfX, dfY], axis=1)
    feature_names = list(dfX.columns)
    return encoded, feature_names, class_values

def get_features_map(feature_names, real_feature_names):
    """Group encoded feature names under the index of their original feature.

    Both lists are walked in parallel. An encoded name belongs to the current
    original feature when it is identical to it (column left as is) or when it
    starts with ``<original name>=`` (one-hot column, using the ``=`` separator
    that ``one_hot_encoding`` passes to ``pd.get_dummies``).

    Returns:
        defaultdict(dict) mapping the original feature index ``j`` to
        ``{encoded_name: j}`` for each encoded column attached to it.
    """
    features_map = defaultdict(dict)
    i = 0
    j = 0

    while i < len(feature_names) and j < len(real_feature_names):
        encoded_name = feature_names[i]
        real_name = real_feature_names[j]
        if encoded_name == real_name:
            features_map[j][encoded_name] = j
            i += 1
            j += 1
        # Require the separator after the prefix: a bare startswith(real_name)
        # would also attach e.g. 'c2=p' to the feature 'c'.
        elif encoded_name.startswith(real_name + '='):
            features_map[j][encoded_name] = j
            i += 1
        else:
            j += 1
    return features_map

In [None]:
# Run the preparation pipeline and unpack its seven results.
res = prepare_dataset(df, class_name)
(
    df,
    feature_names,
    class_values,
    numeric_columns,
    rdf,
    real_feature_names,
    features_map,
) = res
df.head()

Unnamed: 0,Temperature,Light,CO2,HumidityRatio,Office_Hours,Working_day,Time_blocks_0,Time_blocks_1,Time_blocks_2,Time_blocks_3,Occupancy
0,23.7,585.2,749.2,0.004764,1,1,0,0,1,0,1
1,23.718,578.4,760.4,0.004773,1,1,0,0,1,0,1
2,23.73,572.666667,769.666667,0.004765,1,1,0,0,1,0,1
3,23.7225,493.75,774.75,0.004744,1,1,0,0,1,0,1
4,23.754,488.6,779.0,0.004767,1,1,0,0,1,0,1


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the four continuous columns with min-max scaling; the remaining columns are left as they are.
# NOTE(review): the scaler is fitted on the whole dataset, i.e. before the train/test
# split further down, so test rows influence the scaling parameters — confirm this is intended.
columns = ['Temperature', 'Light', 'CO2', 'HumidityRatio']
sc_X = MinMaxScaler()
df[columns] = sc_X.fit_transform(df[columns].to_numpy())

# Data Partitioning

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score 

from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.metrics import roc_curve, auc, roc_auc_score

In [None]:
# Every column except the target is used as a predictor.
attributes = [name for name in df.columns if name != class_name]
X = df[attributes].to_numpy()
y = df[class_name]

# 70/30 hold-out split, stratified on the target so both parts keep the class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=100,
)

In [None]:
# Sanity check: shape of the training feature matrix as (rows, columns).
X_train.shape

(14392, 10)

In [None]:
# NOTE(review): scratch calculation — where 71 comes from is not shown in this notebook
# (presumably a square-root rule of thumb for a hyper-parameter); document it or remove the cell.
np.sqrt(71)

8.426149773176359

# Bagging

In [None]:
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

If no base estimator is passed to `BaggingClassifier` (i.e. it is left as `None`), a decision tree is used as the base estimator.

In [None]:
# Base learner for the ensemble: an entropy-criterion decision tree with limited depth
# and a minimum of 150 samples per split and per leaf.
base_cls = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=10,
    min_samples_split=150,
    min_samples_leaf=150,
    random_state=42,
)

# Number of base classifiers.
# NOTE(review): not referenced by the cells visible here — the ensemble that follows
# is built with a hard-coded n_estimators=100.
num_trees = 500

In [None]:
# Bagging ensemble of 100 copies of the base learner defined above.
# The base learner is passed positionally: the keyword was renamed from `base_estimator`
# to `estimator` in scikit-learn 1.2 and the old name was removed in 1.4, so the
# positional form is the one that runs on both old and new releases.
clf = BaggingClassifier(base_cls, n_estimators=100, random_state=0)
clf.fit(X_train, y_train)

# Evaluate on the held-out test split.
y_pred = clf.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))

Accuracy 0.9889753566796369
F1-score [0.99279356 0.97655172]
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      4743
           1       0.96      0.99      0.98      1425

    accuracy                           0.99      6168
   macro avg       0.98      0.99      0.98      6168
weighted avg       0.99      0.99      0.99      6168



In [None]:
# Base learner for the next ensemble: logistic regression with weak regularisation (large C).
base_cls = LogisticRegression(
    solver='lbfgs',
    C=1000,
    max_iter=500,
    n_jobs=10,
    random_state=0,
)

In [None]:
# Bagging ensemble of 100 copies of the base learner defined above.
# The base learner is passed positionally: the keyword was renamed from `base_estimator`
# to `estimator` in scikit-learn 1.2 and the old name was removed in 1.4, so the
# positional form is the one that runs on both old and new releases.
clf = BaggingClassifier(base_cls, n_estimators=100, random_state=0)
clf.fit(X_train, y_train)

# Evaluate on the held-out test split.
y_pred = clf.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))

Accuracy 0.9904345006485085
F1-score [0.99374801 0.97964815]
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      4743
           1       0.96      1.00      0.98      1425

    accuracy                           0.99      6168
   macro avg       0.98      0.99      0.99      6168
weighted avg       0.99      0.99      0.99      6168



In [None]:
# Base learner for the next ensemble: 7-nearest-neighbours with equally weighted votes.
base_cls = KNeighborsClassifier(
    n_neighbors=7,
    weights='uniform',
)

In [None]:
# Bagging ensemble of 100 copies of the base learner defined above.
# The base learner is passed positionally: the keyword was renamed from `base_estimator`
# to `estimator` in scikit-learn 1.2 and the old name was removed in 1.4, so the
# positional form is the one that runs on both old and new releases.
clf = BaggingClassifier(base_cls, n_estimators=100, random_state=0)
clf.fit(X_train, y_train)

# Evaluate on the held-out test split.
y_pred = clf.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))

Accuracy 0.9930285343709468
F1-score [0.99545887 0.98500174]
              precision    recall  f1-score   support

           0       1.00      0.99      1.00      4743
           1       0.98      0.99      0.99      1425

    accuracy                           0.99      6168
   macro avg       0.99      0.99      0.99      6168
weighted avg       0.99      0.99      0.99      6168



In [None]:
# Bagging ensemble of 10 support-vector classifiers.
# The base learner is passed positionally: the keyword was renamed from `base_estimator`
# to `estimator` in scikit-learn 1.2 and the old name was removed in 1.4, so the
# positional form is the one that runs on both old and new releases.
clf = BaggingClassifier(SVC(C=1000), n_estimators=10, random_state=42)

clf.fit(X_train, y_train)

# Evaluate on the held-out test split.
y_pred = clf.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))

Accuracy 0.9905966277561609
F1-score [0.99385333 0.98      ]
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      4743
           1       0.96      1.00      0.98      1425

    accuracy                           0.99      6168
   macro avg       0.98      0.99      0.99      6168
weighted avg       0.99      0.99      0.99      6168



In [None]:
# Bagging ensemble (default number of estimators) whose base learner is itself a
# 100-tree random forest.
forest = RandomForestClassifier(
    n_estimators=100,
    max_depth=19,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=0,
)
# The base learner is passed positionally: the keyword was renamed from `base_estimator`
# to `estimator` in scikit-learn 1.2 and the old name was removed in 1.4.
# random_state is set on the ensemble as well; without it the bootstrap samples differ
# on every run, so the printed scores could not be reproduced.
clf = BaggingClassifier(forest, random_state=0)
clf.fit(X_train, y_train)

# Evaluate on the held-out test split.
y_pred = clf.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))

Accuracy 0.9706549935149157
F1-score [0.9811045  0.93434893]
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      4743
           1       0.97      0.90      0.93      1425

    accuracy                           0.97      6168
   macro avg       0.97      0.95      0.96      6168
weighted avg       0.97      0.97      0.97      6168



In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# 10-fold cross-validation of a default bagging ensemble on the full dataset.
# random_state is fixed so the bootstrap samples, and therefore the reported score,
# are reproducible across runs.
clf = BaggingClassifier(random_state=0)
scores = cross_val_score(clf, X, y, cv=10)

print('Accuracy %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

Accuracy 0.943 +/- 0.056
