In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

%matplotlib inline
import math
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import os
import importlib
import seaborn as sns

from collections import Counter
from collections import defaultdict
from scipy.stats.stats import pearsonr
from pandas.plotting import scatter_matrix
from scipy.stats import norm

from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.metrics import roc_curve, auc, roc_auc_score

In [2]:
from collections import Counter

In [3]:
# Walk the local data folder and echo the full path of every file found in it.
data_root = r'C:\Users\loren\Desktop\DM_II'
for dirname, _, filenames in os.walk(data_root):
    for filename in filenames:
        print(os.path.join(dirname, filename))

C:\Users\loren\Desktop\DM_II\dataframe_classification.csv
C:\Users\loren\Desktop\DM_II\datatest.txt
C:\Users\loren\Desktop\DM_II\datatest2.txt
C:\Users\loren\Desktop\DM_II\datatraining.txt


In [4]:
# Name of the target column used by every later cell.
class_name = 'Occupancy'

# Load the classification table; the first CSV column becomes the index and
# '?' is treated as a missing value in addition to pandas' default NA markers.
df = pd.read_csv(
    r'C:\Users\loren\Desktop\DM_II\dataframe_classification.csv',
    index_col=0,
    na_values='?',
    keep_default_na=True,
    skipinitialspace=True,
)

In [5]:
# Show one randomly chosen row as a quick sanity check of the loaded columns.
df.sample(n=1)

Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio,Hours,Office_Hours,Working_day,Time_blocks_0,Time_blocks_1,Time_blocks_2,Time_blocks_3,Occupancy
11117,21.0,30.066667,0.0,537.0,0.004625,19,0,1,0,0,0,1,0


In [6]:
def prepare_dataset(df, class_name):
    """Impute, encode and describe a dataset in one pass.

    Steps, in order: missing values are imputed via ``remove_missing_values``,
    the numeric columns are recorded, a deep copy of the imputed (not yet
    encoded) frame is kept, and the frame is one-hot encoded.

    Returns a 7-tuple:
        df                 -- the one-hot encoded frame (features + encoded class)
        feature_names      -- column names of the encoded features
        class_values       -- sorted distinct class labels
        numeric_columns    -- names of the numeric columns before encoding
        rdf                -- un-encoded copy restricted to the real features
                              followed by the class column(s)
        real_feature_names -- original feature names, numeric ones first
        features_map       -- mapping built by ``get_features_map``
    """
    df = remove_missing_values(df)
    numeric_columns = get_numeric_columns(df)

    # Snapshot taken before encoding so the original columns stay available.
    rdf = df.copy(deep=True)

    df, feature_names, class_values = one_hot_encoding(df, class_name)
    real_feature_names = get_real_feature_names(rdf, numeric_columns, class_name)

    # A list-valued class_name selects the class value columns instead.
    if isinstance(class_name, list):
        target_columns = class_values
    else:
        target_columns = [class_name]
    rdf = rdf[real_feature_names + target_columns]

    features_map = get_features_map(feature_names, real_feature_names)

    return (
        df,
        feature_names,
        class_values,
        numeric_columns,
        rdf,
        real_feature_names,
        features_map,
    )

def remove_missing_values(df):
    """Impute missing values column by column and return the frame.

    Columns with a numeric dtype are filled with the column mean; every other
    column is filled with its most frequent value (first mode). A non-numeric
    column that is entirely missing has no mode and is left untouched.

    The frame passed in is modified (its columns are reassigned) and the same
    object is returned, so existing callers that rely on either the return
    value or the in-place effect keep working.
    """
    missing_per_column = df.isna().sum()
    for column_name in missing_per_column[missing_per_column > 0].index:
        column = df[column_name]
        if pd.api.types.is_numeric_dtype(column):
            fill_value = column.mean()
        else:
            modes = column.mode()
            if modes.empty:
                # Every value is missing: there is nothing to impute from.
                continue
            fill_value = modes.iloc[0]
        # Assign the filled column back instead of calling
        # ``df[col].fillna(..., inplace=True)``: the chained in-place form acts
        # on a temporary under pandas Copy-on-Write and leaves ``df`` unchanged.
        df[column_name] = column.fillna(fill_value)
    return df

def get_numeric_columns(df):
    """Return, as a plain list, the column names pandas reports as numeric data for ``df``."""
    return [column for column in df._get_numeric_data().columns]

def get_real_feature_names(rdf, numeric_columns, class_name):
    """List the feature columns of ``rdf`` with numeric ones first.

    The class column is skipped. Columns listed in ``numeric_columns`` come
    first, followed by the remaining columns; each group keeps the order in
    which the columns appear in ``rdf``.
    """
    numeric_part = []
    other_part = []
    for column in rdf.columns:
        if column != class_name:
            bucket = numeric_part if column in numeric_columns else other_part
            bucket.append(column)
    return numeric_part + other_part

def one_hot_encoding(df, class_name):
    """One-hot encode the features and integer-encode the class column.

    Every column except ``class_name`` is passed through ``pd.get_dummies``
    with ``'='`` between the column name and the category value. The class
    labels are sorted and replaced by their position in that sorted order.

    Returns ``(encoded_frame, feature_names, class_values)`` where
    ``encoded_frame`` is the encoded features followed by the encoded class,
    ``feature_names`` are the encoded feature columns and ``class_values``
    are the sorted distinct class labels.
    """
    predictor_columns = [column for column in df.columns if column != class_name]
    encoded_predictors = pd.get_dummies(df[predictor_columns], prefix_sep='=')

    # Sorted distinct labels -> consecutive integer codes starting at 0.
    label_to_code = {}
    for code, label in enumerate(sorted(df[class_name].unique())):
        label_to_code[label] = code
    encoded_target = df[class_name].map(label_to_code)

    encoded = pd.concat([encoded_predictors, encoded_target], axis=1)
    return encoded, list(encoded_predictors.columns), sorted(label_to_code)

def get_features_map(feature_names, real_feature_names, prefix_sep='='):
    """Group encoded feature names under the original feature they derive from.

    Both lists are walked in parallel. An encoded name belongs to the original
    feature at position ``j`` when it is identical to it (column left as-is)
    or when it starts with ``<original><prefix_sep>`` (a dummy column).

    Matching on the name *plus separator* matters: a bare prefix test would
    file ``'ab=p'`` under the feature ``'a'`` whenever a column name happens
    to be a prefix of a later one.

    Parameters
    ----------
    feature_names : list of str
        Encoded column names, in encoded-frame order.
    real_feature_names : list of str
        Original column names, in the matching order.
    prefix_sep : str, default '='
        Separator placed between column name and category value by the
        encoder (``one_hot_encoding`` in this file uses ``'='``).

    Returns
    -------
    defaultdict(dict)
        ``{j: {encoded_name: j, ...}}`` keyed by the position ``j`` of the
        original feature in ``real_feature_names``.
    """
    features_map = defaultdict(dict)
    i = 0
    j = 0

    while i < len(feature_names) and j < len(real_feature_names):
        encoded = feature_names[i]
        original = real_feature_names[j]
        if encoded == original:
            # Column passed through unchanged: one-to-one, advance both sides.
            features_map[j][encoded] = j
            i += 1
            j += 1
        elif encoded.startswith(original + prefix_sep):
            # Dummy column of the current original feature; more may follow.
            features_map[j][encoded] = j
            i += 1
        else:
            # Current original feature is exhausted; try the next one.
            j += 1
    return features_map

In [7]:
# Run the full preparation pipeline; ``df`` is rebound to the encoded frame.
res = prepare_dataset(df, class_name)
(
    df,
    feature_names,
    class_values,
    numeric_columns,
    rdf,
    real_feature_names,
    features_map,
) = res
df.head(5)

Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio,Hours,Office_Hours,Working_day,Time_blocks_0,Time_blocks_1,Time_blocks_2,Time_blocks_3,Occupancy
0,23.7,26.272,585.2,749.2,0.004764,14,1,1,0,0,1,0,1
1,23.718,26.29,578.4,760.4,0.004773,14,1,1,0,0,1,0,1
2,23.73,26.23,572.666667,769.666667,0.004765,14,1,1,0,0,1,0,1
3,23.7225,26.125,493.75,774.75,0.004744,14,1,1,0,0,1,0,1
4,23.754,26.2,488.6,779.0,0.004767,14,1,1,0,0,1,0,1


In [8]:
# Number of rows per class label, most frequent first.
df.loc[:, class_name].value_counts()

0    15810
1     4750
Name: Occupancy, dtype: int64

In [9]:
# Undersample the positive class (label 1) so that only N_POSITIVE_KEPT positive
# rows remain, producing a strongly imbalanced dataset.
# A seeded generator makes the selection identical on every Restart & Run All,
# and the number of rows to drop is derived from the data instead of relying on
# a hard-coded positive count.
RANDOM_SEED = 42
N_POSITIVE_KEPT = 659

positive_index = df.index[df[class_name] == 1]
n_rows_to_remove = max(len(positive_index) - N_POSITIVE_KEPT, 0)  # never negative
rng = np.random.default_rng(RANDOM_SEED)
rows2remove = rng.choice(positive_index.to_numpy(), size=n_rows_to_remove, replace=False)

In [10]:
# Total number of rows vs. number of positive rows selected for removal.
(df.shape[0], len(rows2remove))

(20560, 4091)

In [11]:
# Build the imbalanced frame by dropping the sampled positive rows by index label.
df2 = df.drop(index=rows2remove)
print(df2.shape[0])

16469


In [12]:
# Per-class row counts of the undersampled frame.
class_counts = Counter(df2[class_name])
print('Dataset shape %s' % class_counts)

Dataset shape Counter({0: 15810, 1: 659})
