In [None]:
import pandas as pd
import numpy as np

# Statistical libraries
from scipy import stats
from scipy.stats import skew
from scipy.stats import norm

# Plotting libraries
import seaborn as sns
import matplotlib.pyplot as plt


# Preprocessing
from sklearn.preprocessing import StandardScaler

# Evaluation Procedures
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold

# Classification methods
from sklearn.naive_bayes import GaussianNB

# Evaluation Metrics
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

%config InlineBackend.figure_format = 'retina' #set 'png' here when working on notebook
%matplotlib inline

## Variables
Set the global constants used throughout the notebook (currently only the random seed).

In [None]:
# Random seed reused further down for NumPy's global generator and for the train/validation split
SEED = 1234

## Read Data

In [None]:
# Load the whole dataset from train.csv (path is relative to the working directory)
all_data = pd.read_csv("train.csv")

In [None]:
# Number of rows in the loaded dataset
print(len(all_data))

621300


In [None]:
# Preview the first rows to get a feel for the columns and their values
all_data.head()

# Data Exploration

## Missing Values
Let's check how many missing values there are in the data set and how we can deal with them.

In [None]:
# Count the NaNs in every column, largest count first,
# then keep only the columns that actually contain missing entries
all_data_na = all_data.isnull().sum().sort_values(ascending=False)
all_data_na = all_data_na[all_data_na != 0]

# Summary table: index = column name, single column = number of missing entries
Missing_data = all_data_na.to_frame(name='Missing Numbers')
print(Missing_data)

Empty DataFrame
Columns: [Missing Numbers]
Index: []


#### So there are no missing values in the input data

## Distribution of Numerical Variables
We now explore the distribution of numerical variables. We will apply the log1p function to all the skewed numerical variables.

In [None]:
# Every column whose dtype is not "object" is treated as a numerical feature
numeric_feats = all_data.columns[(all_data.dtypes != "object").to_numpy()]

# Sample skewness of each numerical column; NaNs are dropped first purely as a
# safeguard (the missing-value check above found none)
skewed_feats = all_data[numeric_feats].apply(lambda column: skew(column.dropna()))

# One row per variable, in a shape that is convenient for plotting
skewness = pd.DataFrame(
    {
        "Variable": skewed_feats.index,
        "Skewness": skewed_feats.values,
    }
)

In [None]:
# Order the variables from the most to the least positively skewed
skewness = skewness.sort_values('Skewness', ascending=False)

f, ax = plt.subplots(figsize=(8,6))
sns.barplot(x=skewness['Variable'], y=skewness['Skewness'], ax=ax)
# Rotate the tick labels with a numeric angle: recent matplotlib releases raise
# a ValueError for the string '90'
ax.tick_params(axis='x', labelrotation=90)
# Fixed y-range so that this plot and the one drawn after the transformation
# share the same scale (bars of negatively skewed variables fall below it)
ax.set_ylim(0, 25)
ax.set_xlabel('Numerical Variables', fontsize=15)
ax.set_ylabel('Skewness', fontsize=15)
ax.set_title('Skewness before the log1p transformation', fontsize=15);

Let's apply the logarithmic transformation to all the variables with a skewness above a certain threshold (0.75). Then, replot the skewness of attributes. Note that to have a fair comparison the two plots should have the same scale.

In [None]:
# Keep only the clearly right-skewed variables (skewness above 0.75)
skewed_feats = skewed_feats[skewed_feats > 0.75]

# The target column is numeric as well, and a rare 0/1 label is itself highly
# skewed, so it would be selected here. It is a class label, not a feature:
# log1p would turn 1 into ~0.693 and break the stratified cross-validation and
# the class-distribution report further down. Leave it untouched.
skewed_feats = skewed_feats.drop(labels=['aircon_sum_target_next14d'], errors='ignore')

# NOTE: this modifies all_data in place, so re-running this cell on its own
# applies log1p a second time to the same columns.
# NOTE(review): log1p yields NaN for values <= -1 — confirm the selected
# columns are non-negative.
all_data[skewed_feats.index] = np.log1p(all_data[skewed_feats.index])

  
  


In [None]:
# Recompute the skewness of every numerical column now that the log1p
# transformation has been applied (NaNs are dropped per column as a safeguard)
skewed_feats = all_data[numeric_feats].apply(lambda x: skew(x.dropna()))
skewness_new = pd.DataFrame({"Variable":skewed_feats.index, "Skewness":skewed_feats.values})

# Order the variables from the most to the least positively skewed
skewness_new = skewness_new.sort_values('Skewness', ascending=False)

f, ax = plt.subplots(figsize=(8,6))
sns.barplot(x=skewness_new['Variable'], y=skewness_new['Skewness'], ax=ax)
# Rotate the tick labels with a numeric angle: recent matplotlib releases raise
# a ValueError for the string '90'
ax.tick_params(axis='x', labelrotation=90)
# Same y-range as the plot drawn before the transformation, for a fair comparison
ax.set_ylim(0, 25)
ax.set_xlabel('Numerical Variables', fontsize=15)
ax.set_ylabel('Skewness', fontsize=15)
ax.set_title('Skewness after the log1p transformation', fontsize=15);

# Utility Functions
Next we define some utility functions.

In [None]:
def PrintConfusionMatrix(model, true_y, predicted_y, positive=1, negative=-1):
    """Print the confusion matrix of a fitted classifier as a tab-separated table.

    Rows correspond to the true classes and columns to the predicted classes,
    both in the order given by ``model.classes_``; the header row lists the
    class labels.

    Parameters
    ----------
    model : object
        Fitted classifier exposing a ``classes_`` attribute.
    true_y : array-like
        Ground-truth labels.
    predicted_y : array-like
        Labels predicted by ``model``.
    positive, negative : optional
        Not used by this function; kept so that existing calls keep working.
    """
    labels = list(model.classes_)
    # Pin the label order explicitly. Without it the matrix is built from the
    # labels that happen to occur in the inputs, so a class missing from both
    # true_y and predicted_y would shrink the matrix and misalign (or break)
    # the table printed below.
    cm = confusion_matrix(true_y, predicted_y, labels=labels)

    print("\t" + "\t".join(str(label) for label in labels))
    for label, row in zip(labels, cm):
        print(str(label) + "\t", " \t ".join(str(count) for count in row))

def PrintSignificance(stat, c):
    """Report whether a statistical test result is significant.

    Parameters
    ----------
    stat : sequence
        Test result whose second element (``stat[1]``) is the p-value.
    c : float
        Confidence level; the p-value is compared against ``1 - c``.
    """
    p_value = stat[1]
    if p_value < (1 - c):
        template = "The difference is statistically significant (cf %3.2f p-value=%.4f)"
    else:
        template = "The difference is not statistically significant (cf %3.2f p-value=%.4f)"
    print(template % (c, p_value))

# Data, Training, and Validation Sets
Using the data loaded above, we define the input data X and the target column y. Next, we set the random seed, define a training/validation partition, and set up the cross-validation procedure we will use to compare the models.


In [None]:
# Column to predict; every other column of the dataset is used as a model input
target_variable = 'aircon_sum_target_next14d'
input_variables = all_data.columns[all_data.columns != target_variable]

# Feature matrix and label vector
X, y = all_data[input_variables], all_data[target_variable]


In [None]:
# Seed NumPy's global generator for any code that relies on it
np.random.seed(SEED)

# Hold out 25% of the rows for validation. The target is heavily imbalanced,
# so stratify on y to keep the same class proportions in both partitions.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.25, random_state=SEED, shuffle=True, stratify=y
)

# Fix random_state so that the 10 shuffled folds are the same every time this
# splitter is used; otherwise each model would be scored on different folds
# and their cross-validation results would not be directly comparable.
crossvalidation = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)


# Baseline Performance (Majority Voting)
First, let's check the class distribution. As we can see, the dataset is highly imbalanced: 99.4% of the records are labelled as no fault in the following 14 days, and only 0.6% are labelled as fault. Thus, a very simple model classifying every record as no fault would reach 99.4% accuracy (an impressive result in many applications); however, it would be useless for the real goal of this analysis, which is to build a model that identifies almost all faults.

In [None]:
# Share of each class in the target, most frequent class first.
# Computed once and printed for however many classes are present.
class_share = y.value_counts() / y.shape[0]
for label, share in class_share.items():
    print("Class %2d  %.1f%%" % (label, 100 * share))
print()

Class  0  99.4%
Class  1  0.6%



# Model Evaluation
We now evaluate different models using the setup we defined earlier. We will consider some basic methods (linear regression, Naive Bayes, and k-NN) as well as ensemble methods.