<a href="https://colab.research.google.com/github/Pawcio93/House-Prices-Kaggle-/blob/master/Data_Preparation_Library.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# EXTERNAL LIBRARIES

In [0]:
### IMPORT LIBRARIES ###
# MATH
import math
# NUMPY
import numpy as np # linear algebra
# PANDAS
import pandas as pd # data processing
pd.set_option('display.max_rows', 1000)
pd.set_option('display.float_format', lambda x: '{:.3f}'.format(x)) #Limiting floats output to 3 decimal points
# MATPLOTLIB
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
# SEABORN
import seaborn as sns
color = sns.color_palette()
sns.set_style('darkgrid')
# SCIPY
from scipy import stats
from scipy.stats import norm, skew #for some statistics
from scipy.special import boxcox1p
# WARNING
import warnings
def ignore_warn(*args, **kwargs):
    """Accept any arguments and do nothing (no-op stand-in for warnings.warn)."""
    return None


# Rebind warnings.warn to the no-op above so that every warning issued through
# warnings.warn is discarded (the notebook targets sklearn and seaborn noise)
warnings.warn = ignore_warn
# SKLEARN
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error, make_scorer
# XGBOOST
from xgboost import XGBClassifier
from xgboost import XGBRegressor
# LIGHTGBM
from lightgbm import LGBMRegressor
# MODEL HELPERS
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics
# DISPLAY
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from IPython.display import display_html
# POWER PREDICTIVE SCORE
!pip install ppscore
import ppscore as pps
# TABULATE
!pip install tabulate
from tabulate import tabulate
# TENSORBOARD
%load_ext tensorboard
import tensorflow as tf
import datetime
# GRIDSEARCHCV
from sklearn.model_selection import GridSearchCV
# COLAB
from google.colab import files
from google.colab import drive

  import pandas.util.testing as tm




# NOTEBOOK SETTINGS

# TEST DATASET
### HOUSE PRICES DATASET (KAGGLE)

In [0]:
### LOAD THE RAW KAGGLE FILES ###
# Both tables are read from CSV files located in the working directory
dataset_train = pd.read_csv(filepath_or_buffer='train.csv')
dataset_test = pd.read_csv(filepath_or_buffer='test.csv')
# Text file describing the dataset fields; the handle is opened for reading
# and is not closed in this cell
data_description = open(file="data_description.txt", mode="r")

In [0]:
# Work on deep copies so the frames loaded from disk stay untouched
X_train = dataset_train.copy(deep=True)
X_test = dataset_test.copy(deep=True)
# Target values of the training rows and the row identifiers of the test rows
y_train = X_train['SalePrice']
sub_id = X_test['Id']
# Both working frames collected in one list for code that loops over them
X_set = [X_train, X_test]

# CLASSES USED IN NOTEBOOK
### - Processing class
### - Data analysis class
### - Transform class


## PROCESSING CLASS
### - DataProcessing()
 - drop_vars_list() - Create a list of variables to drop, based on the share of missing values and on an overwhelming frequency of a single value in the series

In [0]:
### PROCESSING CLASS ####
class DataProcessing():
  """Drop uninformative columns and fill missing values of a train/test pair.

  All methods work in place on the two DataFrames passed to the constructor,
  so the caller's objects are modified. Columns are always written back with
  ``dataset[column] = ...`` instead of chained ``dataset[column].fillna(...,
  inplace=True)`` / ``dataset[column][i] = ...``; chained writes are not
  guaranteed to reach the parent DataFrame (pandas Copy-on-Write).
  """

  def __init__(self, X_train, X_test):
    self.X_train = X_train # Train set
    self.X_test = X_test # Test set
    self.drop_list = [] # List of irrelevant columns
    self.cleaning_list = [] # List of columns with nan values
    self.datasets = [self.X_train, self.X_test] # list of datasets

  def _report_missing(self, column, stage):
    # Print the number of nan values of `column` in both datasets;
    # stage is 'before' or 'after'
    print('Train ' + stage + ': ', self.X_train[column].isnull().sum())
    print('Test ' + stage + ': ', self.X_test[column].isnull().sum())

  def _fill_with_statistic(self, column, statistic, dependent_column,
                           excluded_values, replace_excluded):
    # Shared implementation of fill_with_median / fill_with_mean.
    # statistic - name of the Series method used to compute the fill value
    excluded = [] if excluded_values is None else list(excluded_values)
    # 0 is the historical "no dependent column" marker, None is accepted too
    has_dependency = not (dependent_column is None or dependent_column == 0)
    self._report_missing(column, 'before')
    for dataset in self.datasets:
      # The statistic is taken from the values known before any filling
      fill_value = getattr(dataset[column], statistic)()
      filled = dataset[column]
      if has_dependency:
        # Row-wise decision: only missing rows whose dependent value is
        # excluded receive replace_excluded
        to_replace = filled.isnull() & dataset[dependent_column].isin(excluded)
        filled = filled.mask(to_replace, replace_excluded)
      dataset[column] = filled.fillna(fill_value)
    self._report_missing(column, 'after')

  def drop_vars_list(self, freq = 0.9, null_factor = 0.6):
    """Collect columns of the train set that carry little information.

    freq - share (0 to 1) of rows; a column whose most common value covers at
           least this share is listed
    null_factor - share (0 to 1) of rows; a column with at least this share of
                  nan values is listed
    Returns a tuple (None, drop_list); the first element is the result of the
    print call and is kept for backward compatibility.
    """
    rows = self.X_train.shape[0]
    if rows > 0:
      # Checking if there is a value representing over (freq) of the column
      for column in self.X_train.columns:
        counts = self.X_train[column].value_counts()
        # value_counts ignores nan, so an all-nan column has no entries
        top_share = counts.iloc[0]/rows if len(counts) > 0 else 0
        if top_share >= freq and column not in self.drop_list:
          self.drop_list.append(column)
      # Checking what percentage are nan value, if over (null_factor) - drop
      for column in self.X_train.columns:
        null_share = self.X_train[column].isnull().sum()/rows
        if null_share >= null_factor and column not in self.drop_list:
          self.drop_list.append(column)
    return print('drop_list CREATED: '), self.drop_list

  def drop(self):
    """Drop the columns of drop_list from both datasets, in place.

    Columns that a dataset does not contain are skipped, so the method can be
    called repeatedly and with columns that exist in only one dataset.
    """
    for dataset in self.datasets:
      present = [column for column in dict.fromkeys(self.drop_list)
                 if column in dataset.columns]
      dataset.drop(present, axis=1, inplace = True)
    return print ("Columns removed")

  def cleaning_vars_list(self):
    """Collect the columns that contain nan values in any of the datasets.

    Returns a tuple (None, cleaning_list); the first element is the result of
    the print call and is kept for backward compatibility.
    """
    for dataset in self.datasets:
      for column in dataset.columns:
        if dataset[column].isnull().any() and column not in self.cleaning_list:
          self.cleaning_list.append(column)
    return print('cleaning_list CREATED: '), self.cleaning_list

  def fill_with_estimate_value(self, cleaned_column, coeff_column):
    """Fill nan values of a numerical column using a second numerical column.

    cleaned_column - variable which we want to clean
    coeff_column - variable used for the estimate (e.g. the most correlated one)
    For each dataset the ratio mean(cleaned_column)/mean(coeff_column) is
    computed and every nan is replaced by coeff_column * ratio, rounded to
    0 decimals.
    """
    self._report_missing(cleaned_column, 'before')
    for dataset in self.datasets:
      coeff_1 = dataset[cleaned_column].mean()/dataset[coeff_column].mean()
      estimates = (dataset[coeff_column]*coeff_1).round(0)
      # fillna aligns on the index, so each nan takes the estimate of its row
      dataset[cleaned_column] = dataset[cleaned_column].fillna(estimates)
    self._report_missing(cleaned_column, 'after')

  def fill_with_most_common_value(self, column):
    """Fill nan values in both datasets with the most common train value.

    Raises ValueError when the train column holds no non-nan value.
    """
    self._report_missing(column, 'before')
    value = self.X_train[column].value_counts().idxmax()
    for dataset in self.datasets:
      dataset[column] = dataset[column].fillna(value)
    self._report_missing(column, 'after')

  def fill_with_median(self, column, dependent_column = 0, excluded_values = None, replace_excluded = 0):
    """Fill nan values with the median of the same column, per dataset.

    column - variable which we want to clean
    dependent_column - variable deciding which rows are exceptions
                       (0 or None - no exceptions)
    excluded_values - list of dependent_column values whose rows should not be
                      filled with the median
    replace_excluded - value used for those rows instead
    Example - column contains the area of a garage and the dependent column
    says that there is no garage, hence the area is 0 and not the median.
    """
    self._fill_with_statistic(column, 'median', dependent_column,
                              excluded_values, replace_excluded)

  def fill_with_mean(self, column, dependent_column = 0, excluded_values = None, replace_excluded = 0):
    """Fill nan values with the mean of the same column, per dataset.

    column - variable which we want to clean
    dependent_column - variable deciding which rows are exceptions
                       (0 or None - no exceptions)
    excluded_values - list of dependent_column values whose rows should not be
                      filled with the mean
    replace_excluded - value used for those rows instead
    """
    self._fill_with_statistic(column, 'mean', dependent_column,
                              excluded_values, replace_excluded)

  def fill_with_strings(self, column, dependent_column, condition_list, value_list, last_value = 'NOT FILLED'):
    """Fill nan values with prepared values chosen by another column.

    column - variable which we want to clean
    dependent_column - variable which we are based on while choosing values
    condition_list - list of values from dependent_column, according to which
                     we change nan values
    value_list - list of values used to fill nan, paired with condition_list
    last_value - value for every nan not covered by condition_list
    Only rows that are nan are touched; existing values are never overwritten.
    """
    self._report_missing(column, 'before')
    for dataset in self.datasets:
      filled = dataset[column]
      pending = filled.isnull() # rows still waiting for a value
      for value, condition in zip(value_list, condition_list):
        matched = pending & (dataset[dependent_column] == condition)
        filled = filled.mask(matched, value)
        # The first matching condition wins
        pending = pending & ~matched
      dataset[column] = filled.mask(pending, last_value)
    self._report_missing(column, 'after')

  def fill_with(self, column, value):
    """Fill nan values of a column with a fixed value in both datasets."""
    self._report_missing(column, 'before')
    for dataset in self.datasets:
      dataset[column] = dataset[column].fillna(value)
    self._report_missing(column, 'after')

## DATA ANALYSIS CLASS

In [0]:
class DataAnalysis():
  """Exploratory plots for the train set (the test set is only stored)."""

  def __init__(self, X_train, X_test):
    self.X_train = X_train # Train set
    self.X_test = X_test # Test set

  @staticmethod
  def _chunked(items, size):
    # Split items into consecutive lists of at most `size` elements
    if size < 1:
      raise ValueError('per_row must be a positive integer')
    items = list(items)
    return [items[start:start + size] for start in range(0, len(items), size)]

  @staticmethod
  def _rotate_xticks(axes, rotation = 45):
    # Rotate the x tick labels of every given axis
    for ax in axes:
      plt.sca(ax)
      plt.xticks(rotation=rotation)

  def multi_scatter(self, x_list, y, per_row = 3):
    """Create scatter plots of y against multiple variables.

    x_list - names of the variables shown on the x axes
    y - name of the variable shown on the y axis
    per_row - number of plots in one row (one figure per row); the last row
              holds the remaining variables when len(x_list) is not a multiple
              of per_row
    Raises ValueError when per_row is smaller than 1.
    """
    sns.set(style='whitegrid', rc={"grid.linewidth": 0.2})
    sns.set_context("paper", font_scale=2)
    for x_vars in self._chunked(x_list, per_row):
      chart = sns.pairplot(data=self.X_train,
                           y_vars=[y],
                           x_vars=x_vars,
                           height = 10)
      # Rotate the labels of every plot in the row, not only of the last one
      self._rotate_xticks(chart.axes.flat)
      plt.show()

  def var_overview(self, var, target):
    """Plot an overview of one train variable against the target.

    Numerical variable: table with the fitted normal mean / standard deviation
    of var and target, scatter plot, histogram with a fitted normal curve and
    a probability plot.
    Object / string variable: scatter plot and boxplot.
    Any other dtype: only a message with the dtypes is printed.
    """
    # Checking var data type
    numerical = ['int64', 'float64', 'int32', 'float32', 'int16', 'float16']
    other = ['object', 'str']
    var_dtype = self.X_train[var].dtypes
    if var_dtype in numerical:
      # DATA TABLE
      (mu_v, sigma_v) = norm.fit(self.X_train[var])
      (mu_t, sigma_t) = norm.fit(self.X_train[target])
      headers = ['DATA', 'MEAN', 'STANDARD DEVIATION']
      table_data = [('Variable', mu_v, sigma_v),
                    ('Target', mu_t, sigma_t)]
      print(tabulate(table_data, headers=headers, tablefmt='grid', numalign='center'))
      # PLOTS
      fig, axs = plt.subplots(2, figsize = (10,20))
      # Scatter plot
      axs[0].scatter(self.X_train[var], self.X_train[target])
      axs[0].set_title('Scatter')
      axs[0].set_ylabel(target)
      axs[0].set_xlabel(var)
      # Histogram
      # NOTE(review): seaborn documents distplot as deprecated - confirm the
      # installed seaborn version still provides it
      sns.distplot(self.X_train[var], ax=axs[1], fit=norm)
      axs[1].set_title('Histogram')
      axs[1].set_xlabel(var)
      self._rotate_xticks(fig.axes)
      # Probability plot, drawn on a separate figure
      plt.figure(figsize = (10.30,5))
      stats.probplot(self.X_train[var], plot=plt)
    elif var_dtype in other:
      # PLOTS
      fig, axs = plt.subplots(2, figsize = (10,20))
      # Scatter plot
      axs[0].scatter(self.X_train[var], self.X_train[target])
      axs[0].set_title('Scatter')
      axs[0].set_ylabel(target)
      axs[0].set_xlabel(var)
      # Boxplot
      sns.boxplot(x=var, y=target, data=self.X_train, ax=axs[1])
      axs[1].set_title('Boxplot')
      axs[1].set_ylabel(target)
      axs[1].set_xlabel(var)
      self._rotate_xticks(fig.axes)
    else:
      # Nothing can be plotted, report the dtypes instead of silently returning
      print('var_overview: unsupported dtype {} of {} (target dtype {})'.format(
          var_dtype, var, self.X_train[target].dtypes))

## TRANSFORM CLASS

In [0]:
class TransformData():
  """Detect skewed numerical variables, reduce their skew and scale them.

  Every decision (which variables are skewed, which transformation is used,
  the scaling statistics) is derived from the train set and then applied to
  both datasets, so train and test are always transformed the same way.
  The DataFrames passed to the constructor are modified in place.
  """

  # dtype names treated as numerical
  _NUMERICAL_DTYPES = ['int64', 'float64', 'int32', 'float32', 'int16', 'float16']

  def __init__(self, X_train, X_test):
    self.X_train = X_train # Train set
    self.X_test = X_test # Test set
    self.datasets = [self.X_train, self.X_test] # list of datasets
    self.skewed_vars = [] # skew of every numerical train variable
    self.numerical_columns = []
    self.best_results = [] # index of the chosen transformation per skewed variable
    # Empty until normality_test is called, so skew_fix is safe to call first
    self.skewness = pd.DataFrame({'Skew': pd.Series(dtype=float)})

  def normality_test(self, target, skew_factor = 0.75):
    """Find numerical variables and those of them that are skewed.

    target - dependent variable, excluded from the analysis
    skew_factor - a variable is skewed when the absolute value of its train
                  skew is above this value
    Fills numerical_columns (columns numerical in any dataset), skewed_vars
    (train skew per numerical column) and skewness (DataFrame with a 'Skew'
    column holding only the skewed variables).
    """
    numerical = self._NUMERICAL_DTYPES
    # Creating list of columns without the dependent variable
    column_list = [column for column in self.X_train.columns if column != target]
    # Create list of numerical vars
    for ds in self.datasets:
      for column in column_list:
        if column not in ds.columns or column in self.numerical_columns:
          continue
        if ds[column].dtypes in numerical:
          self.numerical_columns.append(column)
    # Check the skew of numerical variables on the train set only; computing
    # it per dataset would let the test set overwrite the train result
    train_numeric = [column for column in self.numerical_columns
                     if self.X_train[column].dtypes in numerical]
    self.skewed_vars = pd.Series(
        {column: skew(self.X_train[column]) for column in train_numeric},
        dtype=float)
    # Create dataframe of variables with their skew values; a nan skew never
    # passes the comparison, so such variables are left out
    self.skewness = pd.DataFrame({'Skew': self.skewed_vars})
    self.skewness = self.skewness[self.skewness['Skew'].abs() > skew_factor]
    print("Number of skewed variables {}".format(self.skewness.shape[0]))
    print(self.numerical_columns)

  def _transform_score(self, column, transform):
    # Absolute train skew after `transform`; inf when the transformation
    # yields non-finite values for existing data of any dataset or when the
    # skew cannot be computed
    with np.errstate(divide='ignore', invalid='ignore'):
      for ds in self.datasets:
        known = ds[column].dropna()
        if not np.isfinite(np.asarray(transform(known), dtype=float)).all():
          return np.inf
      transformed = np.asarray(transform(self.X_train[column]), dtype=float)
    score = abs(pd.Series(transformed).skew())
    return score if np.isfinite(score) else np.inf

  def skew_fix(self, lmbda = 0.5):
    """Transform every skewed variable with the best of three methods.

    lmbda - lambda of the boxcox1p transformation
    For each variable listed in skewness the candidates boxcox1p (0), natural
    logarithm (1) and square root (2) are compared by the absolute skew of the
    transformed train values, and the best one is applied to both datasets.
    best_results holds the chosen index per variable, or None when no
    candidate gives finite values (the variable is then left untouched).
    """
    transforms = (lambda values: boxcox1p(values, lmbda), np.log, np.sqrt)
    columns = list(self.skewness.index)
    # Start from scratch so repeated calls do not reuse stale choices
    self.best_results = []
    for column in columns:
      scores = [self._transform_score(column, transform)
                for transform in transforms]
      best = int(np.argmin(scores))
      self.best_results.append(best if np.isfinite(scores[best]) else None)
    for ds in self.datasets:
      for best, column in zip(self.best_results, columns):
        if best is None:
          continue
        ds[column] = transforms[best](ds[column])

  def data_skalling(self):
    """Scale numerical_columns of both datasets with one RobustScaler.

    The scaler is fitted on the train set and the same statistics are used to
    transform the test set. Does nothing when numerical_columns is empty.
    """
    if not self.numerical_columns:
      return
    scaler = RobustScaler()
    scaler.fit(self.X_train[self.numerical_columns])
    for ds in self.datasets:
      ds[self.numerical_columns] = scaler.transform(ds[self.numerical_columns])
