In [None]:
# Creating a function to return missing values in a dataframe

def Percent_Missing(dataframe):
    """Print and return a per-column summary of missing values.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        The data to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``dataframe`` (indexed by column name) with:
        ``column_name``, ``missing_count`` (number of null cells) and
        ``percent_missing`` (null cells as a percentage of the row count).
        For a dataframe with no rows the percentage is NaN (0 / 0).
    """
    # Count nulls once and reuse the result for the percentage.
    missing_count = dataframe.isnull().sum()
    percent_missing = missing_count * 100 / len(dataframe)
    missing_value_df = pd.DataFrame({'column_name': dataframe.columns,
                                     'missing_count': missing_count,
                                     'percent_missing': percent_missing})
    # Keep the original console output, but also hand the table back so the
    # caller can sort, filter or store it.
    print(missing_value_df)
    return missing_value_df

In [None]:
# Function to change datatype of the column as desired

def Change_Data_Type(desired_type, dataframe, columns_list=None):
    """Cast the given columns of ``dataframe`` to ``desired_type`` in place.

    Parameters
    ----------
    desired_type : type or str
        Anything accepted by ``Series.astype`` (e.g. ``float``, ``'category'``).
    dataframe : pandas.DataFrame
        The dataframe to modify; its columns are replaced in place.
    columns_list : iterable of column labels, str, or None
        Columns to convert. A single string is treated as one column name
        rather than being iterated character by character. ``None`` (the
        default) or an empty iterable leaves the dataframe untouched.

    Returns
    -------
    None
    """
    if columns_list is None:
        return
    # A bare string is one column name, not a sequence of one-letter names.
    if isinstance(columns_list, str):
        columns_list = [columns_list]
    for column in columns_list:
        dataframe[column] = dataframe[column].astype(desired_type)

In [None]:
# Creating a class that bundles the Exploratory Data Analysis steps which are repeated every time

class EDA():
    """Collection of routine Exploratory Data Analysis helpers.

    The instance stores one dataframe (``self.dataframe``) that
    ``Percent_Missing``, ``VisualizeDistributions`` and ``Cardinality`` read.
    ``RemoveOutliers`` and ``StandardScaler`` work on the dataframe passed to
    them as an argument instead.
    """

    def __init__(self, dataframe):
        self.dataframe = dataframe

    def Percent_Missing(self):
        """Print and return the count and percentage of nulls per column.

        Returns a dataframe with the columns ``column_name``,
        ``missing_count`` and ``percent_missing``, one row per column of
        ``self.dataframe``.
        """
        # Count nulls once and reuse the result for the percentage.
        missing_count = self.dataframe.isnull().sum()
        percent_missing = missing_count * 100 / len(self.dataframe)
        missing_value_df = pd.DataFrame({'column_name': self.dataframe.columns,
                                         'missing_count': missing_count,
                                         'percent_missing': percent_missing})
        print(missing_value_df)
        return missing_value_df

    def VisualizeDistributions(self):
        """Plot the distribution of every column of ``self.dataframe``.

        Columns whose dtype is not ``object`` get a box plot followed by a KDE
        plot; ``object`` columns get a count plot. A column that cannot be
        plotted is reported on stdout and skipped.

        NOTE(review): relies on module-level ``plt`` and ``sns``, presumably
        ``matplotlib.pyplot`` and ``seaborn`` - confirm they are imported.
        """
        for column in list(self.dataframe.columns):
            try:
                # Non-object dtype: treat as numeric and show box + KDE plots.
                if self.dataframe[column].dtype != 'O':
                    plt.title(f'Box Plot of {column} Variable')
                    sns.boxplot(x=f'{column}', data=self.dataframe)
                    plt.show()
                    plt.title(f'KDE Plot of {column} Variable')
                    sns.kdeplot(x=f'{column}', data=self.dataframe)
                    plt.show()

                # Object dtype: treat as categorical and show the frequency
                # of each value.
                else:
                    plt.title(f'Count Plot of {column} Variable')
                    sns.countplot(x=f'{column}', data=self.dataframe)
                    plt.show()
            # Catch Exception rather than everything so that Ctrl-C
            # (KeyboardInterrupt) can still stop a long plotting loop.
            except Exception:
                print(f'**Value error at variable {column}**')

    def RemoveOutliers(self, dataframe):
        '''Any datapoint above or below 1.5 times the Inter-Quartile range of a numeric variable will be marked as null value

        The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, computed per
        column from its non-missing values. Only numeric, non-boolean columns
        are processed; every other column is left as it is.

        ``dataframe`` is modified in place and also returned, so the marked
        values can be dropped later on.
        '''
        for column in dataframe.columns:
            series = dataframe[column]
            # Quartiles only make sense for real numeric data; booleans,
            # datetimes, categoricals and objects are skipped.
            if not pd.api.types.is_numeric_dtype(series) or pd.api.types.is_bool_dtype(series):
                continue

            # np.percentile returns NaN as soon as one NaN is present, which
            # would disable the outlier check, so use the observed values only.
            observed = series.dropna()
            if observed.empty:
                continue

            q75, q25 = np.percentile(observed, [75, 25])
            intr_qr = q75 - q25
            upper_fence = q75 + (1.5 * intr_qr)
            lower_fence = q25 - (1.5 * intr_qr)

            outliers = (series < lower_fence) | (series > upper_fence)
            if outliers.any():
                # mask() upcasts integer columns to float so they can hold NaN.
                dataframe[column] = series.mask(outliers)
        return dataframe

    def Cardinality(self):
        """Print the number of unique values of every ``object`` column."""
        for column in list(self.dataframe.columns):
            try:
                # Only object (categorical/text) columns are reported.
                if self.dataframe[column].dtype == 'O':
                    print(f'Cardinality of {column} variable >> {self.dataframe[column].nunique()}')

            # Catch Exception rather than everything so that Ctrl-C
            # (KeyboardInterrupt) is not swallowed.
            except Exception:
                print(f'**Value error at variable {column}**')

    def StandardScaler(self, df, target_variable):
        """Scale the feature columns of ``df`` with sklearn's ``MinMaxScaler``.

        Despite its name this method performs min-max scaling, not
        standardisation. ``target_variable`` (a column label or list of
        labels) is dropped first and is not part of the result.

        Returns the scaled features as returned by the scaler's
        ``fit_transform`` (an array, not a dataframe).
        """
        # Imported here so the rest of the class works without scikit-learn.
        from sklearn.preprocessing import MinMaxScaler

        features = df.drop(columns=target_variable)
        feature_scaled_variables_array = MinMaxScaler().fit_transform(features)
        return feature_scaled_variables_array

In [None]:
# Creating a function to reduce the cardinality of a variable by defining a threshold
# The function keeps the most frequent values of the variable until their cumulative share reaches the threshold
# The remaining values are grouped under one single value, 'All Others'
# This is done to reduce the cardinality of the features and make the ML exercise more efficient and understandable


from collections import Counter

def cumulatively_categorise(column, threshold=0.75, return_categories_list=True):
    """Reduce the cardinality of ``column`` by grouping its rare values.

    The most frequent values are kept, in descending order of frequency,
    until their cumulative count reaches ``threshold * len(column)``. Every
    other value is replaced by the label ``'All Others'``.

    Parameters
    ----------
    column : pandas.Series
        The categorical column to transform (values must be hashable).
    threshold : float
        Share of the rows that the kept categories must cover.
    return_categories_list : bool
        When true, also return the list of resulting categories.

    Returns
    -------
    pandas.Series, or (pandas.Series, list)
        The transformed column and, if requested, the kept categories
        followed by ``'All Others'``.
    """
    # One label for both the replacement value and the returned list, so the
    # list always describes what the transformed column can contain.
    other_label = 'All Others'

    # Number of rows the kept categories have to cover.
    threshold_value = int(threshold * len(column))

    categories_list = []
    running_total = 0
    # most_common() yields (value, frequency) pairs, most frequent first.
    for category, frequency in Counter(column).most_common():
        running_total += frequency
        categories_list.append(category)
        if running_total >= threshold_value:
            break

    # A set gives constant-time membership checks for every row.
    kept = set(categories_list)
    new_column = column.apply(lambda x: x if x in kept else other_label)

    categories_list.append(other_label)

    if return_categories_list:
        return new_column, categories_list
    return new_column

# Code Credits: https://towardsdatascience.com/dealing-with-features-that-have-high-cardinality-1c9212d7ff1b