In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

from collections import Counter

%matplotlib inline

In [2]:
df_raw = pd.read_csv('/Users/kriz/Downloads/creditcard.csv')
df=df_raw.drop(['Time'], axis=1)


In [3]:
input_feature_list = ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11','V12', 'V13', 'V14', 'V15',
                'V16', 'V17', 'V18', 'V19', 'V20', 'V21','V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount']

In [6]:
def Std_method (df,n,features):
    """
    Takes a dataframe df of features and returns an index list corresponding to the observations 
    containing more than n outliers according to the standard deviation method.
    """
    #
    outlier_indices = []
    
    #loop through columns
    
    for i in features:
        
        # calculate the mean and standard deviation of the data frame
        data_mean = df[i].mean()
        data_std = df[i].std()
        
        # calculate the threshold value
        upper_threshold = data_mean + 3* data_std 
        lower_threshold = data_mean - 3* data_std
        
        #anything above upper threshold or  below  lower threshold is considered as anomaly
        
        # Determining a list of indices of outliers for feature column        
        outlier_list_column= df[(df[i] < lower_threshold) | (df[i] > upper_threshold)].index
        
        # appending the found outlier indices for column to the list of outlier indices 
        outlier_indices.extend(outlier_list_column)
        
    # selecting observations containing more than x outliers
    outlier_indices = Counter(outlier_indices)        
    multiple_outliers = list( k for k, v in outlier_indices.items() if v > n )
    
    # Calculate the number of records below and above lower and above bound value respectively
    df1 = df[df[i] > upper_threshold]
    df2 = df[df[i] < lower_threshold]
    print('Total number of outliers is:', df1.shape[0]+ df2.shape[0])
    
    return multiple_outliers   

# detecting outliers
Outliers_Std_Dev = Std_method(df,1,input_feature_list)

# dropping outliers
df_out2 = df.drop(Outliers_Std_Dev, axis = 0).reset_index(drop=True)
        

Total number of outliers is: 4076


In [7]:
def z_score_method (df,n,features):
    """
    Takes a dataframe df of features and returns an index list corresponding to the observations 
    containing more than n outliers according to the z-score method.
    """
    outlier_list = []
    
    for i in features:
        # calculate the mean and standard deviation of the data frame
        data_mean = df[i].mean()
        data_std = df[i].std()
        threshold = 3
        
        z_score = abs( (df[i] - data_mean)/data_std )
        
        # Determining a list of indices of outliers for feature column        
        outlier_list_column =  df[z_score > threshold].index
        
        # appending the found outlier indices for column to the list of outlier indices 
        outlier_list.extend(outlier_list_column)
        
    # selecting observations containing more than x outliers
    outlier_list = Counter(outlier_list)        
    multiple_outliers = list( k for k, v in outlier_list.items() if v > n )
    
    # Calculate the number of outlier records
    df1 = df[z_score > threshold]
    print('Total number of outliers is:', df1.shape[0])
    
    return multiple_outliers

# detecting outliers
Outliers_z_score = z_score_method(df,1,input_feature_list)

# dropping outliers
df_out3 = df.drop(Outliers_z_score, axis = 0).reset_index(drop=True)

Total number of outliers is: 4076


In [8]:
from scipy.stats import median_abs_deviation

def mod_z_score_method (df,n,features):
    """
    Takes a dataframe df of features and returns an index list corresponding to the observations 
    containing more than n outliers according to the z-score modified method.
    """
    outlier_list = []
    
    for i in features:
        
        # calculate the mean and standard deviation of the data frame
        data_mean = df[i].mean()
        data_std  = df[i].std()
        
        #set a threshold value 
        threshold = 4
        MAD       = median_abs_deviation  #median of (input data - deviation)
        
        mod_z_score = abs(0.6745*(df[i] - data_mean)/MAD(df[i]) )
                
        # Determining a list of indices of outliers for feature column        
        outlier_list_column =  df[mod_z_score >threshold].index
        
        # appending the found outlier indices for column to the list of outlier indices 
        outlier_list.extend(outlier_list_column)
        
    # selecting observations containing more than x outliers
    outlier_list = Counter(outlier_list)        
    multiple_outliers = list( k for k, v in outlier_list.items() if v > n )
    
    
     # Calculate the number of outlier records
    df1 = df[mod_z_score >threshold]
    print('Total number of outliers is:', df1.shape[0])
    
    return multiple_outliers
    
# detecting outliers
Outliers_z_score = mod_z_score_method(df,1,input_feature_list)

# dropping outliers
df_out4 = df.drop(Outliers_z_score, axis = 0).reset_index(drop=True)
    

Total number of outliers is: 27828


In [9]:
def IQR_method (df,n,features):
    """
    Takes a dataframe and returns an index list corresponding to the observations 
    containing more than n outliers according to the  IQR method.
    """
    outlier_list = []
    
    for i in features:
                
        # 1st quartile (25%)
        
        Q1 = df[i].quantile(0.25)
        
        # 3rd quartile (75%)
       
        Q3 = df[i].quantile(0.75)
        
        # Interquartile range (IQR)
        IQR = Q3 - Q1
        
        # outlier step
        upper_threshold = Q3 + 1.5 * IQR
        lower_threshold = Q1 - 1.5 * IQR
        
        # Determining a list of indices of outliers
        outlier_list_column = df[(df[i] <lower_threshold) | (df[i] >upper_threshold)].index
        
        # appending the list of outliers 
        outlier_list.extend(outlier_list_column)
        
    # selecting observations containing more than x outliers
    outlier_list = Counter(outlier_list)        
    multiple_outliers = list( k for k, v in outlier_list.items() if v > n )
    
    # Calculate the number of records below and above lower and above bound value respectively
    df1 = df[df[i] < lower_threshold]
    df2 = df[df[i] > upper_threshold]
    
    print('Total number of outliers is:', df1.shape[0]+df2.shape[0])
    
    return multiple_outliers

# detecting outliers
Outliers_IQR = IQR_method(df,1,input_feature_list)

# dropping outliers
df_out = df.drop(Outliers_IQR, axis = 0).reset_index(drop=True)

Total number of outliers is: 31904
