In [None]:
# Define the Plotter and DataFrameTransform classes: Plotter visualises missing values, DataFrameTransform inspects and cleans the DataFrame.

import matplotlib.pyplot as plt

class Plotter:
    """Plotting helpers for exploratory analysis of a DataFrame."""

    def visualise_nulls(self, data_frame):
        """Draw a horizontal bar chart of the null count of every column that has nulls.

        Parameters
        ----------
        data_frame : pandas.DataFrame
            Frame to inspect; it is not modified.

        Returns
        -------
        None

        Columns without missing values are left out of the chart. When no
        column has missing values there is nothing to draw, so a short message
        is printed and no figure is created.
        """
        missing_vals = data_frame.isnull().sum()
        missing_vals = missing_vals[missing_vals > 0].sort_values()

        # Plotting an empty Series either raises or yields a blank chart
        # (depending on the pandas version), so report the result as text instead.
        if missing_vals.empty:
            print('No missing values to visualise.')
            return

        # Explicit axes keep the bar chart and its labels on the same figure.
        _, ax = plt.subplots(figsize=(10, 6))
        missing_vals.plot(kind='barh', ax=ax)
        ax.set_title('Missing Values per Column')
        ax.set_xlabel('Number of Missing Values')
        ax.set_ylabel('Columns')
        plt.show()



class DataFrameTransform:
    """Null-handling and skewness helpers that operate on a single DataFrame.

    The frame passed to the constructor is stored by reference and is modified
    in place by ``drop_columns`` and ``impute_missing_values``, so the caller's
    own variable sees those changes as well.
    """

    def __init__(self, data_frame):
        """Store ``data_frame`` (no copy is taken)."""
        self.data_frame = data_frame

    def check_nulls(self):
        """Return a Series mapping each column name to its number of null values."""
        return self.data_frame.isnull().sum()

    def drop_columns(self, columns_to_drop):
        """Remove the columns named in ``columns_to_drop`` from the stored frame, in place."""
        self.data_frame.drop(columns=columns_to_drop, inplace=True)

    def impute_missing_values(self, columns, strategy='median'):
        """Fill nulls in the given columns of the stored frame.

        Parameters
        ----------
        columns : iterable of str
            Names of the columns to process.
        strategy : {'median', 'mean'}, default 'median'
            Statistic used as the fill value for non-object columns.

        Raises
        ------
        ValueError
            If ``strategy`` is neither ``'median'`` nor ``'mean'``.
        """
        if strategy not in ('median', 'mean'):
            raise ValueError(
                f"Unsupported strategy {strategy!r}; expected 'median' or 'mean'."
            )

        for col in columns:
            series = self.data_frame[col]

            if series.dtype == 'object':
                # NOTE(review): object columns are parsed as datetimes rather than
                # imputed; values that cannot be parsed become NaT and existing
                # nulls stay null. Presumably intended for date-like text columns
                # only - confirm before passing other string columns here.
                self.data_frame[col] = pd.to_datetime(series, errors='coerce')
                continue

            fill_value = series.median() if strategy == 'median' else series.mean()
            # Assign the filled column back instead of calling
            # fillna(..., inplace=True) on the selected column: the chained
            # in-place form is deprecated and has no effect on the frame when
            # pandas Copy-on-Write is enabled.
            self.data_frame[col] = series.fillna(fill_value)

    def identify_skewed_columns(self, threshold=0.5):
        """Return the float64/int64 column names whose absolute skew exceeds ``threshold``."""
        numeric_columns = self.data_frame.select_dtypes(include=['float64', 'int64']).columns
        return [
            col for col in numeric_columns
            if abs(self.data_frame[col].skew()) > threshold
        ]

In [None]:

# Pull the loan payments table from the RDS database.
# NOTE(review): absolute, machine-specific credentials path - consider making it configurable.
CREDENTIALS_FILE = '/Users/ptm/Desktop/customer loans/credentials.yaml'
connector = RDSDatabaseConnector(CREDENTIALS_FILE)
connect_data = connector.extract_data_from_table("loan_payments")

# DataFrameTransform edits the frame it is given in place, so `connect_data`
# reflects every drop / imputation performed through `df_transformer` below.
plotter = Plotter()
df_transformer = DataFrameTransform(connect_data)

# Null count per column before any cleaning.
null_counts_before = df_transformer.check_nulls()
print("Null Counts Before Dropping Columns:")
print(null_counts_before)

# Drop every column that holds more than 1000 nulls.
too_sparse = null_counts_before > 1000
columns_to_drop = null_counts_before.index[too_sparse].tolist()
df_transformer.drop_columns(columns_to_drop)

# Impute the columns that still contain at least one null.
has_nulls = connect_data.isnull().any()
columns_to_impute = has_nulls[has_nulls].index.tolist()
df_transformer.impute_missing_values(columns_to_impute, strategy='median')

# Re-count the nulls after cleaning and chart whatever is left.
null_counts_after = df_transformer.check_nulls()
print("\nNull Counts After Imputation:")
print(null_counts_after)
plotter.visualise_nulls(connect_data)




In [None]:
class DataFrameTransform:
    """Skewness detection and log-transform helpers for a DataFrame.

    The frame is held by reference; ``reduce_skewness`` and
    ``apply_transformations`` overwrite its columns in place.
    """

    def __init__(self, data_frame):
        """Keep a reference to ``data_frame`` (no copy is taken)."""
        self.data_frame = data_frame

    def identify_skewed_columns(self, threshold=0.5):
        """List the float64/int64 columns whose absolute skew is above ``threshold``."""
        candidates = self.data_frame.select_dtypes(include=['float64', 'int64'])
        return [
            name
            for name in candidates.columns
            if abs(self.data_frame[name].skew()) > threshold
        ]

    def reduce_skewness(self, columns):
        """Overwrite each listed column with ``log1p`` of itself.

        Returns a dict mapping every processed column name to a copy of the
        transformed column.
        """
        logged = {}
        for name in columns:
            # The frame is updated first; the returned entry is a snapshot of it.
            self.data_frame[name] = np.log1p(self.data_frame[name])
            logged[name] = self.data_frame[name].copy()
        return logged

    def apply_transformations(self, transformations):
        """Assign every column held in ``transformations`` (name -> Series) to the frame."""
        for name in transformations:
            self.data_frame[name] = transformations[name]



# Wrap `connect_data`; the transformer keeps a reference to it, so the log
# transform below overwrites the skewed columns of `connect_data` itself.
transformer = DataFrameTransform(connect_data)

# applying transformations to reduce skewness
skewed_columns = transformer.identify_skewed_columns(threshold=0.5)
# reduce_skewness writes the log1p values into the frame and returns copies of them.
transformations = transformer.reduce_skewness(skewed_columns)
# Re-assigns those same copies to the frame, so this call leaves the data unchanged.
transformer.apply_transformations(transformations)



In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np  # expm1 is needed below; the skew transform already relies on numpy as `np`

# reduce_skewness() overwrote the skewed columns with their log1p values, so
# transformer.data_frame no longer holds the pre-transform data. Undo the
# transform with expm1 (the inverse of log1p) to recover the "before" values;
# plotting transformer.data_frame here would show the transformed data twice.
fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: distributions before the log transform.
for col in skewed_columns:
    original_values = np.expm1(transformations[col])
    sns.histplot(original_values, kde=True, label=col, ax=ax_before)
ax_before.set_title('Skewness Before Transformation')
ax_before.legend()

# Right panel: distributions after the log transform.
for col in skewed_columns:
    sns.histplot(transformations[col], kde=True, label=col, ax=ax_after)
ax_after.set_title('Skewness After Transformation')
ax_after.legend()

fig.tight_layout()
plt.show()


In [None]:
class Plotter:
    """Seaborn-based overview plots for a DataFrame."""

    def visualise_data(self, data):
        """Draw a seaborn pair plot of ``data`` with KDE curves on the diagonal."""
        sns.pairplot(data, diag_kind='kde')

    def plot_boxplots(self, data):
        """Show one box plot per float64/int64 column, all on a single row."""
        numeric = data.select_dtypes(include=['float64', 'int64'])
        names = list(numeric.columns)
        total = len(names)

        plt.figure(figsize=(12, 6))
        for position in range(total):
            name = names[position]
            # Subplot slots are numbered from 1.
            plt.subplot(1, total, position + 1)
            sns.boxplot(y=data[name])
            plt.title(name)
        plt.tight_layout()
        plt.show()


# Rebind `plotter` to an instance of the Plotter class defined in this cell,
# then draw a box plot for each float64/int64 column of `connect_data`.
plotter = Plotter()
plotter.plot_boxplots(connect_data) 



class DataFrameTransform:
    """Z-score based outlier detection and removal for a DataFrame."""

    def __init__(self, data_frame):
        """Keep a reference to ``data_frame`` (no copy is taken)."""
        self.data_frame = data_frame

    def identify_outliers(self, columns, threshold=2.0):
        """Collect, per column, the rows whose absolute z-score exceeds ``threshold``.

        Returns a dict mapping each column name to a DataFrame containing the
        full rows flagged for that column.
        """
        flagged = {}
        for name in columns:
            values = self.data_frame[name]
            z_scores = (values - values.mean()) / values.std()
            is_extreme = z_scores.abs() > threshold
            flagged[name] = self.data_frame[is_extreme]
        return flagged

    def remove_outliers(self, outliers):
        """Drop every row whose index label appears in any of the ``outliers`` frames.

        ``self.data_frame`` is rebound to the filtered result; the frame object
        it previously referred to is not modified.
        """
        for flagged_rows in outliers.values():
            is_outlier = self.data_frame.index.isin(flagged_rows.index)
            self.data_frame = self.data_frame[~is_outlier]


# Pair plot of `connect_data` (KDE on the diagonal) via Plotter.visualise_data.
plotter = Plotter()  
plotter.visualise_data(connect_data)  


In [None]:
# The single-row box-plot layout above was cluttered, so Plotter is redefined here to arrange the box plots in a grid of three per row.

class Plotter:
    """Seaborn-based overview plots for a DataFrame, with box plots laid out in a grid."""

    def visualise_data(self, data):
        """Draw a seaborn pair plot of ``data`` with KDE curves on the diagonal."""
        sns.pairplot(data, diag_kind='kde')

    def plot_boxplots(self, data, plots_per_row=3):
        """Show one box plot per float64/int64 column, arranged in a grid.

        Parameters
        ----------
        data : pandas.DataFrame
            Frame whose float64/int64 columns are plotted; it is not modified.
        plots_per_row : int, default 3
            Number of box plots placed on each row of the grid.

        Raises
        ------
        ValueError
            If ``plots_per_row`` is smaller than 1.
        """
        if plots_per_row < 1:
            raise ValueError('plots_per_row must be at least 1')

        numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
        num_plots = len(numeric_columns)
        if num_plots == 0:
            # Zero plots would mean zero rows and therefore a zero-height
            # figure, so report in text instead of building one.
            print('No numeric columns to plot.')
            return

        # Ceiling division: a partially filled last row still needs its own row.
        num_rows = -(-num_plots // plots_per_row)
        plt.figure(figsize=(15, 5 * num_rows))

        for i, col in enumerate(numeric_columns, 1):
            plt.subplot(num_rows, plots_per_row, i)
            sns.boxplot(y=data[col])
            plt.title(col, fontsize=12)
            plt.xlabel('')

        plt.tight_layout()
        plt.show()

# Produces much clearer box plots that are easier to read and use for the project.
plotter = Plotter()
plotter.plot_boxplots(connect_data)  

In [None]:
# Check if the class is properly initialized and outlier_columns match DataFrame columns
print(transformer)
print(connect_data.columns)
print(outlier_columns)

transformer = DataFrameTransform(connect_data)

# Verifying that the method calls 
outliers = transformer.identify_outliers(outlier_columns)
print(outliers)

transformer.remove_outliers(outliers)


In [None]:

outlier_columns = ['loan_amount', 'funded_amount_inv', 'instalment', 'annual_inc', 'dti', 'delinq_2yrs', 'inq_last_6mths', 'open_accounts', 'total_accounts', 'out_prncp', 'out_prncp_inv', 'total_payment', 'total_payment_inv', 'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee', 'recoveries', 'collection_recovery_fee', 'last_payment_amount', 'collections_12_mths_ex_med']

# Identifing outliers
outliers = transformer.identify_outliers(outlier_columns)

# Removing outliers
transformer.remove_outliers(outliers)


In [None]:
# Visualise the data after outlier removal (pair plot of the filtered frame).

plotter.visualise_data(transformer.data_frame) 

In [None]:
# Correlation matrix over the float64/int64 columns of the DataFrame.
# NOTE(review): this reads `connect_data`, not `transformer.data_frame`, so the
# rows dropped by remove_outliers are still included here - confirm that is intended.
numeric_columns = connect_data.select_dtypes(include=['float64', 'int64'])
correlation_matrix = numeric_columns.corr()

# Render the matrix as an annotated heatmap, as suggested.
plt.figure(figsize=(10, 8))
heatmap_axes = sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
)
heatmap_axes.set_title('Correlation Matrix Heatmap')
plt.show()



In [None]:
# Correlation cut-off; 0.7 was chosen after looking up thresholds commonly used for similar data sets.
threshold = 0.7

# Flag a column when its absolute correlation with any column that comes
# before it in the matrix is above the threshold (only the lower triangle is
# scanned, so of each correlated pair the later column is the one reported).
exceeds_threshold = correlation_matrix.abs() > threshold
highly_correlated_cols = {
    name
    for position, name in enumerate(correlation_matrix.columns)
    if exceeds_threshold.iloc[position, :position].any()
}

# Printing highly correlated columns
print("Highly correlated columns:", highly_correlated_cols)

In [None]:
# Drop 'out_prncp' and 'total_payment_inv' from the data set; the result is a
# new frame and `connect_data` is left unchanged.
# NOTE(review): an earlier note here named 'out_prncp_inv' - confirm that the
# two columns listed below are the ones meant to be removed.
# `columns=` already selects the column axis, so no `axis` argument is needed.
connect_data_filtered = connect_data.drop(columns=['out_prncp', 'total_payment_inv'])

In [None]:
# Check that the columns have been dropped by listing the remaining column names.

print(connect_data_filtered.columns)