In [1]:
import numpy as np     
import pandas as pd 
import re 
import gzip  
import csv
import random   
import matplotlib.pyplot as plt     
from pandas.plotting import scatter_matrix
import seaborn as sns   

from sklearn.utils import resample

# Functions

## Returns the Distribution by Issue Year and Grade of the Majority (Good Loan) Class

In [2]:
def percentage_good_loans(dataframe):
    """Return the share of good loans that falls in each (issue year, grade) cell.

    A loan is "good" when its ``loan_status_n`` equals 1.  For every
    combination of the years found in ``issued_yr`` and the grades found in
    ``grade`` (both taken from the whole frame and sorted) the value is::

        (# good loans in that year/grade with a non-null ``term``)
        / (total # good loans)

    rounded to 5 decimal places.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns ``issued_yr``, ``grade`` (string labels),
        ``loan_status_n`` and ``term``.

    Returns
    -------
    dict
        Maps ``"<year>_<grade>"`` to the rounded proportion.  Cells without
        good loans map to 0; if the frame holds no good loans at all, every
        value is ``0.0`` (instead of NaN from a division by zero).
    """
    years = sorted(dataframe['issued_yr'].unique())
    grades = sorted(dataframe['grade'].unique())

    # Filter the class once.  Every later mask is built from the frame it is
    # applied to, so pandas never has to re-align a full-length boolean
    # Series against an already filtered frame (which warns and depends on a
    # unique index).
    good = dataframe[dataframe['loan_status_n'] == 1]
    total = len(good)  # loop-invariant denominator

    dist = {}
    for year in years:
        in_year = good[good['issued_yr'] == year]
        for grade in grades:
            if total == 0:
                share = 0.0
            else:
                # .count() skips rows whose ``term`` is null, as before.
                num = in_year.loc[in_year['grade'] == grade, 'term'].count()
                share = round(num / total, 5)
            dist[str(year) + '_' + grade] = share
    return dist

## Returns a DF of a Downsized Minority (Bad Loan) Class

In [3]:
#--- Since the data of just the bad loans within the original 2.2M dataset might still be too large, this function 
#--- allows us to downsize the minority class first



def pro_down_sample_minority(data, percent_of_data, random_state=123):
    """Down-sample the bad-loan class while keeping its year/grade proportions.

    The bad loans (``loan_status_n == 0``) are split into (issue year, grade)
    cells.  From each cell ``int(percent_of_data * n_bad * cell_share)`` rows
    are drawn without replacement, where ``cell_share`` comes from
    ``percentage_bad_loans(data)``.

    Parameters
    ----------
    data : pandas.DataFrame
        Full loan frame with ``issued_yr``, ``grade``, ``loan_status_n`` and
        ``term`` columns.
    percent_of_data : float
        Fraction of the bad loans to keep (e.g. ``0.5`` for half).
    random_state : int, optional
        Seed passed to ``sklearn.utils.resample``; defaults to 123, the value
        that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        The sampled bad loans (original index labels kept).  An empty frame
        with the same columns is returned when nothing is sampled.
    """
    dist = percentage_bad_loans(data)
    bad_loans = data[data['loan_status_n'] == 0]
    years_ = list(data['issued_yr'].unique())
    grade_ = list(data['grade'].unique())

    # Desired total size of the down-sampled frame.
    size_of_sample = percent_of_data * bad_loans.shape[0]

    pieces = []
    for x in years_:
        # Masks are derived from the frame they index, so no re-alignment of
        # full-length boolean Series is needed.
        in_year = bad_loans[bad_loans['issued_yr'] == x]
        for y in grade_:
            cell = in_year[in_year['grade'] == y]
            target_num = int(size_of_sample * dist[str(x) + '_' + y])
            # The shares are rounded to 5 decimals, so the target can
            # overshoot a small cell by a row or two; never ask for more rows
            # than the cell holds (resample would raise with replace=False).
            target_num = min(target_num, cell.shape[0])
            if target_num <= 0:
                # Nothing to draw; resample is not guaranteed to accept
                # n_samples=0, so skip the call entirely.
                continue
            pieces.append(resample(cell,
                                   replace=False,
                                   n_samples=target_num,
                                   random_state=random_state))

    if not pieces:
        return bad_loans.iloc[0:0].copy()
    # One concat at the end: avoids quadratic copying and keeps the column
    # dtypes of the input instead of inheriting object dtype from an empty
    # seed frame.
    return pd.concat(pieces)


## Returns the Distribution by Issue Year and Grade of the Minority (Bad Loan) Class


In [4]:
def percentage_bad_loans(dataframe):
    """Return the share of bad loans that falls in each (issue year, grade) cell.

    A loan is "bad" when its ``loan_status_n`` equals 0.  For every
    combination of the years found in ``issued_yr`` and the grades found in
    ``grade`` (both taken from the whole frame and sorted) the value is::

        (# bad loans in that year/grade with a non-null ``term``)
        / (total # bad loans)

    rounded to 5 decimal places.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns ``issued_yr``, ``grade`` (string labels),
        ``loan_status_n`` and ``term``.

    Returns
    -------
    dict
        Maps ``"<year>_<grade>"`` to the rounded proportion.  Cells without
        bad loans map to 0; if the frame holds no bad loans at all, every
        value is ``0.0`` (instead of NaN from a division by zero).
    """
    years = sorted(dataframe['issued_yr'].unique())
    grades = sorted(dataframe['grade'].unique())

    # Filter the class once.  Every later mask is built from the frame it is
    # applied to, so pandas never has to re-align a full-length boolean
    # Series against an already filtered frame (which warns and depends on a
    # unique index).
    bad = dataframe[dataframe['loan_status_n'] == 0]
    total = len(bad)  # loop-invariant denominator

    dist = {}
    for year in years:
        in_year = bad[bad['issued_yr'] == year]
        for grade in grades:
            if total == 0:
                share = 0.0
            else:
                # .count() skips rows whose ``term`` is null, as before.
                num = in_year.loc[in_year['grade'] == grade, 'term'].count()
                share = round(num / total, 5)
            dist[str(year) + '_' + grade] = share
    return dist

## Returns a DF of a Downsized Majority (Good Loan) Class

In [6]:
#--- after downsizing the minority class, its shape[0] is now the target size of the dataframe of good loans we want



def pro_down_sample_combined_dict_minority(data, downsized_minority_size, random_state=123):
    """Down-sample the good-loan class to mirror the bad-loan year/grade mix.

    The good loans (``loan_status_n == 1``) are split into (issue year,
    grade) cells.  From each cell ``int(downsized_minority_size * cell_share)``
    rows are drawn without replacement, where ``cell_share`` is the *bad-loan*
    share of that cell as returned by ``percentage_bad_loans(data)``.  The
    result therefore has roughly ``downsized_minority_size`` rows distributed
    like the minority class.

    Parameters
    ----------
    data : pandas.DataFrame
        Full (original) loan frame with ``issued_yr``, ``grade``,
        ``loan_status_n`` and ``term`` columns.
    downsized_minority_size : int
        Target number of good loans, normally ``shape[0]`` of the
        down-sampled bad-loan frame.
    random_state : int, optional
        Seed passed to ``sklearn.utils.resample``; defaults to 123, the value
        that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        The sampled good loans (original index labels kept).  A cell that
        holds fewer good loans than its target contributes all of its rows,
        so the result can be smaller than ``downsized_minority_size``.  An
        empty frame with the same columns is returned when nothing is sampled.
    """
    # Distribution of the minority class: the sampling target per cell.
    dist = percentage_bad_loans(data)
    good_loans = data[data['loan_status_n'] == 1]
    years_ = list(data['issued_yr'].unique())
    grade_ = list(data['grade'].unique())

    size_of_sample = downsized_minority_size

    pieces = []
    for x in years_:
        # Masks are derived from the frame they index, so no re-alignment of
        # full-length boolean Series is needed.
        in_year = good_loans[good_loans['issued_yr'] == x]
        for y in grade_:
            cell = in_year[in_year['grade'] == y]
            target_num = int(size_of_sample * dist[str(x) + '_' + y])
            # The target comes from the bad-loan distribution, so it may
            # exceed the number of good loans available in this cell; cap it
            # (resample would raise with replace=False).
            target_num = min(target_num, cell.shape[0])
            if target_num <= 0:
                # Nothing to draw; resample is not guaranteed to accept
                # n_samples=0, so skip the call entirely.
                continue
            pieces.append(resample(cell,
                                   replace=False,
                                   n_samples=target_num,
                                   random_state=random_state))

    if not pieces:
        return good_loans.iloc[0:0].copy()
    # One concat at the end: avoids quadratic copying and keeps the column
    # dtypes of the input instead of inheriting object dtype from an empty
    # seed frame.
    return pd.concat(pieces)

# Steps

In [None]:
#: Prerequisite: `data` (the original loans dataframe) and `percent_of_data` (the fraction of
# the bad loans to keep) must already be defined before this cell is run.

#: 1. Downsize the minority class into a new dataframe and choose the percentage wanted from the original dataset
downsized_minority = pro_down_sample_minority(data, percent_of_data)

#: 2. Downsize the majority class into a new dataframe; the target size is the shape[0] of the
# minority-class dataframe created in step 1. The input data is the original dataset.
downsized_minority_size = downsized_minority.shape[0]
downsized_majority = pro_down_sample_combined_dict_minority(data, downsized_minority_size)

# Show only the resulting sizes rather than dumping a whole dataframe.
downsized_minority.shape, downsized_majority.shape


# Check

In [7]:
#: `original_dataframe` and `new_downsized_dataframe` must be defined before this cell is run.

#: Distribution from the original data set of bad loans is similar to the distribution of the
# downsized bad loans dataframe
orig_bad = percentage_bad_loans(original_dataframe)
new_bad = percentage_bad_loans(new_downsized_dataframe)
# Compare cell by cell using the "<year>_<grade>" key. A positional zip over .values() would
# silently misalign if the two frames do not contain exactly the same year/grade cells.
bad_diff = {
    key: orig_bad.get(key, 0) - new_bad.get(key, 0)
    for key in sorted(set(orig_bad) | set(new_bad))
}

#: Distribution of downsized "good loans" matches the distribution of the downsized "bad loans"
new_good = percentage_good_loans(new_downsized_dataframe)
good_vs_bad_diff = {
    key: new_bad.get(key, 0) - new_good.get(key, 0)
    for key in sorted(set(new_bad) | set(new_good))
}

# Display both comparisons (only the last expression of a cell is rendered).
bad_diff, good_vs_bad_diff



NameError: name 'original_dataframe' is not defined