# Import Libraries

In [None]:
# importing necessary libraries, modules and functions
from __future__ import unicode_literals


from Functions.Validations.Validation import summary,LivePD_Validation, Vintage_Validation,Score_Binning_Validation
from Functions.General.FilesFlow import make_directory, current_path_and_path_list
from Functions.General.Module import WoeAnalysis, WoE_Binning, CreditScoring
from Functions.Validations.Validation import cutoff, cutoff_plot
from Functions.Visualisation.Heatmap import correlation_heatmap
from Functions.Visualisation.Lineplot import Lineplot


from sklearn.metrics import roc_auc_score, classification_report
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline


import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
import pickle
import dill


# Silence every warning raised through the standard `warnings` machinery so the
# notebook output stays readable (this also hides deprecation notices).
warnings.simplefilter('ignore')

# Never truncate wide DataFrames on display: a column limit of None makes
# pandas show every column.
pd.options.display.max_columns = None

# Path Control & Global Variables

In [None]:
# Resolve where this notebook lives: `curr_path` is used as a path object that
# exposes `.parents`, `path_parts` as the list of its path components.
curr_path, path_parts = current_path_and_path_list()

# Third ancestor of the current path, kept with a trailing backslash because
# every directory below is assembled by plain string concatenation.
project_root = str(curr_path.parents[2]) + "\\"

# Folder holding the raw input data.
raw_dir = project_root + "Data\\Raw Data\\"

# Everything produced for this scorecard goes under Data\Scorecards\<name>\,
# where <name> is the second-to-last component of the current path.
scorecard_root = project_root + "Data\\Scorecards\\" + path_parts[-2] + "\\"
results_root = scorecard_root + "Results\\"

# make_directory's return value is used later as a string prefix for file names.
proc_dir = make_directory(scorecard_root + "Processed Data\\")
res_dir = make_directory(results_root)
res_data_plt_dir = make_directory(results_root + "Data (Plot)\\")
res_data_exl_dir = make_directory(results_root + "Data (Excel)\\")
res_data_pkl_dir = make_directory(results_root + "Data (Pickle)\\")
model_dir = make_directory(scorecard_root + "Model Flow\\")

## Import Data

In [None]:
# Load the pickled inputs. The three "Data S2" frames come from the processed
# folder of this scorecard; the live PD table comes from the raw-data folder.
# NOTE(review): unpickling executes arbitrary code - only load trusted files.
df = pd.read_pickle(f"{proc_dir}Data S2.pkl")
PDLive = pd.read_pickle(f"{raw_dir}PDLive.pkl")
allApproved_df = pd.read_pickle(f"{proc_dir}Data S2 Approved.pkl")
Rejected_df = pd.read_pickle(f"{proc_dir}Data S2 Rejected.pkl")

## Bad Client Rates

In [None]:
# Count the outcomes once, restricted to rows that have a LoanId.
# In this notebook Actual == 1 is reported as "Good" and Actual == 0 as "Bad".
loan_outcomes = df.loc[df["LoanId"].notna(), "Actual"].value_counts()

# .get(..., 0) keeps the cell working when one of the classes is absent,
# where plain [...] indexing would raise KeyError.
good_count = loan_outcomes.get(1, 0)
bad_count = loan_outcomes.get(0, 0)

# total number of records where "LoanId" is not NaN
total = good_count + bad_count

# percentage of bad customers (0.0 when there are no such records at all)
bads = round((bad_count / total) * 100, 2) if total else 0.0

# printing values of good & bad and percentage of bad
print(f"""Good: {good_count}
Bad: {bad_count}
Weight of bad customers is:  {bads}%  """)


## Split the data

In [None]:
# Target vector, and the predictor matrix made of every other column.
y = df['Actual']
X = df.drop(columns=['Actual'])

# Analysis object that the feature cells below call into.
woe_analysis = WoeAnalysis()

# 80/20 split, stratified on the target so both parts keep the same class mix;
# the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

## Discrete Features

### Purpose

In [None]:
# Discrete-feature analysis of "Purpose" on the training split, then its plot.
purpose_woe = woe_analysis.discrete(column="Purpose", df=X_train, target=y_train)
purpose_woe.plot()

### RejectReasonName

In [None]:
# Discrete-feature analysis of "RejectReasonName" on the training split, then its plot.
reject_reason_woe = woe_analysis.discrete(column="RejectReasonName", df=X_train, target=y_train)
reject_reason_woe.plot()

### PayTimeCategory

In [None]:
# Discrete-feature analysis of "PayTimeCategory" on the training split, then its plot.
pay_time_woe = woe_analysis.discrete(column="PayTimeCategory", df=X_train, target=y_train)
pay_time_woe.plot()

### MaritalStatus

In [None]:
# Discrete-feature analysis of "MaritalStatus" on the training split, then its plot.
marital_status_woe = woe_analysis.discrete(column="MaritalStatus", df=X_train, target=y_train)
marital_status_woe.plot()

### RiskGrade

In [None]:
# Discrete-feature analysis of "RiskGrade" on the training split, then its plot.
risk_grade_woe = woe_analysis.discrete(column="RiskGrade", df=X_train, target=y_train)
risk_grade_woe.plot()

### ActiveHistory_PreviousPdSum

In [None]:
# Discrete-feature analysis of "ActiveHistory_PreviousPdSum" on the training split, then its plot.
previous_pd_woe = woe_analysis.discrete(column="ActiveHistory_PreviousPdSum", df=X_train, target=y_train)
previous_pd_woe.plot()

<br><br><br><br>
## Continuous Features

### Last12Months

In [None]:
# Contiguous right-closed bins for "Last12Months": each consecutive pair of
# edges forms one interval, from (-50, -0.1] up to the open-ended (30, inf].
bins = pd.IntervalIndex.from_breaks(
    [-50, -0.1, 1, 2, 3, 4, 5, 6, 8, 11, 15, 20, 30, np.inf]
)

# Continuous-feature analysis on the training split, plotted with rotated labels.
last12_woe = woe_analysis.continuous(column="Last12Months", bins=bins, df=X_train, target=y_train)
last12_woe.plot(rotation=90)

### RefinanceRate

In [None]:
# Contiguous right-closed bins for "RefinanceRate": each consecutive pair of
# edges forms one interval, from (-1, 0] up to the open-ended (0.65, inf].
bins = pd.IntervalIndex.from_breaks(
    [-1, 0, 0.2, 0.35, 0.45, 0.55, 0.65, np.inf]
)

# Continuous-feature analysis on the training split, plotted with rotated labels.
refinance_woe = woe_analysis.continuous(column="RefinanceRate", bins=bins, df=X_train, target=y_train)
refinance_woe.plot(rotation=90)

### RejectedAppCount

In [None]:
# Contiguous right-closed bins for "RejectedAppCount": each consecutive pair of
# edges forms one interval, from (-50, 0] up to the open-ended (3, inf].
bins = pd.IntervalIndex.from_breaks([-50, 0, 1, 2, 3, np.inf])

# Continuous-feature analysis on the training split, plotted with rotated labels.
rejected_apps_woe = woe_analysis.continuous(column="RejectedAppCount", bins=bins, df=X_train, target=y_train)
rejected_apps_woe.plot(rotation=90)

<br><br><br><br><br><br><br><br>
# Export data for stage 2

In [None]:
# WoE mapping held by the analysis object (presumably filled by the feature
# cells that called woe_analysis.discrete / .continuous - confirm in WoeAnalysis).
WoE_dict = woe_analysis.WoE_dict

# Names of the variables: take the text before the first ":" of every entry
# of WoE_dict and drop duplicates while keeping first-seen order.
names = list(dict.fromkeys(entry.split(":")[0] for entry in WoE_dict))

# Correlations

In [None]:
# Restrict the full and the training predictors to the modelled variables.
# (X_test is left as it is here.)
X, X_train = X[names], X_train[names]

# WoE encoder built from the mapping above, applied to the full predictor set
# to inspect correlations between the encoded variables and with the target.
woe_transform = WoE_Binning(WoE_dict=WoE_dict, Production=False)
X_transformed = woe_transform.transform(X)
correlation_heatmap(X_transformed, target=y)

# Number of encoded columns.
sizz = X_transformed.shape[1]

# Model Evaluation

In [None]:

def evaluate_model(pipeline):
    """Evaluate a fitted classifier on the hold-out split and print a report.

    The function reads the notebook-level ``X_test`` and ``y_test``; they are
    not passed in. Predicted and true labels are flipped (0 <-> 1) before any
    metric is computed, so class "1" in everything reported here is the
    original label 0 and class "0" is the original label 1.

    Output: accuracy, ROC-AUC and Gini (``2 * AUC - 1``), the text
    classification report, a confusion-matrix heatmap, and a summary in
    Georgian built from the per-class precision / recall / support.

    Args:
        pipeline: Fitted estimator exposing ``predict``. When it also exposes
            ``predict_proba`` and ``classes_``, ROC-AUC is computed from the
            predicted probability of original label 0; scoring hard 0/1
            predictions instead reduces the ROC curve to a single operating
            point and misstates AUC and Gini.

    Returns:
        None. Everything is printed or plotted.
    """
    def plot_cm(y_true, y_pred):
        """Show the confusion matrix of *y_true* against *y_pred* as a heatmap."""
        # Confusion matrix wrapped in a DataFrame for seaborn
        cm_frame = pd.DataFrame(confusion_matrix(y_true, y_pred))

        # seaborn font scale, annotated heatmap, axis names
        sns.set(font_scale=1.4)
        sns.heatmap(cm_frame, annot=True, annot_kws={"size":16}, fmt='g', cmap='Blues')
        plt.title('CPA Consumer\n')
        plt.xlabel("Predicited")
        plt.ylabel("True")
        plt.show()

    # hard class predictions for the hold-out rows
    y_pred = pipeline.predict(X_test)

    # invert the predicted labels and true labels (changing 0s to 1s and 1s to 0s)
    y_pred, y_test_hat = [1^x for x in y_pred], [1^x for x in y_test]

    acc = accuracy_score(y_test_hat, y_pred)

    # ROC-AUC needs a ranking score. After the flip the positive class is the
    # original label 0, so its predicted probability is the score to rank by.
    if hasattr(pipeline, "predict_proba") and hasattr(pipeline, "classes_"):
        positive_col = list(pipeline.classes_).index(0)
        positive_scores = pipeline.predict_proba(X_test)[:, positive_col]
        roc_auc = roc_auc_score(y_test_hat, positive_scores)
    else:
        # No probabilities available: fall back to the hard labels.
        roc_auc = roc_auc_score(y_test_hat, y_pred)

    print(f"Accuracy: {acc:.4f}, ROC-AUC: {roc_auc:.4f}, gini: {2*roc_auc-1:.4f}\n")
    print(classification_report(y_test_hat, y_pred))
    plot_cm(y_test_hat, y_pred)

    # Same report as a dict, keyed by the (flipped) class labels "0" and "1".
    class_report = classification_report(y_test_hat, y_pred, output_dict=True)

    print(f'''
საერთო აკურატულობა: {acc*100:.3f} %
რეალურად დაიფარა {class_report["0"]["support"]}-ი სესხი
რეალურად გაფუჭდა  {class_report["1"]["support"]}-ი სესხი

მოდელმა დააპროგნოზა {class_report["0"]["recall"]*class_report["0"]["support"] + (1-class_report["1"]["recall"])*class_report["1"]["support"]} სესხის დაფარვა, საიდანაც რეალურად დაფარული იყო {str(round(class_report["0"]["precision"], 4)*100)[0:5]} % ანუ {class_report["0"]["recall"]*class_report["0"]["support"]} სესი.

მოდელმა დააპროგნოზა {class_report["1"]["recall"]*class_report["1"]["support"] + (1-class_report["0"]["recall"])*class_report["0"]["support"]} სესხის გაფუჭება, საიდანაც რეალურად გაფუჭებული იყო {str(round(class_report["1"]["precision"], 4)*100)[0:5]} % ანუ {class_report["1"]["recall"]*class_report["1"]["support"]} სესი.

რეალურად გაფუჭებული სესხებიდან მოდელმა სწორად აიდენტიფიცირა {str(round(class_report["1"]["recall"], 4)*100)[0:5]} %

რეალურად დაფარული სესხებიდან მოდელმა სწორად აიდენტიფიცირა {str(round(class_report["0"]["recall"], 4)*100)[0:5]} %



precision:  What proportion of positive identifications was actually correct?
recall:     What proportion of actual positives was identified correctly?

precision:  იდენტიფიცირებულების რა წილი იყო რეალურად სწორი?
recall:     რეალურად სწორის რა წილი იყო სწორად იდენტიფიცირებული?

            ''')


# Model

In [None]:
# Class-balanced logistic regression with C=0.1 and up to 1,000 solver iterations.
model = LogisticRegression(C=0.1, class_weight='balanced', max_iter=1_000)

# WoE encoding first, classifier second, wrapped as one estimator.
model_steps = [('woe', woe_transform), ('logistic regression', model)]
pipeline = Pipeline(steps=model_steps)

# Fit on the training split, then report performance on the hold-out split.
pipeline.fit(X_train, y_train)
evaluate_model(pipeline)


In [None]:
# Validation is finished: refit the same pipeline on the complete data set and
# keep the class probabilities it assigns to those same rows (in-sample).
Probability = pipeline.fit(X, y).predict_proba(X)

In [None]:
# Scoring object built from the classifier and the WoE mapping. The
# WoE_Binning class itself (not an instance) is handed over.
scoring = CreditScoring(
    data=df,
    model=model,
    WoE_dict=WoE_dict,
    WoE_Binning=WoE_Binning,
    production=True,
)


# allApproved_df = scoring.apply(allApproved_df).data
# Rejected_df = scoring.apply(Rejected_df).data

# Apply the scoring to the modelling data; the result exposes the processed
# frame as `.data` and the scorecard table as `.scorecard`.
temp_df = scoring.apply(df)
df, scorecard = temp_df.data, temp_df.scorecard


# Persist the scoring object into the model-flow folder.
scorecard_pickle_path = model_dir + 'Step 3 (ScoreCard).pkl'
with open(scorecard_pickle_path, 'wb') as file:
    pickle.dump(scoring, file)

In [None]:
def cutoff_plot(data, approved_rates=range(20, 100, 5)):
    """Plot approval rate against bad rate over a grid of target approval rates.

    This notebook-level definition replaces the ``cutoff_plot`` imported from
    ``Functions.Validations.Validation`` for every cell executed after it.

    Args:
    -----
    data : pandas.DataFrame
        A DataFrame containing at least the columns 'Actual' and 'Scores'.
    approved_rates : iterable, optional
        Target approval rates passed one by one to ``cutoff`` as
        ``Approved_Rate``. Defaults to 20, 25, ..., 95.

    Returns:
    --------
    Whatever ``Lineplot`` returns for the assembled curve.
    """
    curve_points = []

    for rate in approved_rates:
        # cutoff(...) returns a sequence: its last item is used as the approval
        # rate and its second-to-last item as the bad rate.
        outcome = cutoff(Data=data[["Actual", "Scores"]], Approved_Rate=rate, Display=False)
        curve_points.append({
            'Approve Rate': outcome[-1],
            'Bad Rate': outcome[-2]
        })

    # Naming the columns explicitly keeps both present even for an empty grid.
    results = pd.DataFrame(curve_points, columns=['Approve Rate', 'Bad Rate'])

    # plotting
    return  Lineplot(Data=results, X_value='Bad Rate', Y_values=['Approve Rate'], Y_labels=['Model Approved'], Title="Approve Rate Over Bad Rate",figsize=(12, 5))



## Validation 

In [None]:
# Draw the approval-rate / bad-rate curve, then compute (and display) the
# cutoff result for a target approval rate of 87.
scored_outcomes = df[["Actual", "Scores"]]
cutoff_plot(scored_outcomes)
approve_rate = cutoff(Data=scored_outcomes, Approved_Rate=87, Display=True)

In [None]:

# Summary of the scored data at the chosen cutoff (first element of approve_rate).
summary(data = df, cutoff_score= approve_rate[0])

In [None]:
# Same summary with PDD=30 passed explicitly.
# NOTE(review): PDD presumably selects a 30-day past-due definition - confirm in summary().
summary(data = df, cutoff_score= approve_rate[0], PDD=30)

In [None]:
# Series and legend labels shared by the two live-PD30 charts below.
pd30_series = ('Live30PercentA', 'Live30PercentB', 'Live30PercentF')
segment_labels = ('Model Approved', 'Model Rejected', 'Without Model')

# Live PD30 over the whole history of approved loans.
result = LivePD_Validation(
    Model_data=allApproved_df,
    PD_Live=PDLive,
    cutoff_score=approve_rate[0],
    result_path=res_data_plt_dir,
    Title="Whole History PD30 SC1",
)
Lineplot(
    Data=result,
    X_value='MonthlyDate',
    Y_values=list(pd30_series),
    Y_labels=list(segment_labels),
    path=res_data_plt_dir,
    Title="Whole History PD30",
    figsize=(12, 5),
)


# Live PD30 restricted to loans with LoanValueDate on or after 2023-05-01.
recent_mask = allApproved_df["LoanValueDate"] >= '2023-05-01'
allApproved_data = allApproved_df[recent_mask]
result = LivePD_Validation(
    Model_data=allApproved_data,
    PD_Live=PDLive,
    cutoff_score=approve_rate[0],
)
Lineplot(
    Data=result,
    X_value='MonthlyDate',
    Y_values=list(pd30_series),
    Y_labels=list(segment_labels),
    path=res_data_plt_dir,
    Title="Last Year PD30 new",
    figsize=(12, 5),
)


# Vintage validation over the whole history of approved loans.
result = Vintage_Validation(
    Model_data=allApproved_df,
    cutoff_score=approve_rate[0],
    result_path=res_data_plt_dir,
    Title="Whole History Vintage SC1",
)
Lineplot(
    Data=result,
    X_value='AppRegisterDate',
    Y_values=["Model", "Rejected", "TotalRisk"],
    Y_labels=list(segment_labels),
    path=res_data_plt_dir,
    Title="Whole History Vintage",
    figsize=(12, 5),
)



In [None]:
# Score binning by scores: 30 bins with binning_type=1, then risk per bin.
result = Score_Binning_Validation(
    df=allApproved_df,
    bins=30,
    path=res_data_plt_dir,
    binning_type=1,
)
Lineplot(
    Data=result,
    X_value='RowNumber',
    Y_values=["Risk"],
    Y_labels=['Risk'],
    path=res_data_plt_dir,
    Title='score binning for scores',
    figsize=(12, 5),
)
# Leave the table as the cell's last expression so the notebook displays it.
result

In [None]:
# Score binning by percentiles: 30 bins with binning_type=2, then risk per bin.
# NOTE(review): title='idk' looks like a placeholder - pick a real title.
result = Score_Binning_Validation(
    df=allApproved_df,
    bins=30,
    path=res_data_plt_dir,
    binning_type=2,
    title='idk',
)
Lineplot(
    Data=result,
    X_value='RowNumber',
    Y_values=["Risk"],
    Y_labels=['Risk'],
    figsize=(12, 5),
)
# Leave the table as the cell's last expression so the notebook displays it.
result

In [None]:
# WoE_dict = woe_analysis.WoE_dict
# Variable_Ranges = woe_analysis.Variable_Ranges
# Variable_types = woe_analysis.Variable_types
# IV_dict = woe_analysis.IV_dict
# IV_excel = woe_analysis.IV_excel
# 
# 
# df.to_pickle(res_data_pkl_dir + f"Approved Scores with data.pkl")
# allApproved_df.to_pickle(res_data_pkl_dir  + f"Whole Approved Scores with data.pkl")
# Rejected_df.to_pickle(res_data_pkl_dir  + f"Rejected Data Approved by Model.pkl")
# 
# 
# 
# 
# scorecard.to_excel(res_dir + f"Scorecard.xlsx", index=False)
# df.to_excel(res_data_exl_dir + f"Approved Scores with data.xlsx", index=False)
# IV_excel.to_excel(res_dir + f"Manual Final IV.xlsx", index=False)
# allApproved_df.to_excel(res_data_exl_dir + f"Whole Approved Scores with data.xlsx")
# Rejected_df.to_excel(res_data_exl_dir + f"Rejected Data Approved by Model.xlsx", index=False)
# 
# 
# 
# 
# # Save the variables directly to a .pkl file
# with open(model_dir + 'Variables.pkl', 'wb') as file:
#     dill.dump({'WoE_dict': WoE_dict, 'Variable_Ranges': Variable_Ranges, 'Variable_types': Variable_types, 'IV_dict': IV_dict, 'IV_excel': IV_excel}, file)
