In [None]:
#the goal of this project is to classify a person's eligibility for a loan using the credit score as a means of classification 
#import the necessary libraries

import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
%config InlineBackend.figure_format='retina'

#warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
#loading the dataset
#The CSV location can be overridden with the CREDIT_SCORE_TRAIN_CSV environment variable so the
#notebook also runs on machines that do not share the original folder layout. When the variable
#is not set, the original hard-coded path is used.
TRAIN_CSV_PATH = os.environ.get(
    "CREDIT_SCORE_TRAIN_CSV",
    "C:\\Users\\HP\\Downloads\\Credit-Score-Data\\Credit Score Data\\train.csv",
)
df = pd.read_csv(TRAIN_CSV_PATH)
df.head()

In [None]:
#check the shape of the dataframe: (number of rows, number of columns)
df.shape

In [None]:
#info on the dataframe: prints the column names with their non-null counts and dtypes
df.info()

In [None]:
#check for null values: number of missing entries in each column
df.isna().sum()

In [None]:
#summary statistics from describe(), transposed so that each row describes one column of df
df.describe().T

In [None]:
#ID, Customer_ID and Name are not needed in training the model, so they are removed here
identifier_columns = ['ID','Customer_ID','Name']
df = df.drop(columns=identifier_columns)


In [None]:
#visualize the relationship between each numeric feature and the Credit_Score column
#One interactive plotly box plot is shown per int64/float64 column. plotly figures are rendered
#independently of matplotlib, so no matplotlib figure or subplot grid is created here: it would
#only appear as a block of empty axes and would fail once the fixed grid ran out of slots.
score_colors = {'Poor':'Red','Standard':'Yellow','Good':'green'}
for column in df.select_dtypes(include=['int64','float64']).columns:
    fig = px.box(df,
                 x='Credit_Score',
                 y=column,
                 color='Credit_Score',
                 title="Credit score based on " + str(column),
                 color_discrete_map=score_colors)
    #use the 'exclusive' quartile computation for the boxes
    fig.update_traces(quartilemethod='exclusive')
    fig.show()

In [None]:
#ANALYSIS FROM THE BOXPLOTS

#month has little effect
#age has little effect
#ssn has no effect
#a higher annual income gives a better credit score
#a higher monthly in-hand salary gives a better credit score
#a smaller number of bank accounts increases the credit score
#a smaller number of credit cards increases the credit score
#smaller interest rates attract a higher credit score
#a small number of loans increases the credit score
#shorter delays from the due date give a higher credit score
#fewer delayed payments increase the credit score
#a smaller change in credit limit increases the credit score
#a smaller number of credit inquiries increases the credit score
#smaller outstanding debts give a better credit score
#credit utilization ratio has no effect
#a higher credit history age increases the credit score
#total emi has no effect
#a larger amount invested monthly translates to a higher credit score
#a higher monthly balance gives a higher credit score

In [None]:
#remove the columns judged above to have little or no effect on credit score
no_effect_columns = ['Month','Age','SSN','Credit_Utilization_Ratio','Total_EMI_per_month']
df1 = df.drop(columns=no_effect_columns)
df1.head()

In [None]:
#check relationship between occupation and credit score
#first step: list the distinct values found in the Occupation column
df['Occupation'].unique()

In [None]:
#bar plot of annual income per occupation, with one bar colour per credit score
occupation_fig, occupation_ax = plt.subplots(figsize=(20,15))
sns.barplot(data=df,x='Occupation',y='Annual_Income',hue='Credit_Score',ax=occupation_ax)
#occupation names are long, so turn the x tick labels vertical
plt.setp(occupation_ax.get_xticklabels(),rotation=90)
plt.show()

In [None]:
#ANALYSIS OF BAR PLOT

#Annual income affects the credit score, as seen earlier, but the occupation itself does not,
#so the Occupation column is removed

df2 = df1.drop(columns="Occupation")

In [None]:
#summary of the object-dtype columns that are still in df2
df2.describe(include='object')

In [None]:
#list the distinct values of Type_of_Loan
df2['Type_of_Loan'].unique()

In [None]:
#Type_of_Loan is not kept as a feature
df2 = df2.drop(columns='Type_of_Loan')

In [None]:
#list the distinct values of Credit_Mix
df2['Credit_Mix'].unique()

In [None]:
#one-hot encode Credit_Mix (the first category is dropped and acts as the baseline)
#The 'Credit_Mix' prefix keeps the new indicator columns self-describing; without it they would be
#named after the raw category values, which are easy to confuse with other columns or class labels.
dummy = pd.get_dummies(df2['Credit_Mix'],prefix='Credit_Mix',drop_first=True)
df3 = pd.concat([df2,dummy],axis=1)

In [None]:
#the indicator columns replace the original Credit_Mix column, so remove it
df3 = df3.drop(columns='Credit_Mix')
df3.head()

In [None]:
#list the distinct values of Payment_of_Min_Amount
df3['Payment_of_Min_Amount'].unique()

In [None]:
#count the rows for every (Payment_of_Min_Amount, Credit_Score) combination
pd.crosstab(index=df3['Payment_of_Min_Amount'],columns=df3['Credit_Score'])

In [None]:
#one-hot encode Payment_of_Min_Amount (the first category is dropped and acts as the baseline)
#The prefix keeps the indicator columns identifiable instead of naming them after bare category values.
dummy1 = pd.get_dummies(df3['Payment_of_Min_Amount'],prefix='Payment_of_Min_Amount',drop_first=True)
df4 = pd.concat([df3,dummy1],axis=1)

In [None]:
#remove the original Payment_of_Min_Amount column together with Payment_Behaviour
payment_columns = ['Payment_of_Min_Amount','Payment_Behaviour']
df5 = df4.drop(columns=payment_columns)

In [None]:
#let's see the required columns, i.e. the ones that remain in df5 after the drops above
df5.columns

In [None]:
#remove outliers
#Values lying more than 1.5*IQR below Q1 or above Q3 are pulled back to the nearest fence.
#Clipping straight to [Q1, Q3] would replace the lower and upper quarter of every column with the
#quartile values, erasing half of the information in each feature.
for column in df5.select_dtypes(include=['int64','float64']).columns:
    q1 = df5[column].quantile(0.25)
    q3 = df5[column].quantile(0.75)
    iqr = q3 - q1
    df5[column] = df5[column].clip(lower=q1 - 1.5*iqr,upper=q3 + 1.5*iqr)

In [None]:
#visualize for outliers: one box plot per int64/float64 column
#The grid has 3 columns and as many rows as the number of numeric columns requires, so the cell
#keeps working when that number changes. Each panel is titled with its column name.
numeric_columns = [column for column in df5 if df5[column].dtypes in ('int64','float64')]
n_grid_cols = 3
n_grid_rows = max(1,-(-len(numeric_columns)//n_grid_cols))  #ceiling division
outlier_fig = plt.figure(figsize=(15,10))
for i,column in enumerate(numeric_columns,start=1):
    panel = outlier_fig.add_subplot(n_grid_rows,n_grid_cols,i)
    sns.boxplot(df5[column],ax=panel)
    panel.set_title(column)
outlier_fig.tight_layout()
plt.show()
    

In [None]:
#monthly balance and outstanding balance still has outliers
def remove_out(df, columns=('Monthly_Balance','Outstanding_Balance'), factor=1.5):
    """Return a copy of ``df`` with outliers in the selected columns capped at the IQR fences.

    For every name in ``columns`` that exists in ``df``, values below ``Q1 - factor*IQR`` are
    raised to that lower fence and values above ``Q3 + factor*IQR`` are lowered to that upper
    fence. Names that are not columns of ``df`` are skipped silently. The input frame is left
    untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to cap.
    columns : iterable of str, optional
        Column names to process; defaults to the two names this function was written for.
    factor : float, optional
        Multiple of the IQR that defines the fences (1.5 by default).

    Returns
    -------
    pandas.DataFrame
        A new frame with the capped columns.
    """
    # NOTE(review): 'Outstanding_Balance' may not be an actual column name in this dataset
    # (the analysis notes speak of "outstanding debts") - confirm against df.columns.
    capped = df.copy()
    for column in columns:
        if column not in capped.columns:
            continue
        q1 = capped[column].quantile(0.25)
        q3 = capped[column].quantile(0.75)
        iqr = q3 - q1
        capped[column] = capped[column].clip(lower=q1 - factor*iqr,upper=q3 + factor*iqr)
    return capped

In [None]:
#run remove_out on df5 and preview the first rows of the result
df6 = remove_out(df5)
df6.head()

In [None]:
#separate the predictors from the Credit_Score target and turn both into numpy arrays
feature_frame = df6.drop(columns='Credit_Score')
X  = feature_frame.to_numpy()
y  = df6['Credit_Score'].to_numpy()



In [None]:
#standardise every column of X (StandardScaler learns the statistics, then applies them)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X)
X_scaled = sc.transform(X)

In [None]:
#split into train and test
#The raw features are split first and a scaler is then fitted on the training rows only, so the
#statistics of the held-out rows never leak into the features the models are trained on. The same
#random_state as before is used.
X_train_raw,X_test_raw,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
train_scaler = StandardScaler().fit(X_train_raw)
X_train = train_scaler.transform(X_train_raw)
X_test = train_scaler.transform(X_test_raw)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import KFold





In [None]:
#baseline models: each one is fitted on the training split and its score on the test split is printed
lr = LogisticRegression(solver='liblinear',multi_class='auto')
model = lr.fit(X_train,y_train)
print(model.score(X_test,y_test))

#the SVM gets its own name (`svc`) so it does not overwrite `sc`, which holds the StandardScaler
svc = SVC(gamma='auto',kernel='linear',C=10)
mp = svc.fit(X_train,y_train)
print(mp.score(X_test,y_test))

#fixed random_state so the forest, and therefore its score, is the same on every run
rf = RandomForestClassifier(criterion='gini',n_estimators=5,max_depth=3,random_state=42)
rfclf = rf.fit(X_train,y_train)
print(rfclf.score(X_test,y_test))


In [None]:
#candidate estimators and the hyper-parameter grid to search for each of them
#keys are the labels shown in the results table; 'model' is the estimator, 'params' its grid
models = {
    'LogisticRegression':{'model': LogisticRegression(solver='liblinear',multi_class='auto'),
                          'params': {
                              'C':[10,20,30]
                              }
                         },
    #fixed random_state so repeated searches give the same forest scores
    'RandomForest':{'model':RandomForestClassifier(random_state=42),
                   'params':{'n_estimators':[10,20,30]
                            }
                   },
    'SVM': {'model':SVC(gamma='auto'),
           'params':{'kernel':['rbf','linear'],
                    'C':[10,20,30]
                    }
           }
}

In [None]:
from sklearn.model_selection import GridSearchCV
#5-fold grid search for every candidate in `models`
#The search runs on the scaled training split (X_train, y_train): the test rows stay unseen for the
#final evaluation, and the estimators receive the same scaled features as the baseline models.
scores = []
for model_name,pm in models.items():
    clf = GridSearchCV(pm['model'],pm['params'],cv=5,return_train_score=False)
    clf.fit(X_train,y_train)
    scores.append({'model':model_name,
                  'best_score':clf.best_score_,
                  'best_params':clf.best_params_
                  }
                 )
#the results table gets its own name; assigning it to `df` would replace the loaded dataset
scores_df = pd.DataFrame(scores)[['model','best_score','best_params']]
scores_df