In [105]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

# Seed NumPy's legacy global random number generator so that NumPy-driven
# randomness is repeatable from one run of the notebook to the next.
np.random.seed(seed=2)

In [106]:
# Read the training CSV into `df`; the path is resolved relative to the
# directory the notebook is run from.
train_csv_path = 'lend-or-lose/train.csv'
df = pd.read_csv(train_csv_path)
df

Unnamed: 0,LoanID,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,DRIRC89L0T,18,137576,209136,846,26,2,10.47,60,0.81,High School,Self-employed,Single,Yes,No,Business,No,0
1,TS0FIUNHNU,47,57194,5970,748,30,2,19.72,36,0.73,High School,Unemployed,Divorced,No,Yes,Education,No,0
2,I0YR284A1V,26,84328,95065,453,7,2,24.25,12,0.45,Master's,Self-employed,Married,No,No,Other,Yes,0
3,WB1T7NQV8A,53,49795,229582,533,107,3,14.44,60,0.17,Bachelor's,Self-employed,Single,Yes,No,Auto,Yes,1
4,J6GU9M4G1Z,49,115450,22072,840,0,4,24.48,12,0.11,Bachelor's,Part-time,Single,No,Yes,Education,Yes,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204272,KYS1HKNGGE,40,116623,161673,651,79,2,23.44,12,0.87,Bachelor's,Part-time,Divorced,No,No,Home,Yes,0
204273,5MAOH3AOZO,67,62958,189499,460,77,3,9.29,36,0.11,Bachelor's,Self-employed,Single,No,No,Business,Yes,0
204274,5Y9Z6NW29X,62,34372,59645,524,94,3,9.72,60,0.24,PhD,Full-time,Single,Yes,No,Auto,No,0
204275,O51974F566,44,146262,198454,489,7,4,4.31,48,0.30,High School,Self-employed,Married,Yes,No,Home,No,0


In [107]:
# Columns whose values are replaced by integer codes.
col_to_convert = [
    "Education",
    "EmploymentType",
    "MaritalStatus",
    "HasMortgage",
    "HasDependents",
    "LoanPurpose",
    "HasCoSigner",
]

# One encoder per column, kept in `le_dict` so each column's fitted mapping
# stays available after this cell has run.
le_dict = {name: LabelEncoder() for name in col_to_convert}

for col, le in le_dict.items():
    # Fit the encoder on the column and overwrite the column with its codes.
    df[col] = le.fit_transform(df[col])

df

Unnamed: 0,LoanID,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,DRIRC89L0T,18,137576,209136,846,26,2,10.47,60,0.81,1,2,2,1,0,1,0,0
1,TS0FIUNHNU,47,57194,5970,748,30,2,19.72,36,0.73,1,3,0,0,1,2,0,0
2,I0YR284A1V,26,84328,95065,453,7,2,24.25,12,0.45,2,2,1,0,0,4,1,0
3,WB1T7NQV8A,53,49795,229582,533,107,3,14.44,60,0.17,0,2,2,1,0,0,1,1
4,J6GU9M4G1Z,49,115450,22072,840,0,4,24.48,12,0.11,0,1,2,0,1,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204272,KYS1HKNGGE,40,116623,161673,651,79,2,23.44,12,0.87,0,1,0,0,0,3,1,0
204273,5MAOH3AOZO,67,62958,189499,460,77,3,9.29,36,0.11,0,2,2,0,0,1,1,0
204274,5Y9Z6NW29X,62,34372,59645,524,94,3,9.72,60,0.24,3,0,2,1,0,0,0,0
204275,O51974F566,44,146262,198454,489,7,4,4.31,48,0.30,1,2,1,1,0,3,0,0


In [108]:
# Feature column names: every column of `df` except "LoanID" and "Default".
# The exclusion is by column name, not by position.
all_columns = df.columns.to_numpy()
is_excluded = (all_columns == "LoanID") | (all_columns == "Default")
features = all_columns[~is_excluded]
features

array(['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed',
       'NumCreditLines', 'InterestRate', 'LoanTerm', 'DTIRatio',
       'Education', 'EmploymentType', 'MaritalStatus', 'HasMortgage',
       'HasDependents', 'LoanPurpose', 'HasCoSigner'], dtype=object)

In [109]:
# Missing values per column, expressed as a percentage of the number of rows.
missing_counts = df.isna().sum()
null_value_percentages = missing_counts.div(df.shape[0]).mul(100)
null_value_percentages

LoanID            0.0
Age               0.0
Income            0.0
LoanAmount        0.0
CreditScore       0.0
MonthsEmployed    0.0
NumCreditLines    0.0
InterestRate      0.0
LoanTerm          0.0
DTIRatio          0.0
Education         0.0
EmploymentType    0.0
MaritalStatus     0.0
HasMortgage       0.0
HasDependents     0.0
LoanPurpose       0.0
HasCoSigner       0.0
Default           0.0
dtype: float64

In [110]:
# 3. Classifiers:
#    - Probabilistic classifiers: Bayes classifier, logistic regression
#    - Non-probabilistic classifier: k-nearest neighbours
# 4. Decision trees
# 5. Random forests
# 6. Gradient boosting
# AdaBoost

In [111]:
# Standardize the feature columns with StandardScaler.
# fit_transform() learns the scaling parameters and returns the scaled array in
# one call, so no second transform() over the same data is needed afterwards.
# NOTE(review): the scaler is fitted on every row of `df`, before the train/test
# split performed later in the notebook — confirm that this is intended.
scaler = StandardScaler()
normalized_features = scaler.fit_transform(df[features])

normalized_features

array([[-1.69983849e+00,  1.41378519e+00,  1.15148725e+00, ...,
        -1.00040639e+00, -7.07809999e-01, -1.00056312e+00],
       [ 2.34119601e-01, -6.49831145e-01, -1.71586596e+00, ...,
         9.99593771e-01, -3.11708348e-04, -1.00056312e+00],
       [-1.16633281e+00,  4.67696561e-02, -4.58436895e-01, ...,
        -1.00040639e+00,  1.41468487e+00,  9.99437197e-01],
       ...,
       [ 1.23444275e+00, -1.23573162e+00, -9.58331812e-01, ...,
        -1.00040639e+00, -1.41530829e+00, -1.00056312e+00],
       [ 3.40549710e-02,  1.63677754e+00,  1.00072842e+00, ...,
        -1.00040639e+00,  7.07186583e-01, -1.00056312e+00],
       [-4.99450708e-01,  3.93760753e-01,  8.61556661e-01, ...,
        -1.00040639e+00,  7.07186583e-01, -1.00056312e+00]])

In [112]:
class OutlierRemoval:
    """Clamp the values of one column to its IQR-based whisker range.

    The whisker bounds are computed once, at construction time, from the
    column's 25th and 75th percentiles. Despite the class name, out-of-range
    values are not dropped: `replace_outlier` returns the nearest bound.
    """

    def __init__(self, col, whisker_factor=1.5):
        """Compute the whisker bounds for `col`.

        Args:
            col: Object exposing ``quantile(q)``, e.g. a pandas Series.
            whisker_factor: Multiple of the inter-quartile range placed above
                the third quartile and below the first quartile. Defaults to
                1.5.
        """
        q1 = col.quantile(0.25)
        q3 = col.quantile(0.75)
        inter_quartile_range = q3 - q1
        # Largest value that is not treated as an outlier.
        self.upper_whisker = q3 + inter_quartile_range * whisker_factor
        # Smallest value that is not treated as an outlier.
        self.lower_whisker = q1 - inter_quartile_range * whisker_factor
        # Becomes True the first time `replace_outlier` clamps a value.
        self.has_outliers = False

    def replace_outlier(self, value):
        """Return `value`, clamped to the whisker range if it lies outside it.

        Side effect: sets ``has_outliers`` to True when a value is clamped.
        A NaN compares False on both sides and is therefore returned as is.

        Args:
            value: A single scalar taken from the column.

        Returns:
            `value` unchanged when it is within the bounds, otherwise the
            whisker bound nearest to it.
        """
        if value < self.lower_whisker or value > self.upper_whisker:
            self.has_outliers = True
            return min(max(value, self.lower_whisker), self.upper_whisker)
        return value

def replace_outliers(df, columns):
    """Clamp IQR outliers in the given columns and print which columns had any.

    For each column the whisker bounds come from ``OutlierRemoval``; values
    outside them are replaced by the nearest bound. Both the outlier check and
    the clamping are vectorized rather than calling a Python function once per
    value.

    Args:
        df: DataFrame to clean. It is modified in place.
        columns: Iterable of column names in `df` to process.

    Returns:
        The same `df` object that was passed in.
    """
    outlier_columns = []  # Names of columns in which at least one value was clamped
    for col in columns:
        bounds = OutlierRemoval(df[col])
        values = df[col]
        # Missing values compare False on both sides, so they are never flagged.
        is_outlier = (values < bounds.lower_whisker) | (values > bounds.upper_whisker)
        if is_outlier.any():
            # Only rewrite the column when something actually has to change.
            df[col] = values.clip(lower=bounds.lower_whisker, upper=bounds.upper_whisker)
            outlier_columns.append(col)
    print("Columns with outliers:", outlier_columns)  # Report the affected columns
    return df

# Clamp IQR outliers in every feature column and bind the result back to `df`.
# NOTE(review): `normalized_features` was computed from `df` in an earlier cell,
# so this clamping does not change it — confirm the intended order of steps.
df = replace_outliers(df=df, columns=features)
df


Columns with outliers: []


Unnamed: 0,LoanID,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,DRIRC89L0T,18,137576,209136,846,26,2,10.47,60,0.81,1,2,2,1,0,1,0,0
1,TS0FIUNHNU,47,57194,5970,748,30,2,19.72,36,0.73,1,3,0,0,1,2,0,0
2,I0YR284A1V,26,84328,95065,453,7,2,24.25,12,0.45,2,2,1,0,0,4,1,0
3,WB1T7NQV8A,53,49795,229582,533,107,3,14.44,60,0.17,0,2,2,1,0,0,1,1
4,J6GU9M4G1Z,49,115450,22072,840,0,4,24.48,12,0.11,0,1,2,0,1,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204272,KYS1HKNGGE,40,116623,161673,651,79,2,23.44,12,0.87,0,1,0,0,0,3,1,0
204273,5MAOH3AOZO,67,62958,189499,460,77,3,9.29,36,0.11,0,2,2,0,0,1,1,0
204274,5Y9Z6NW29X,62,34372,59645,524,94,3,9.72,60,0.24,3,0,2,1,0,0,0,0
204275,O51974F566,44,146262,198454,489,7,4,4.31,48,0.30,1,2,1,1,0,3,0,0


In [113]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split repeatable between runs.
target = df["Default"]
X_train, X_test, y_train, y_test = train_test_split(
    normalized_features,
    target,
    test_size=0.3,
    random_state=2,
)

In [114]:
# Fit a Gaussian Naive Bayes classifier on the training split.
model = GaussianNB().fit(X_train, y_train)

# Score the held-out split: share of rows whose predicted label equals y_test.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8829221330200379


In [118]:
test_df = pd.read_csv("lend-or-lose/test.csv")

# Reuse the encoders fitted on the training data. Refitting them here with
# fit_transform() would rebuild each mapping from the test values alone, which
# can assign different codes than the ones the model was trained on.
# transform() raises ValueError if the test data holds a label not seen in training.
for col, le in le_dict.items():
    test_df[col] = le.transform(test_df[col])

# The model was trained on standardized features, so the same fitted scaler
# must be applied before predicting instead of passing raw column values.
test_features = scaler.transform(test_df[features])
y_pred = model.predict(test_features)

# Two-column submission frame: the loan identifier and its predicted label.
submission = pd.DataFrame({
    "LoanID": test_df["LoanID"],
    "Default": y_pred,
})



In [119]:
# Write the predictions to disk, leaving out the DataFrame index column.
submission.to_csv(path_or_buf="NaiveBayesClassifier.csv", index=False)