In [5]:
# !pip install pandas numpy scikit-learn seaborn matplotlib plotly xgboost imbalanced-learn fancyimpute scipy pycaret

import pandas as pd
import numpy as np
 
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
#
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
#
import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
# clf = SGDClassifier(loss='log', max_iter=1000,learning_rate='constant', eta0=0.01, random_state=0)
# clf.fit(X_train, y_train)
from sklearn.impute import KNNImputer
!pip install fancyimpute
from fancyimpute import IterativeImputer,KNN
#
import scipy.stats as stats
#
from sklearn import metrics
#
from imblearn.over_sampling import SMOTE
#
from sklearn.metrics import classification_report , accuracy_score , recall_score
#
from pycaret.classification import *
#
import warnings
warnings.filterwarnings('ignore')

[0m^C


ModuleNotFoundError: No module named 'fancyimpute'

# Read Train Data

In [None]:
train_data=pd.read_csv("train.csv",sep=",",encoding="utf-8")
train_data.sample(10)

###### ID: Represents a unique identification of an entry
###### Customer_ID: Represents a unique identification of a person
###### Month: Represents the month of the year
###### Name: Represents the name of a person
###### Age: Represents the age of the person
###### SSN: Represents the social security number of a person
###### Occupation: Represents the occupation of the person
###### Annual_Income: Represents the annual income of the person 
###### Monthly_Inhand_Salary: Represents the monthly base salary of a person
###### Num_Bank_Accounts: Represents the number of bank accounts a person holds
###### Num_Credit_Card: Represents the number of other credit cards held by a person
###### Interest_Rate: Represents the interest rate on credit card
###### Num_of_Loan: Represents the number of loans taken from the bank
###### Type_of_Loan: Represents the types of loan taken by a person
###### Delay_from_due_date: Represents the average number of days delayed from the payment date
###### Num_of_Delayed_Payment: Represents the average number of payments delayed by a person
###### Changed_Credit_Limit: Represents the percentage change in credit card limit
###### Num_Credit_Inquiries: Represents the number of credit card inquiries
###### Credit_Mix: Represents the classification of the mix of credits
###### Outstanding_Debt: Represents the remaining debt to be paid 
###### Credit_Utilization_Ratio: Represents the utilization ratio of credit card
###### Credit_History_Age: Represents the age of credit history of the person
###### Payment_of_Min_Amount: Represents whether only the minimum amount was paid by the person
###### Total_EMI_per_month: Represents the monthly EMI payments  
###### Amount_invested_monthly: Represents the monthly amount invested by the customer 
###### Payment_Behaviour: Represents the payment behavior of the customer 
###### Monthly_Balance:Represents the monthly balance amount of the customer 
###### Credit_Score:Represents the bracket of credit score (Poor, Standard, Good) ((Target))

In [None]:
train_data.info()

In [None]:
train_data.describe(include='object').T

In [None]:
train_data.duplicated().sum()

In [None]:
train_data.columns

In [None]:
#Checking the values in the data whether they are valid or not
# Print the distinct values of every text (object-dtype) column so that invalid
# entries (placeholders, stray symbols, ...) can be spotted by eye.
# select_dtypes() is used instead of `dtype == type(object)`: `type(object)` is
# the metaclass `type`, not `object`, so that comparison only matched through
# NumPy's lenient dtype coercion.
for i in train_data.select_dtypes(include='object').columns:
    print(i,end=': ')
    print('\n',train_data[i].unique())
    print()

#### Some columns that look numerical are detected as object (categorical) because they contain invalid characters


# Preprocessing and Handling Invalid Data
We handle invalid data by replacing it with NaN and by removing invalid letters and symbols.


In [None]:
train_data.isnull().sum()

In [None]:
train_data=train_data.drop(columns=["ID","Name","SSN"],axis=1)

In [None]:
train_data["Customer_ID"].duplicated().sum()

## Credit_Mix

In [None]:
train_data["Credit_Mix"].value_counts()

In [None]:
train_data['Credit_Mix']=train_data['Credit_Mix'].replace('_',np.nan)

## Changed_Credit_Limit

In [None]:
train_data["Changed_Credit_Limit"].value_counts()

In [None]:
train_data["Changed_Credit_Limit"].describe()

In [None]:
train_data['Changed_Credit_Limit']=train_data['Changed_Credit_Limit'].replace('_',np.nan)

In [None]:
train_data['Changed_Credit_Limit']=train_data['Changed_Credit_Limit'].astype("float")

In [None]:
train_data["Changed_Credit_Limit"].describe()

In [None]:
train_data['Changed_Credit_Limit']=train_data['Changed_Credit_Limit'].fillna(train_data["Changed_Credit_Limit"].mean())

In [None]:
train_data['Changed_Credit_Limit']=train_data['Changed_Credit_Limit'].round(3)

## Monthly_Balance

In [None]:
train_data["Monthly_Balance"].value_counts()

In [None]:
train_data['Monthly_Balance']=train_data['Monthly_Balance'].replace('__-333333333333333333333333333__',np.nan)
train_data['Monthly_Balance']=train_data['Monthly_Balance'].astype("float")

In [None]:
train_data['Monthly_Balance']=train_data['Monthly_Balance'].fillna(train_data["Monthly_Balance"].mean())
train_data['Monthly_Balance']=train_data['Monthly_Balance'].round(3)

## Payment_Behaviour

In [None]:
train_data["Payment_Behaviour"].value_counts()

In [None]:
train_data["Payment_Behaviour"]=train_data["Payment_Behaviour"].replace("!@9#%8",np.nan)

In [None]:
# Encode the six Payment_Behaviour labels as integers 1-6 in a single pass.
payment_behaviour_codes = {
    "Low_spent_Small_value_payments": 1,
    "Low_spent_Medium_value_payments": 2,
    "Low_spent_Large_value_payments": 3,
    "High_spent_Small_value_payments": 4,
    "High_spent_Medium_value_payments": 5,
    "High_spent_Large_value_payments": 6,
}
train_data["Payment_Behaviour"] = train_data["Payment_Behaviour"].replace(payment_behaviour_codes)

In [None]:
# Fill missing Payment_Behaviour codes with the value of the previous row.
# The result is assigned back instead of calling ffill(inplace=True) on the
# selected column: an in-place call on `train_data[...]` operates on a temporary
# object and is not guaranteed to update train_data (it never does under pandas
# Copy-on-Write). bfill() covers a missing value in the very first row, which
# ffill cannot fill and which would break the integer cast that follows.
train_data["Payment_Behaviour"]=train_data["Payment_Behaviour"].ffill().bfill()

In [None]:
train_data["Payment_Behaviour"]=train_data["Payment_Behaviour"].astype("int64")

## Age

In [None]:
train_data["Age"].value_counts()

In [None]:
def filter_col(value):
    """Strip the stray '-' / '_' decorations from a raw numeric string.

    Parameters
    ----------
    value : str or any
        Raw cell value. Non-string values (e.g. a float NaN for a missing cell)
        are returned unchanged instead of raising TypeError.

    Returns
    -------
    str or any
        * if the text contains '-', the part after the first '-' ("-500" -> "500");
        * then, if the text contains '_', the part before the first '_'
          ("28_" -> "28", and "-500_" -> "500" rather than "500_");
        * otherwise the value itself.
    """
    if not isinstance(value, str):
        # Missing cells arrive as float NaN; `'-' in value` would raise on them.
        return value
    if '-' in value:
        value = value.split('-')[1]
    if '_' in value:
        # Also applied after the '-' step so values carrying both markers are
        # fully cleaned and can be cast to a number.
        return value.split('_')[0]
    return value

In [None]:
train_data["Age"] = train_data["Age"].apply(filter_col)
train_data["Age"] = train_data["Age"].astype(int)

In [None]:
# Ages outside the 10-90 range (inclusive) are treated as invalid and set to
# NaN so they can be imputed later.
# A single vectorised where() replaces the per-row `train_data["Age"][i] = ...`
# loop: that chained assignment writes through an intermediate Series, which
# pandas does not guarantee to propagate to train_data, and it is very slow.
valid_age = train_data["Age"].between(10, 90)
train_data["Age"] = train_data["Age"].where(valid_age)

## Annual_Income

In [None]:
train_data["Annual_Income"] = train_data["Annual_Income"].apply(filter_col)
train_data["Annual_Income"] = train_data["Annual_Income"].astype(float)

## Num_of_Loan

In [None]:
train_data["Num_of_Loan"]

In [None]:
train_data["Num_of_Loan"] = train_data["Num_of_Loan"].apply(filter_col)

In [None]:
train_data["Num_of_Loan"]=train_data["Num_of_Loan"].astype(int)

In [None]:
train_data[train_data["Num_of_Loan"]>100]

In [None]:
# Loan counts above 100 are treated as invalid and set to NaN for later
# imputation; values up to and including 100 are kept unchanged.
# Vectorised where() replaces the per-row chained assignment
# (`train_data["Num_of_Loan"][i] = ...`), which is not guaranteed to write back
# to train_data and is slow.
# NOTE(review): filter_col() turns "-100" into "100", which passes this
# threshold - confirm whether such values should also be treated as invalid.
train_data["Num_of_Loan"] = train_data["Num_of_Loan"].where(train_data["Num_of_Loan"] <= 100)

## Outstanding_Debt

In [None]:
train_data["Outstanding_Debt"] = train_data["Outstanding_Debt"].apply(filter_col)
train_data["Outstanding_Debt"] = train_data["Outstanding_Debt"].astype(float)

## Occupation

In [None]:
train_data["Occupation"].value_counts()

In [None]:
train_data["Occupation"]=train_data["Occupation"].replace("_______",np.nan)

In [None]:
train_data["Occupation"]=train_data["Occupation"].astype("object")

## Num_of_Delayed_Payment

In [None]:
train_data["Num_of_Delayed_Payment"].value_counts()

In [None]:
def filter_(value:str):
    """Return the text before the first '_' of `value`.

    Values whose string form contains no underscore (including NaN and other
    non-string values) are returned unchanged.
    """
    if '_' not in str(value):
        return value
    prefix, _sep, _rest = value.partition('_')
    return prefix
train_data["Num_of_Delayed_Payment"]= train_data["Num_of_Delayed_Payment"].apply(filter_)

In [None]:
train_data["Num_of_Delayed_Payment"]= train_data["Num_of_Delayed_Payment"].astype("Int64")

## Payment_of_Min_Amount

In [None]:
train_data["Payment_of_Min_Amount"].value_counts()

## Splitting Credit_History_Age into years and months

In [None]:
train_data["Credit_History_Age"].value_counts()

In [None]:
#train data
# Split the text column "Credit_History_Age" into two numeric columns
# (Credit_Age_years, Credit_Age_months) and drop the original column.
# The parsing expects the first whitespace-separated token to be the year count
# and the token right after "and" to be the month count.
years  = []
months = []
for value in train_data["Credit_History_Age"]:
    # pd.isna() recognises every missing-value marker (None, float NaN, pd.NA);
    # the previous `value is np.nan` identity test only matched the np.nan
    # singleton and would fall through to .lower() for any other NaN object.
    if pd.isna(value):
        years.append(np.nan)
        months.append(np.nan)
    else:
        new_str = value.lower().split()
        years_ = int(new_str[0])
        months_ = int(new_str[new_str.index('and')+1])
        years.append(years_)
        months.append(months_)
# Build the Series on train_data's own index so each value stays attached to its
# row even if the frame no longer has a default 0..n-1 index.
train_data['Credit_Age_years'] = pd.Series(years, index=train_data.index)
train_data['Credit_Age_months'] = pd.Series(months, index=train_data.index)
train_data.drop('Credit_History_Age',axis=1,inplace=True)

## Customer_ID

In [None]:
train_data["Customer_ID"].value_counts()

## Month

In [None]:
train_data["Month"].value_counts()

In [None]:
train_month={"January":1,"February":2,"March":3,"April":4,"May":5,"June":6,"July":7,"August":8}
train_data["Month"]=train_data["Month"].replace(train_month)

## Monthly_Inhand_Salary

In [None]:
train_data["Monthly_Inhand_Salary"].value_counts()

## Num_Bank_Accounts

In [None]:
train_data["Num_Bank_Accounts"].value_counts()

In [None]:
train_data[train_data["Num_Bank_Accounts"]<0]["Num_Bank_Accounts"].value_counts()

In [None]:
train_data["Num_Bank_Accounts"]=train_data["Num_Bank_Accounts"].replace(-1,0)

In [None]:
train_data[train_data["Num_Bank_Accounts"]> 300]["Num_Bank_Accounts"].value_counts()

In [None]:
# Account counts above 300 are treated as invalid and set to NaN for later
# imputation; all other values are kept unchanged.
# Vectorised where() replaces the per-row chained assignment
# (`train_data["Num_Bank_Accounts"][i] = ...`), which is not guaranteed to write
# back to train_data and is slow.
train_data["Num_Bank_Accounts"] = train_data["Num_Bank_Accounts"].where(train_data["Num_Bank_Accounts"] <= 300)

In [None]:
train_data["Num_Bank_Accounts"]=train_data["Num_Bank_Accounts"].round().astype('Int64')


## Num_Credit_Card

In [None]:
train_data["Num_Credit_Card"].value_counts()

In [None]:
train_data[train_data["Num_Credit_Card"]<0]["Num_Credit_Card"].sum()

## Interest_Rate

In [None]:
train_data["Interest_Rate"].value_counts()

## Type_of_Loan

In [None]:
train_data["Type_of_Loan"]=train_data["Type_of_Loan"].astype("object")

In [None]:
train_data.isnull().sum()

In [None]:
# Missing Type_of_Loan entries take the value of the closest preceding row.
train_data["Type_of_Loan"] = train_data["Type_of_Loan"].ffill()

In [None]:
train_data.isnull().sum()

## Delay_from_due_date

In [None]:
train_data["Delay_from_due_date"].value_counts()

In [None]:
(train_data["Delay_from_due_date"]<0).sum()

In [None]:
# Negative delays are replaced by 0; non-negative values are kept unchanged.
# clip() does this in one vectorised step and replaces the per-row chained
# assignment (`train_data["Delay_from_due_date"][i] = ...`), which is not
# guaranteed to write back to train_data and is slow.
train_data["Delay_from_due_date"] = train_data["Delay_from_due_date"].clip(lower=0)

## Num_Credit_Inquiries

In [None]:
train_data["Num_Credit_Inquiries"].value_counts()

In [None]:
(train_data["Num_Credit_Inquiries"]<0).sum()

## Credit_Utilization_Ratio

In [None]:
train_data["Credit_Utilization_Ratio"].value_counts()

In [None]:
train_data["Credit_Utilization_Ratio"]=train_data["Credit_Utilization_Ratio"].round(3)


## Total_EMI_per_month

In [None]:
train_data["Total_EMI_per_month"].value_counts()

In [None]:
train_data["Total_EMI_per_month"]=train_data["Total_EMI_per_month"].astype("float")


In [None]:
train_data["Total_EMI_per_month"]=train_data["Total_EMI_per_month"].round(3)


## Amount_invested_monthly 

In [None]:
train_data["Amount_invested_monthly"].value_counts()

In [None]:
train_data["Amount_invested_monthly"]=train_data["Amount_invested_monthly"].replace("__10000__",np.nan)
train_data["Amount_invested_monthly"]=train_data["Amount_invested_monthly"].astype("float")
train_data["Amount_invested_monthly"]=train_data["Amount_invested_monthly"].round(3)

# Fill The Missing Values

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #


In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(train_data.isnull())

### Random Choice

In [None]:
# for column in train_data.columns:
#     missing_indices = train_data[train_data[column].isnull()].index
#     available_values = train_data[column].dropna()
    
#     for index in missing_indices:
#         random_choice = np.random.choice(available_values)
#         train_data.at[index, column] = random_choice

In [None]:
# pd.DataFrame(train_data)

In [None]:
# plt.figure(figsize=(12,8))
# sns.heatmap(train_data.isnull())

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

## 1-Fill Numerical Data

### KNN Imputer

In [None]:
# Partition the column names by dtype: every column whose dtype is not 'object'
# goes to numerical_data, the remaining ones to object_data.
column_dtypes = train_data.dtypes
numerical_data = [name for name in train_data.columns if column_dtypes[name] != 'object']
object_data = [name for name in train_data.columns if name not in numerical_data]

In [None]:
imputer = KNNImputer(n_neighbors=5)

In [None]:
train_data[numerical_data] = imputer.fit_transform(train_data[numerical_data])

In [None]:
train_data

In [None]:
train_data.isnull().sum()

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(train_data.isnull())

## 2-Fill Categorical Data

### Occupation

In [None]:
#train_data["Monthly_Inhand_Salary"].value_counts().head(20)                            #to know why i use ffill
#train_data.loc[train_data["Monthly_Inhand_Salary"] ==1359.206250, "Occupation"]        #to know why i use ffill
# Assign the forward-filled column back: ffill(inplace=True) on the selected
# column works on a temporary object and is not guaranteed to update train_data
# (it never does under pandas Copy-on-Write).
train_data['Occupation'] = train_data['Occupation'].ffill()
#train_data["Monthly_Inhand_Salary"].value_counts().head(10)
#train_data.loc[train_data["Monthly_Inhand_Salary"] ==1359.206250, "Occupation"] 

In [None]:
train_data["Monthly_Inhand_Salary"]=train_data["Monthly_Inhand_Salary"].round(3)

### Credit_Mix

In [None]:
train_data.info()

In [None]:
# Encode the Credit_Mix labels as integers (Standard=1, Bad=2, Good=3).
credit_mix_codes = {"Standard": 1, "Bad": 2, "Good": 3}
train_data['Credit_Mix'] = train_data['Credit_Mix'].replace(credit_mix_codes)

In [None]:
# Impute the missing Credit_Mix codes.
# Note: this rebinds the name `imputer`, which held the KNNImputer used for the
# numerical columns above.
imputer = IterativeImputer(max_iter=10, random_state=0)

# Only the Credit_Mix column itself is handed to the imputer, as an (n_rows, 1) array.
# NOTE(review): with a single feature there are no other columns to model the
# missing values from; scikit-learn's IterativeImputer is documented to return
# its initial (mean) imputation in that case, so missing rows would receive the
# mean of the 1/2/3 codes rather than a valid category - confirm this is intended.
credit_mix_data = train_data['Credit_Mix'].values.reshape(-1, 1)

# fit_transform returns a 2-D (n_rows, 1) array that is stored as the column.
train_data['Credit_Mix'] = imputer.fit_transform(credit_mix_data)

In [None]:
train_data.isnull().sum()

# EDA

Plotting the target

In [None]:
plt.figure(figsize=(10, 6))
plt.hist(train_data['Credit_Score'], bins=20, color='blue', alpha=0.7)
plt.title('Credit score distribution')
plt.xlabel('Credit score')
plt.ylabel('Number of customers')
plt.show()

This shows that the classes are unevenly distributed, i.e. the data set is imbalanced.

In [None]:
train_data.columns

In [None]:
plt.figure(figsize=(10, 6))
sns.scatterplot(x=train_data['Annual_Income'], y=train_data['Credit_Score'], hue=train_data['Credit_Score'], palette='viridis', alpha=0.5)
plt.title('Relationship of credit scores and annual income')
plt.xlabel('Annual income')
plt.ylabel('Credit score')
plt.show()

In [None]:
# Number of rows per occupation.
# The column is passed by keyword (x=..., data=...): in current seaborn the
# first positional argument of countplot is `data`, so a positional Series is
# no longer interpreted as the x variable.
plt.figure(figsize=(12,4))
sns.countplot(x='Occupation', data=train_data);
plt.xticks(rotation=45);

In [None]:
plt.figure(figsize=(10, 6))
sns.scatterplot(x=train_data['Age'], y=train_data['Credit_Score'], hue=train_data['Occupation'], palette='viridis', alpha=0.5)
plt.title('Correlation of credit scores and age with a dispersion based on occupation')
plt.xlabel('Age')
plt.ylabel('Credit scores')
plt.show()

In [None]:
# Map the integer Payment_Behaviour codes 1-6 back to their text labels so the
# plots below show readable category names.
payment_behaviour_labels = {
    1: "Low_spent_Small_value_payments",
    2: "Low_spent_Medium_value_payments",
    3: "Low_spent_Large_value_payments",
    4: "High_spent_Small_value_payments",
    5: "High_spent_Medium_value_payments",
    6: "High_spent_Large_value_payments",
}
train_data["Payment_Behaviour"] = train_data["Payment_Behaviour"].replace(payment_behaviour_labels)

In [None]:
p_counts = train_data['Payment_Behaviour'].value_counts()
plt.figure(figsize=(8, 8))
plt.pie(p_counts, labels=p_counts.index, autopct='%1.1f%%', startangle=140)
plt.title('Distribution of Payment Behaviour types')
plt.show()

In [None]:
# Scatter of payment delay (x axis) against credit score (y axis).
plt.figure(figsize=(10, 6))
sns.scatterplot(data=train_data, y='Credit_Score', x='Delay_from_due_date', hue='Credit_Score', palette='coolwarm', alpha=0.7)
plt.title('Correlation of credit scores and the number of Delay from due date')
# Axis labels now match the plotted variables (they were swapped before).
plt.xlabel('Delay from due date')
plt.ylabel('Credit score')
plt.show()

In [None]:
# Number of rows per Payment_of_Min_Amount value.
# Passed by keyword because the first positional argument of countplot is
# `data` in current seaborn, not the x variable.
sns.countplot(x='Payment_of_Min_Amount', data=train_data, palette="mako");
plt.xticks(rotation=45);

In [None]:
# Five customers with a "Good" credit score and the highest annual income.
good_credit =train_data[train_data['Credit_Score']=="Good"]
# A customer can appear in several rows (Customer_ID repeats across rows), so
# keep only each customer's highest-income row before taking the top five;
# otherwise the "top 5 people" could be the same person repeated.
top5 = (
    good_credit
    .sort_values('Annual_Income', ascending = False)
    .drop_duplicates(subset='Customer_ID')
    .head(5)
)

In [None]:
plt.title('Top 5 most credit worthy people', fontsize = 20)
plt.ylabel('Annual Income')
plt.bar(top5["Customer_ID"], top5.Annual_Income, tick_label = top5.Customer_ID, color = 'orange', width = 0.4);

###### Check correlations between features

In [None]:
# Pairwise correlation of the numeric features.
# Only numeric columns are correlated: train_data still holds text columns at
# this point and DataFrame.corr() raises on them in pandas >= 2.0.
plt.figure(figsize=(15,15))
sns.heatmap(train_data.select_dtypes(include='number').corr(),annot=True)
# plt.show must be called; the bare attribute `plt.show` does nothing.
plt.show()

In [None]:
# Distribution of the monthly in-hand salary (density histogram with KDE curve).
# histplot replaces distplot, which is deprecated and scheduled for removal
# from seaborn.
sns.histplot(data=train_data, x='Monthly_Inhand_Salary', kde=True, stat='density');

In [None]:
# One salary distribution panel per Credit_Score class.
# histplot replaces the deprecated distplot as the mapped plotting function.
grid = sns.FacetGrid(train_data, col = 'Credit_Score')
grid.map(sns.histplot, 'Monthly_Inhand_Salary', kde=True, stat='density')

In [None]:
# Distribution of the payment delay (density histogram with KDE curve).
# histplot replaces distplot, which is deprecated and scheduled for removal
# from seaborn.
sns.histplot(data=train_data, x='Delay_from_due_date', kde=True, stat='density');

# Encoding

In [None]:
train_data.info()

In [None]:
# Replace the values of EVERY column (numeric ones included) by integer category
# codes: each column is cast to 'category' and its values are replaced by the
# position of the value among the column's categories.
# NOTE(review): this also turns continuous features such as Annual_Income into
# rank-like codes, and the test frame is encoded by a separate, independent call
# further down, so the same raw value may get a different code in train and
# test - confirm this is intended.
train_data = train_data.apply(lambda x: x.astype('category').cat.codes)
train_data.head()

# Showing Outliers 

In [None]:
train_data.describe()

In [None]:
# One horizontal box plot per numerical feature, grouped by Credit_Score.
for feature in numerical_data:
    plt.figure(figsize=(10,1),dpi=80)
    sns.boxplot(x = train_data[feature], y=train_data['Credit_Score'], data = train_data,orient="h")
    plt.show()

In [None]:
train_data.describe()

# Feature Selection

In [None]:
# Chi-square test of independence between each column of train_data and the
# target Credit_Score (the loop covers every column, the target included).
for col in train_data.columns:
    # Contingency table: values of the column against the target classes.
    contingency_table = pd.crosstab(train_data[col],train_data['Credit_Score'])
    chi2, p, dof, expected = stats.chi2_contingency(contingency_table)
    print(
        f"Chi-square test results for {col}:",
        f"Chi-square statistic: {chi2}",
        f"P-value: {p}",
        f"Degrees of freedom: {dof}",
        f"Expected frequencies table:\n{expected}\n",
        sep="\n",
    )

# Splitting Data

In [None]:
x= train_data.drop(['Credit_Score','Monthly_Balance','Amount_invested_monthly','Credit_Utilization_Ratio','Occupation','Month'],axis=1)
y=train_data['Credit_Score']
x_train , x_test , y_train , y_test =train_test_split(x,y,test_size=0.2,random_state=42)

# Balanced Data

In [None]:
# Share of each Credit_Score class before resampling.
score_counts = train_data["Credit_Score"].value_counts()
labels = score_counts.index
sizes = score_counts

plt.figure(figsize = (10,10))
plt.pie(sizes, labels=labels, autopct='%1.1f%%')
plt.title('Credit_Score Percentage',color = 'black',fontsize = 30)
plt.legend(score_counts)
plt.show()

In [None]:
# Oversample the minority classes of the training split with SMOTE.
# random_state is fixed so the synthetic samples - and every result derived from
# them - are reproducible on a fresh "Restart & Run All".
sm = SMOTE(k_neighbors=7, random_state=42)
x_train_sm,y_train_sm=sm.fit_resample(x_train,y_train)

In [None]:
data_after_smote = pd.DataFrame(x_train_sm, columns=x_train.columns)
data_after_smote['target'] = y_train_sm 

In [None]:
data_after_smote['target']

In [None]:
# Share of each target class after SMOTE resampling.
target_counts = data_after_smote['target'].value_counts()
labels = target_counts.index
sizes = target_counts

plt.figure(figsize = (10,10))
plt.pie(sizes, labels=labels, autopct='%1.1f%%')
plt.title('Credit_Score Percentage',color = 'black',fontsize = 30)
plt.legend(target_counts)
plt.show()

In [None]:
data_after_smote

In [None]:
plt.figure(figsize=(10, 6))
plt.hist(data_after_smote['target'], bins=20, color='blue', alpha=0.7)
plt.title('Credit score distribution')
plt.xlabel('Credit score')
plt.ylabel('Number of customers')
plt.show()

# Splitting Data After SMOTE

In [None]:
X= data_after_smote.drop(['target'],axis=1)
Y=data_after_smote['target']

# Scaling

In [None]:
sc = StandardScaler()
X= sc.fit_transform(X)

#### Preparing the test data

In [None]:
# Load the test file, drop the identifier columns and encode the six
# Payment_Behaviour labels as integers 1-6.
test_data = pd.read_csv("test.csv", sep=",", encoding="utf-8")
test_data = test_data.drop(columns=["ID","Name","SSN"])
test_payment_behaviour_codes = {
    "Low_spent_Small_value_payments": 1,
    "Low_spent_Medium_value_payments": 2,
    "Low_spent_Large_value_payments": 3,
    "High_spent_Small_value_payments": 4,
    "High_spent_Medium_value_payments": 5,
    "High_spent_Large_value_payments": 6,
}
test_data["Payment_Behaviour"] = test_data["Payment_Behaviour"].replace(test_payment_behaviour_codes)


In [None]:
#test data
# Split the text column "Credit_History_Age" into two numeric columns
# (Credit_Age_years, Credit_Age_months) and drop the original column.
# The parsing expects the first whitespace-separated token to be the year count
# and the token right after "and" to be the month count.
years  = []
months = []
for value in test_data["Credit_History_Age"]:
    # pd.isna() recognises every missing-value marker (None, float NaN, pd.NA);
    # the previous `value is np.nan` identity test only matched the np.nan
    # singleton and would fall through to .lower() for any other NaN object.
    if pd.isna(value):
        years.append(np.nan)
        months.append(np.nan)
    else:
        new_str = value.lower().split()
        years_ = int(new_str[0])
        months_ = int(new_str[new_str.index('and')+1])
        years.append(years_)
        months.append(months_)
# Build the Series on test_data's own index so each value stays attached to its
# row even if the frame no longer has a default 0..n-1 index.
test_data['Credit_Age_years'] = pd.Series(years, index=test_data.index)
test_data['Credit_Age_months'] = pd.Series(months, index=test_data.index)
test_data.drop('Credit_History_Age',axis=1,inplace=True)

In [None]:
test_data["Total_EMI_per_month"]=test_data["Total_EMI_per_month"].round(3)
test_data["Total_EMI_per_month"]=test_data["Total_EMI_per_month"].astype("float")
test_data["Credit_Utilization_Ratio"]=test_data["Credit_Utilization_Ratio"].round(3)
test_data["Type_of_Loan"]=test_data["Type_of_Loan"].astype("object")
test_data["Num_Bank_Accounts"]=test_data["Num_Bank_Accounts"].round().astype('Int64')
test_data["Monthly_Inhand_Salary"]=test_data["Monthly_Inhand_Salary"].round(3)

In [None]:
# Encode the Credit_Mix labels as integers (Standard=1, Bad=2, Good=3).
test_credit_mix_codes = {"Standard": 1, "Bad": 2, "Good": 3}
test_data['Credit_Mix'] = test_data['Credit_Mix'].replace(test_credit_mix_codes)

In [None]:
test_month={"September":9,"October":10,"November":11,"December":12}
test_data["Month"]=test_data["Month"].replace(test_month)

In [None]:
# columns=train_data.drop(['Credit_Score','Monthly_Balance','Amount_invested_monthly','Credit_Utilization_Ratio','Occupation','Month'],axis=1).columns

In [None]:
test_data=test_data.drop(['Monthly_Balance','Amount_invested_monthly','Credit_Utilization_Ratio','Occupation','Month'], axis=1)

In [None]:
test_data.info()

In [None]:
# Replace the values of every test column by integer category codes (each column
# is cast to 'category' and its values replaced by their category position).
# NOTE(review): the categories are derived from test_data alone, so the codes
# are not guaranteed to match the ones produced for train_data by the equivalent
# call in the Encoding section - confirm before trusting predictions on this frame.
test_data = test_data.apply(lambda x: x.astype('category').cat.codes)
test_data.head()

# Split Data

In [None]:
# Hold out 30% of the resampled, scaled data for evaluation and train on the
# remaining 70%. The previous `train_size=0.3` did the opposite (fit on 30%,
# evaluate on 70%).
# NOTE(review): this rebinds x_train/x_test/y_train/y_test from the earlier
# "Splitting Data" cell, so that first hold-out set is no longer reachable under
# these names - confirm this is intended.
x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.3,random_state=42)

In [None]:
pd.DataFrame(x_train_sm)

In [None]:
y_test

# Modeling

### RF

In [None]:
RF = RandomForestClassifier(n_estimators=100, random_state=42)  
RF.fit(x_train, y_train)

In [None]:
RF.score(x_train,y_train)

In [None]:
RF.score(x_test,y_test)

In [None]:
Y_pred=RF.predict(x_test)
Y_pred

In [None]:
df=pd.DataFrame({"y_predect":Y_pred,"y_test":y_test})
df

In [None]:
report = classification_report(y_test, Y_pred)
print(report)

#### Use the test data for prediction

In [None]:
# Predict the credit score for the external test file with the Random Forest.
# RF was fitted on features standardised by `sc`, so the test features must go
# through the same fitted scaler before prediction.
pred=RF.predict(sc.transform(test_data))
pred

### Xgboost

In [None]:
# Train an XGBoost multi-class classifier on the training split and report its
# accuracy on the held-out split.
model = xgb.XGBClassifier(objective='multi:softmax', num_class=3, random_state=42)

model.fit(x_train, y_train)

y_pred = model.predict(x_test)

# Score XGBoost's own predictions (y_pred). The previous code passed Y_pred,
# which holds the Random Forest predictions from the section above.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

In [None]:
# Macro-averaged recall of the XGBoost model on the held-out split.
# Uses y_pred (XGBoost predictions) rather than Y_pred, which holds the
# Random Forest predictions.
recall = recall_score(y_test, y_pred, average='macro')
print(f"Recall: {recall:.2f}")

In [None]:
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Predict the credit score for the external test file with XGBoost.
# The model was fitted on features standardised by `sc`, so the test features
# must go through the same fitted scaler before prediction.
Y_pred=model.predict(sc.transform(test_data))
Y_pred

In [None]:
# Percentage of predictions in Y_pred whose value is greater than 0.49.
n_above_threshold = np.count_nonzero(Y_pred > 0.49)
(n_above_threshold / Y_pred.shape[0]) * 100

## PyCaret

In [None]:
s = setup(x_train_sm, target = y_train_sm,session_id = 123)

In [None]:
best = compare_models()

In [None]:
plot_model(best, plot = 'confusion_matrix')

In [None]:
# predict on test set
p = predict_model(best)

In [None]:
p.head()

In [None]:
test_data.head()