# Importing Libraries 

In [1]:
# Data Manupulation Libraries
import numpy as np
import pandas as pd

# Data Visualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
pio.renderers.default = 'iframe'
pio.templates.default="simple_white"

# Statistical Libraries
from scipy.stats import chi2_contingency
from scipy.stats import mannwhitneyu
from scipy.stats import ttest_ind

#
from sklearn.model_selection import train_test_split

# Preprocessing libraries
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,StandardScaler,LabelEncoder, MinMaxScaler
from sklearn.feature_selection import GenericUnivariateSelect
from sklearn.pipeline import Pipeline,make_pipeline,FeatureUnion
from sklearn.compose import ColumnTransformer

# model evalution matrics
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report,roc_curve,f1_score,recall_score,precision_score
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import loguniform,uniform

#models
from sklearn.linear_model import LogisticRegressionCV,LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB,GaussianNB

#imbalanced data libraries
from imblearn.ensemble import BalancedBaggingClassifier,BalancedRandomForestClassifier,EasyEnsembleClassifier,RUSBoostClassifier


#visualising data pipline
from sklearn import set_config

set_config(display='diagram')

# Importing Dataset

In [2]:
df=pd.read_csv("BankChurners_edited.csv")

In [3]:
df

Unnamed: 0,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,Existing Customer,45,M,3,High School,Married,$60K - $80K,Blue,39,5,1,3,12691.0,777,11914.0,1.335,1144,42,1.625,0.061
1,Existing Customer,49,F,5,Graduate,Single,Less than $40K,Blue,44,6,1,2,8256.0,864,7392.0,1.541,1291,33,3.714,0.105
2,Existing Customer,51,M,3,Graduate,Married,$80K - $120K,Blue,36,4,1,0,3418.0,0,3418.0,2.594,1887,20,2.333,0.000
3,Existing Customer,40,F,4,High School,Unknown,Less than $40K,Blue,34,3,4,1,3313.0,2517,796.0,1.405,1171,20,2.333,0.760
4,Existing Customer,40,M,3,Uneducated,Married,$60K - $80K,Blue,21,5,1,0,4716.0,0,4716.0,2.175,816,28,2.500,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10122,Existing Customer,50,M,2,Graduate,Single,$40K - $60K,Blue,40,3,2,3,4003.0,1851,2152.0,0.703,15476,117,0.857,0.462
10123,Attrited Customer,41,M,2,Unknown,Divorced,$40K - $60K,Blue,25,4,2,3,4277.0,2186,2091.0,0.804,8764,69,0.683,0.511
10124,Attrited Customer,44,F,1,High School,Married,Less than $40K,Blue,36,5,3,4,5409.0,0,5409.0,0.819,10291,60,0.818,0.000
10125,Attrited Customer,30,M,2,Graduate,Unknown,$40K - $60K,Blue,36,4,3,3,5281.0,0,5281.0,0.535,8395,62,0.722,0.000


# Exploratory Data Analysis

**dataframe shape**

In [13]:
df.shape

(10127, 20)

**dataset information**

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10127 entries, 0 to 10126
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Attrition_Flag            10127 non-null  object 
 1   Customer_Age              10127 non-null  int64  
 2   Gender                    10127 non-null  object 
 3   Dependent_count           10127 non-null  int64  
 4   Education_Level           10127 non-null  object 
 5   Marital_Status            10127 non-null  object 
 6   Income_Category           10127 non-null  object 
 7   Card_Category             10127 non-null  object 
 8   Months_on_book            10127 non-null  int64  
 9   Total_Relationship_Count  10127 non-null  int64  
 10  Months_Inactive_12_mon    10127 non-null  int64  
 11  Contacts_Count_12_mon     10127 non-null  int64  
 12  Credit_Limit              10127 non-null  float64
 13  Total_Revolving_Bal       10127 non-null  int64  
 14  Avg_Op

**Missing Values**

In [16]:
df.isnull().sum()

Attrition_Flag              0
Customer_Age                0
Gender                      0
Dependent_count             0
Education_Level             0
Marital_Status              0
Income_Category             0
Card_Category               0
Months_on_book              0
Total_Relationship_Count    0
Months_Inactive_12_mon      0
Contacts_Count_12_mon       0
Credit_Limit                0
Total_Revolving_Bal         0
Avg_Open_To_Buy             0
Total_Amt_Chng_Q4_Q1        0
Total_Trans_Amt             0
Total_Trans_Ct              0
Total_Ct_Chng_Q4_Q1         0
Avg_Utilization_Ratio       0
dtype: int64

## Univariate Analysis

### Attrition_Flag

In [7]:
temp_df=df.Attrition_Flag.value_counts().reset_index()
temp_df.columns=['Attrition_Flag','Count']
temp_df

Unnamed: 0,Attrition_Flag,Count
0,Existing Customer,8500
1,Attrited Customer,1627


In [12]:
fig=px.pie(temp_df,values="Count",names="Attrition_Flag",title="Attrition Flag")
fig.write_image("Attrition.png")
fig.show()

**Conclusion**

- The target variable has 83.9% Existing Customers lable and 16.1% Attrition Customers  which indicatews that the dataset is imbalanced.

### Customer_Age

In [18]:
df.Customer_Age.describe()

count    10127.000000
mean        46.325960
std          8.016814
min         26.000000
25%         41.000000
50%         46.000000
75%         52.000000
max         73.000000
Name: Customer_Age, dtype: float64

In [17]:
fig=px.histogram(df,x="Customer_Age",title="Customer Age Histogram")
fig.show(renderer='iframe')

In [19]:
df.Customer_Age.skew()

-0.033605016317173456

**Conclusion**
- Since mean and median is approximately equal and skewness is close to 0, we can say that customer age approximately follows normal distribution.

### Gender

In [20]:
temp_df=df.Gender.value_counts().reset_index()
temp_df.columns=["Gender","Count"]
temp_df

Unnamed: 0,Gender,Count
0,F,5358
1,M,4769


In [21]:
fig=px.pie(temp_df,values="Count",names="Gender",title="Pie Chart of Gender")
fig.write_image("Gender.png")
fig.show()

**Conclusions**
- In the dataset, we have 52.9% female customers and 47.1% male customers.

### Dependent_Count

In [22]:
temp_df=df.Dependent_count.value_counts().reset_index()
temp_df.columns=["Dependent_Count","Count"]
temp_df

Unnamed: 0,Dependent_Count,Count
0,3,2732
1,2,2655
2,1,1838
3,4,1574
4,0,904
5,5,424


In [23]:
fig=px.bar(temp_df,x=temp_df.Dependent_Count,y=temp_df.Count,text_auto=True, labels={"index":'No of Dependent Count',                                                                                      'Dependent_count':'Counts'})
fig.show(renderer='iframe')

In [25]:
px.pie(temp_df,values="Count",names='Dependent_Count',title="Pie Chart of Dependent_Count")

In [26]:
# discriptive Summary
df.Dependent_count.describe()

count    10127.000000
mean         2.346203
std          1.298908
min          0.000000
25%          1.000000
50%          2.000000
75%          3.000000
max          5.000000
Name: Dependent_count, dtype: float64

In [4]:
cols_to_drop=['Gender','Income_Category','Education_Level','Marital_Status','Card_Category','Customer_Age','Months_on_book','Avg_Open_To_Buy','Total_Amt_Chng_Q4_Q1']

In [5]:
df.drop(cols_to_drop,inplace=True,axis=1)

In [7]:
df.columns.shape

(11,)

In [41]:
X=df.drop("Attrition_Flag",axis=1)
y=df.Attrition_Flag

In [42]:
y.replace(["Existing Customer","Attrited Customer"],[0,1],inplace=True)

In [43]:
y

0        0
1        0
2        0
3        0
4        0
        ..
10122    0
10123    1
10124    1
10125    1
10126    1
Name: Attrition_Flag, Length: 10127, dtype: int64

# Buliding Pipeline for data preprocessing 

In [44]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42,stratify=y)

In [45]:
print("Shape of X_train: ",X_train.shape)
print("Shape of X_test:", X_test.shape)

Shape of X_train:  (7595, 10)
Shape of X_test: (2532, 10)


# Baseline Model 

In [46]:
recall_basic=list()
recall_under=list()
recall_over=list()
recall_hybrid=list()

In [47]:
f1_basic=list()
f1_under=list()
f1_over=list()
f1_hybrid=list()

In [48]:
precision_basic=list()
precision_under=list()
precision_over=list()
precision_hybrid=list()

In [49]:
accuracy_basic=list()
accuracy_under=list()
accuracy_over=list()
accuracy_hybrid=list()

### Naive Bayes

In [50]:
nb_pipe=make_pipeline(MinMaxScaler(),GaussianNB())

In [51]:
nb_pipe.fit(X_train,y_train)

In [52]:
print(classification_report(y_test,nb_pipe.predict(X_test),zero_division=0))

              precision    recall  f1-score   support

           0       0.92      0.93      0.93      2125
           1       0.63      0.59      0.61       407

    accuracy                           0.88      2532
   macro avg       0.78      0.76      0.77      2532
weighted avg       0.88      0.88      0.88      2532



In [53]:
recall_basic.append(recall_score(y_test,nb_pipe.predict(X_test)).round(2))
f1_basic.append(f1_score(y_test,nb_pipe.predict(X_test)).round(2))
precision_basic.append(precision_score(y_test,nb_pipe.predict(X_test)).round(2))
accuracy_basic.append(accuracy_score(y_test,nb_pipe.predict(X_test)).round(2))

### Logistic Regression

In [54]:
log_pipe=make_pipeline(StandardScaler(),LogisticRegression(random_state=42,max_iter=500))

In [55]:
log_pipe.fit(X_train,y_train)

In [56]:
print(classification_report(y_test,log_pipe.predict(X_test)))

              precision    recall  f1-score   support

           0       0.91      0.97      0.94      2125
           1       0.78      0.51      0.62       407

    accuracy                           0.90      2532
   macro avg       0.84      0.74      0.78      2532
weighted avg       0.89      0.90      0.89      2532



In [57]:
recall_basic.append(recall_score(y_test,log_pipe.predict(X_test)).round(2))
f1_basic.append(f1_score(y_test,log_pipe.predict(X_test)).round(2))
precision_basic.append(precision_score(y_test,log_pipe.predict(X_test)).round(2))
accuracy_basic.append(accuracy_score(y_test,log_pipe.predict(X_test)).round(2))

### Decision Tree

In [58]:
dt_pipe=make_pipeline(StandardScaler(),DecisionTreeClassifier())

In [59]:
dt_pipe.fit(X_train,y_train)

In [60]:
print(classification_report(y_test,dt_pipe.predict(X_test)))

              precision    recall  f1-score   support

           0       0.95      0.96      0.96      2125
           1       0.78      0.75      0.77       407

    accuracy                           0.93      2532
   macro avg       0.87      0.86      0.86      2532
weighted avg       0.93      0.93      0.93      2532



In [61]:
recall_basic.append(recall_score(y_test,dt_pipe.predict(X_test)).round(2))
f1_basic.append(f1_score(y_test,dt_pipe.predict(X_test)).round(2))
precision_basic.append(precision_score(y_test,dt_pipe.predict(X_test)).round(2))
accuracy_basic.append(accuracy_score(y_test,dt_pipe.predict(X_test)).round(2))

## Under Sampling

In [62]:
def undersampling(X,y,n=3):
    
    #resetting the index of both X and y
    X.reset_index(drop=True,inplace=True)
    y.reset_index(drop=True,inplace=True)
    
    # creating a dataframe of counts corresponding to each category
    temp_df=y.value_counts().reset_index()
    temp_df.columns=[y.name,"Count"]
    
    #finding the majority label, minority label, majority count minority count
    majority_label=temp_df[y.name][temp_df.Count.idxmax()]
    majority_count=temp_df["Count"][temp_df.Count.idxmax()]
    minority_label=temp_df[y.name][temp_df.Count.idxmin()]
    minority_count=temp_df["Count"][temp_df.Count.idxmax()]
    
    #creating subsets of dataset based on the majority label and minority label
    X_minor=X[y==minority_label].reset_index(drop=True)
    X_major=X[y==majority_label].reset_index(drop=True)
    
    #finding the mean of majority category
    mean=X_major.mean(axis=0).values
    
    #calculating distance of every minority class point from the mean of majority class
    distance=np.array([])
    for i in range(X_minor.shape[0]):
        d=np.sqrt(sum((X_minor.iloc[i].values-mean)**2))
        distance=np.append(distance,d)
    
    #finding the indices of minority points according to the distance (in ascending order)
    minority_indices=np.argsort(distance)
    
    #reordering the X_minor according to the minority index
    X_minor=X_minor.iloc[minority_indices]
    
    # Find the indices to drop from X_major based on the n closest majority points for each minority point
    for i in range(X_minor.shape[0]):
        distance=np.array([])
        for j in range(X_major.shape[0]):
            d=np.sqrt(sum((X_major.iloc[j].values-X_minor.iloc[i].values)**2))
            distance=np.append(distance,d)
        
        indices=np.argsort(distance)[0:n]
    
        # Drop the 4 points from X_major for this iteration
        X_major.drop(index=indices, axis=0, inplace=True)
        X_major.reset_index(drop=True,inplace=True)
    
    #Creating new pandas series of the size of X_major and X_minor 
    y_major=pd.Series(np.repeat(majority_label,X_major.shape[0]))
    y_minor=pd.Series(np.repeat(minority_label,X_minor.shape[0]))
    
    #concatinating X_major and X_minor additionally y_major and y_minor
    X_new=pd.concat([X_major,X_minor],axis=0).reset_index(drop=True)
    y_new=pd.concat([y_major,y_minor],axis=0).reset_index(drop=True)
    y_new.name=y.name
    
    #concatinating the X_new and y_new so that i can can suffle the row along with their corresponding labels
    X_y=pd.concat([X_new,y_new],axis=1)
    
    #suffeling the dataframe rows
    X_y=X_y.sample(frac=1.0, random_state=42).reset_index(drop=True)
    
    #splitting the X_y dataframe and feature matrix and target variable 
    columns_list=X_y.columns.to_list()
    X = X_y[columns_list[:-1]]
    y = X_y[columns_list[-1]]
    

    return X,y

In [63]:
X_train_under,y_train_under=undersampling(X_train,y_train)

In [64]:
print("Shape of X_train: ",X_train_under.shape)
print("Shape of X_test:", X_test.shape)

Shape of X_train:  (3935, 10)
Shape of X_test: (2532, 10)


### Naive Bayes

In [65]:
nb_pipe.fit(X_train_under,y_train_under)

In [66]:
print(classification_report(y_test,nb_pipe.predict(X_test)))

              precision    recall  f1-score   support

           0       0.95      0.72      0.82      2125
           1       0.35      0.79      0.49       407

    accuracy                           0.73      2532
   macro avg       0.65      0.76      0.65      2532
weighted avg       0.85      0.73      0.77      2532



In [67]:
recall_under.append(recall_score(y_test,nb_pipe.predict(X_test)).round(2))
f1_under.append(f1_score(y_test,nb_pipe.predict(X_test)).round(2))
precision_under.append(precision_score(y_test,nb_pipe.predict(X_test)).round(2))
accuracy_under.append(accuracy_score(y_test,nb_pipe.predict(X_test)).round(2))

### Logistic Regression

In [68]:
log_pipe.fit(X_train_under,y_train_under)

In [69]:
print(classification_report(y_test,log_pipe.predict(X_test)))

              precision    recall  f1-score   support

           0       0.96      0.80      0.87      2125
           1       0.45      0.83      0.58       407

    accuracy                           0.81      2532
   macro avg       0.70      0.81      0.73      2532
weighted avg       0.88      0.81      0.83      2532



In [70]:
recall_under.append(recall_score(y_test,log_pipe.predict(X_test)).round(2))
f1_under.append(f1_score(y_test,log_pipe.predict(X_test)).round(2))
precision_under.append(precision_score(y_test,log_pipe.predict(X_test)).round(2))
accuracy_under.append(accuracy_score(y_test,log_pipe.predict(X_test)).round(2))

### Decision Tree

In [71]:
dt_pipe.fit(X_train_under,y_train_under)

In [72]:
print(classification_report(y_test,dt_pipe.predict(X_test)))

              precision    recall  f1-score   support

           0       0.98      0.70      0.82      2125
           1       0.37      0.94      0.54       407

    accuracy                           0.74      2532
   macro avg       0.68      0.82      0.68      2532
weighted avg       0.89      0.74      0.77      2532



In [73]:
recall_under.append(recall_score(y_test,dt_pipe.predict(X_test)).round(2))
f1_under.append(f1_score(y_test,dt_pipe.predict(X_test)).round(2))
precision_under.append(precision_score(y_test,dt_pipe.predict(X_test)).round(2))
accuracy_under.append(accuracy_score(y_test,dt_pipe.predict(X_test)).round(2))

## Over Sampling

In [74]:
def oversampling(X,y,n=3):
    
    #resetting the index of both X and y
    X.reset_index(drop=True,inplace=True)
    y.reset_index(drop=True,inplace=True)

    # creating a dataframe of counts corresponding to each category
    temp_df=y.value_counts().reset_index()
    temp_df.columns=[y.name,"Count"]
    
    #finding the majority label, minority label, majority count minority count
    majority_label=temp_df[y.name][temp_df.Count.idxmax()]
    majority_count=temp_df["Count"][temp_df.Count.idxmax()]
    minority_label=temp_df[y.name][temp_df.Count.idxmin()]
    minority_count=temp_df["Count"][temp_df.Count.idxmax()]
    
    #creating subsets of dataset based on the majority label and minority label
    X_minor=X[y==minority_label].reset_index(drop=True)
    X_major=X[y==majority_label].reset_index(drop=True)
    
    # Oversampling
    X_oversample=pd.DataFrame(columns=X_minor.columns)
    
    while (X_minor.shape[0]+X_oversample.shape[0]<X_major.shape[0]):
        mean_row=X_minor.sample(n).mean()
        X_oversample = pd.concat([X_oversample, mean_row.to_frame().T], ignore_index=True)
        
    
    #Creating new pandas series of the size of X_major and X_minor 
    y_major=pd.Series(np.repeat(majority_label,X_major.shape[0]))
    y_minor=pd.Series(np.repeat(minority_label,(X_minor.shape[0]+X_oversample.shape[0])))
    
    #concatinating X_major and X_minor additionally y_major and y_minor
    X_new=pd.concat([X_major,X_minor,X_oversample],axis=0).reset_index(drop=True)
    y_new=pd.concat([y_major,y_minor],axis=0).reset_index(drop=True)
    y_new.name=y.name
    
    #concatinating the X_new and y_new so that i can can suffle the row along with their corresponding labels
    X_y=pd.concat([X_new,y_new],axis=1)
    
    #suffeling the dataframe rows
    X_y=X_y.sample(frac=1.0, random_state=42).reset_index(drop=True)
    
    #splitting the X_y dataframe and feature matrix and target variable 
    columns_list=X_y.columns.to_list()
    X = X_y[columns_list[:-1]]
    y = X_y[columns_list[-1]]
    

    return X,y
    

In [75]:
X_train_over,y_train_over=oversampling(X_train,y_train)

In [76]:
print("X_train shape",X_train.shape)
print("X_train over shape",X_train_over.shape)

X_train shape (7595, 10)
X_train over shape (12750, 10)


### 

### Naive Bayes

In [77]:
nb_pipe.fit(X_train_over,y_train_over)

In [78]:
print(classification_report(y_test,nb_pipe.predict(X_test)))

              precision    recall  f1-score   support

           0       0.91      0.87      0.89      2125
           1       0.46      0.58      0.51       407

    accuracy                           0.82      2532
   macro avg       0.69      0.72      0.70      2532
weighted avg       0.84      0.82      0.83      2532



In [79]:
recall_over.append(recall_score(y_test,nb_pipe.predict(X_test)).round(2))
f1_over.append(f1_score(y_test,nb_pipe.predict(X_test)).round(2))
precision_over.append(precision_score(y_test,nb_pipe.predict(X_test)).round(2))
accuracy_over.append(accuracy_score(y_test,nb_pipe.predict(X_test)).round(2))

### Logistic Regression

In [80]:
log_pipe.fit(X_train_over,y_train_over)

In [81]:
print(classification_report(y_test,log_pipe.predict(X_test)))

              precision    recall  f1-score   support

           0       0.95      0.88      0.92      2125
           1       0.56      0.78      0.65       407

    accuracy                           0.86      2532
   macro avg       0.76      0.83      0.78      2532
weighted avg       0.89      0.86      0.87      2532



In [82]:
recall_over.append(recall_score(y_test,log_pipe.predict(X_test)).round(2))
f1_over.append(f1_score(y_test,log_pipe.predict(X_test)).round(2))
precision_over.append(precision_score(y_test,log_pipe.predict(X_test)).round(2))
accuracy_over.append(accuracy_score(y_test,log_pipe.predict(X_test)).round(2))

### Decision Tree

In [83]:
dt_pipe.fit(X_train_over,y_train_over)

In [84]:
print(classification_report(y_test,dt_pipe.predict(X_test)))

              precision    recall  f1-score   support

           0       0.95      0.94      0.94      2125
           1       0.69      0.75      0.72       407

    accuracy                           0.91      2532
   macro avg       0.82      0.84      0.83      2532
weighted avg       0.91      0.91      0.91      2532



In [85]:
recall_over.append(recall_score(y_test,dt_pipe.predict(X_test)).round(2))
f1_over.append(f1_score(y_test,dt_pipe.predict(X_test)).round(2))
precision_over.append(precision_score(y_test,dt_pipe.predict(X_test)).round(2))
accuracy_over.append(accuracy_score(y_test,dt_pipe.predict(X_test)).round(2))

## Hybrid Samplng

In [86]:
def hybridsampling(X,y,us_ratio=0.5,os_ratio=1,n=3,m=3):
    
    #resetting the index of both X and y
    X.reset_index(drop=True,inplace=True)
    y.reset_index(drop=True,inplace=True)
    
    #dropping categorical variables
    cat_var=X.select_dtypes("object").columns
    X.drop(cat_var,axis=1,inplace=True)

    # creating a dataframe of counts corresponding to each category
    temp_df=y.value_counts().reset_index()
    temp_df.columns=[y.name,"Count"]
    
    #finding the majority label, minority label, majority count minority count
    majority_label=temp_df[y.name][temp_df.Count.idxmax()]
    majority_count=temp_df["Count"][temp_df.Count.idxmax()]
    minority_label=temp_df[y.name][temp_df.Count.idxmin()]
    minority_count=temp_df["Count"][temp_df.Count.idxmax()]
    
    #creating subsets of dataset based on the majority label and minority label
    X_minor=X[y==minority_label].reset_index(drop=True)
    X_major=X[y==majority_label].reset_index(drop=True)
    
    # Oversampling
    X_oversample=pd.DataFrame(columns=X_minor.columns)
    
    # Number of rows to calculate the mean
    window_size = 4

    # Loop through the original DataFrame and calculate the mean for each window
    for i in range(len(X_minor) - window_size + 1):
        window = X_minor.iloc[i:i+window_size]
        mean_row = window.mean()
        X_oversample = pd.concat([X_oversample, mean_row.to_frame().T], ignore_index=True)
        
    # UnderSample
    
    #finding the mean of majority category
    mean=X_major.mean(axis=0).values
    
    #calculating distance of every minority class point from the mean of majority class
    distance=np.array([])
    for i in range(X_minor.shape[0]):
        d=np.sqrt(sum((X_minor.iloc[i].values-mean)**2))
        distance=np.append(distance,d)
    #finding the indices of minority points according to the distance (in ascending order)
    minority_indices=np.argsort(distance)
    
    #reordering the X_minor according to the minority index
    X_minor=X_minor.iloc[minority_indices]
    
    # finding number of points to be dropped
    #n=np.int16(X_major.shape[0]/X_minor.shape[0])
    
    #n_under=int(us_ratio*len(X_minor))
    #X_under=X_minor.iloc[:n_under,:]
    
    
    # Find the indices to drop from X_major based on the 4 closest majority points for each minority point
    for i in range(X_minor.shape[0]):
        while(X_major.shape[0]>(X_minor.shape[0]+X_oversample.shape[0])):
            distance=np.array([])
            for j in range(X_major.shape[0]):
                d=np.sqrt(sum((X_major.iloc[j].values-X_minor.iloc[i].values)**2))
                distance=np.append(distance,d)

            indices=np.argsort(distance)[0:n]

            # Drop the 4 points from X_major for this iteration
            X_major.drop(index=indices, axis=0, inplace=True)
            X_major.reset_index(drop=True,inplace=True)
    
    #Creating new pandas series of the size of X_major and X_minor 
    y_major=pd.Series(np.repeat(majority_label,X_major.shape[0]))
    y_minor=pd.Series(np.repeat(minority_label,(X_minor.shape[0]+X_oversample.shape[0])))
    
    #concatinating X_major and X_minor additionally y_major and y_minor
    X_new=pd.concat([X_major,X_minor,X_oversample],axis=0).reset_index(drop=True)
    y_new=pd.concat([y_major,y_minor],axis=0).reset_index(drop=True)
    y_new.name=y.name
    
    #concatinating the X_new and y_new so that i can can suffle the row along with their corresponding labels
    X_y=pd.concat([X_new,y_new],axis=1)
    
    #suffeling the dataframe rows
    X_y=X_y.sample(frac=1.0, random_state=42).reset_index(drop=True)
    
    #splitting the X_y dataframe and feature matrix and target variable 
    columns_list=X_y.columns.to_list()
    X = X_y[columns_list[:-1]]
    y = X_y[columns_list[-1]]
    

    return X,y
    

In [87]:
X_train_hb,y_train_hb=hybridsampling(X_train,y_train)

In [88]:
print("X_train shape: ",X_train.shape)
print("X_train_hb shape: ", X_train_hb.shape)

X_train shape:  (7595, 10)
X_train_hb shape:  (4873, 10)


### Naive Bayes

In [89]:
nb_pipe.fit(X_train_hb,y_train_hb)

In [90]:
print(classification_report(y_test,nb_pipe.predict(X_test)))

              precision    recall  f1-score   support

           0       0.94      0.75      0.84      2125
           1       0.36      0.73      0.49       407

    accuracy                           0.75      2532
   macro avg       0.65      0.74      0.66      2532
weighted avg       0.84      0.75      0.78      2532



In [91]:
recall_hybrid.append(recall_score(y_test,nb_pipe.predict(X_test)).round(2))
f1_hybrid.append(f1_score(y_test,nb_pipe.predict(X_test)).round(2))
precision_hybrid.append(precision_score(y_test,nb_pipe.predict(X_test)).round(2))
accuracy_hybrid.append(accuracy_score(y_test,nb_pipe.predict(X_test)).round(2))

### Logistic Regression 

In [92]:
log_pipe.fit(X_train_hb,y_train_hb)

In [93]:
print(classification_report(y_test,log_pipe.predict(X_test)))

              precision    recall  f1-score   support

           0       0.95      0.86      0.90      2125
           1       0.51      0.77      0.61       407

    accuracy                           0.84      2532
   macro avg       0.73      0.81      0.76      2532
weighted avg       0.88      0.84      0.86      2532



In [94]:
recall_hybrid.append(recall_score(y_test,log_pipe.predict(X_test)).round(2))
f1_hybrid.append(f1_score(y_test,log_pipe.predict(X_test)).round(2))
precision_hybrid.append(precision_score(y_test,log_pipe.predict(X_test)).round(2))
accuracy_hybrid.append(accuracy_score(y_test,log_pipe.predict(X_test)).round(2))

### Decision Tree

In [95]:
dt_pipe.fit(X_train_hb,y_train_hb)

In [96]:
print(classification_report(y_test,dt_pipe.predict(X_test)))

              precision    recall  f1-score   support

           0       0.97      0.78      0.86      2125
           1       0.43      0.88      0.58       407

    accuracy                           0.79      2532
   macro avg       0.70      0.83      0.72      2532
weighted avg       0.88      0.79      0.82      2532



In [97]:
recall_hybrid.append(recall_score(y_test,dt_pipe.predict(X_test)).round(2))
f1_hybrid.append(f1_score(y_test,dt_pipe.predict(X_test)).round(2))
precision_hybrid.append(precision_score(y_test,dt_pipe.predict(X_test)).round(2))
accuracy_hybrid.append(accuracy_score(y_test,dt_pipe.predict(X_test)).round(2))

In [107]:
coefficients=log_pipe[-1].coef_.flatten()

In [108]:
feature_names=X.columns

In [109]:
# Calculate absolute coefficients for feature importance
abs_coefficients = np.abs(coefficients)

# Normalize coefficients to sum up to 1 for easier comparison
normalized_coefficients = abs_coefficients / np.sum(abs_coefficients)

# Create a dictionary to store feature importance
feature_importance = dict(zip(feature_names, normalized_coefficients))

# Print feature importance
for feature, importance in feature_importance.items():
    print(f'{feature}: {importance:.4f}')

Dependent_count: 0.0226
Total_Relationship_Count: 0.0856
Months_Inactive_12_mon: 0.0626
Contacts_Count_12_mon: 0.0727
Credit_Limit: 0.0921
Total_Revolving_Bal: 0.0244
Total_Trans_Amt: 0.1604
Total_Trans_Ct: 0.3307
Total_Ct_Chng_Q4_Q1: 0.0850
Avg_Utilization_Ratio: 0.0639


In [110]:
feature_importance

{'Dependent_count': 0.022607587750986223,
 'Total_Relationship_Count': 0.08557229064759762,
 'Months_Inactive_12_mon': 0.06256278298856927,
 'Contacts_Count_12_mon': 0.07267878912276768,
 'Credit_Limit': 0.09208515532316974,
 'Total_Revolving_Bal': 0.024437960150293103,
 'Total_Trans_Amt': 0.16042132866092357,
 'Total_Trans_Ct': 0.33067642562128013,
 'Total_Ct_Chng_Q4_Q1': 0.08503494337434923,
 'Avg_Utilization_Ratio': 0.06392273636006351}

In [111]:
pd.DataFrame(feature_importance)

ValueError: If using all scalar values, you must pass an index

In [106]:
coefficients

array([ 0.20703128, -0.78363693,  0.57292503,  0.66556338, -0.84327915,
       -0.2237931 ,  1.46907459, -3.02820291, -0.77871612, -0.58537894])

##  Creating Scores Tables

### Recall Table

In [129]:
recall_scores=pd.DataFrame(columns=["Algorithm","Basic","Undersampling","Oversampling","Hybrid"])
recall_scores["Algorithm"]=["Naive Bayes","Logistic Regression","Decision Tree"]
recall_scores.Basic=recall_basic
recall_scores.Undersampling=recall_under
recall_scores.Oversampling=recall_over
recall_scores.Hybrid=recall_hybrid

ValueError: Length of values (4) does not match length of index (3)

In [None]:
recall_scores

In [None]:
recall_scores.to_csv("recall_scores.csv")

In [None]:
fig=px.bar(recall_scores,x="Algorithm",y=recall_scores.columns,text_auto=True,barmode="group",
      title="Group Bar Plot of Recall Score before Downsample and After Downsample")
fig.update_layout(height=500,
                 width=1000,
                 title_font_size=24,
                 xaxis=dict(
                     title='Algorithms',
                     title_font=dict(size=20),  # Change the size of the x-axis label
                     tickfont=dict(size=18),    # Change the size of the x-axis tick labels
                 ),
                  yaxis=dict(
                      title='Recall',
                      title_font=dict(size=20),  # Change the size of the y-axis label
                      tickfont=dict(size=18),    # Change the size of the y-axis tick labels
                  ))
fig.write_image("recall.png")
fig.show()

### F1 Score Table

In [None]:
f1_scores=pd.DataFrame(columns=["Algorithm","Basic","Undersampling","Oversampling","Hybrid"])
f1_scores["Algorithm"]=["Naive Bayes","Logistic Regression","Decision Tree"]
f1_scores.Basic=f1_basic
f1_scores.Undersampling=f1_under
f1_scores.Oversampling=f1_over
f1_scores.Hybrid=f1_hybrid

In [None]:
f1_scores

In [None]:
f1_scores.to_csv("f1_scores.csv")

In [None]:
fig=px.bar(f1_scores,x="Algorithm",y=f1_scores.columns,text_auto=True,barmode="group",
      title="Group Bar Plot of Recall Score before Downsample and After Downsample")
fig.update_layout(height=500,
                 width=1000,
                 title_font_size=24,
                 xaxis=dict(
                     title='Algorithms',
                     title_font=dict(size=20),  # Change the size of the x-axis label
                     tickfont=dict(size=18),    # Change the size of the x-axis tick labels
                 ),
                  yaxis=dict(
                      title='f1 score',
                      title_font=dict(size=20),  # Change the size of the y-axis label
                      tickfont=dict(size=18),    # Change the size of the y-axis tick labels
                  ))
fig.write_image("f1.png")
fig.show()

### Precision Score Table

In [None]:
precision_scores=pd.DataFrame(columns=["Algorithm","Basic","Undersampling","Oversampling","Hybrid"])
precision_scores["Algorithm"]=["Naive Bayes","Logistic Regression","Decision Tree"]
precision_scores.Basic=precision_basic
precision_scores.Undersampling=precision_under
precision_scores.Oversampling=precision_over
precision_scores.Hybrid=precision_hybrid

In [None]:
precision_scores

In [None]:
precision_scores.to_csv("precision_scores.csv")

In [None]:
fig=px.bar(precision_scores,x="Algorithm",y=precision_scores.columns,text_auto=True,barmode="group",
      title="Group Bar Plot of Recall Score before Downsample and After Downsample")
fig.update_layout(height=500,
                 width=1000,
                 title_font_size=24,
                 xaxis=dict(
                     title='Algorithms',
                     title_font=dict(size=20),  # Change the size of the x-axis label
                     tickfont=dict(size=18),    # Change the size of the x-axis tick labels
                 ),
                  yaxis=dict(
                      title='Pricision',
                      title_font=dict(size=20),  # Change the size of the y-axis label
                      tickfont=dict(size=18),    # Change the size of the y-axis tick labels
                  ))
fig.write_image("precision.png")
fig.show()

### Accuracy Score Table

In [None]:
accuracy_scores=pd.DataFrame(columns=["Algorithm","Basic","Undersampling","Oversampling","Hybrid"])
accuracy_scores["Algorithm"]=["Naive Bayes","Logistic Regression","Decision Tree"]
accuracy_scores.Basic=accuracy_basic
accuracy_scores.Undersampling=accuracy_under
accuracy_scores.Oversampling=accuracy_over
accuracy_scores.Hybrid=accuracy_hybrid

In [None]:
accuracy_scores

In [None]:
accuracy_scores.to_csv("accuracy_scores.csv")

In [None]:
fig=px.bar(accuracy_scores,x="Algorithm",y=accuracy_scores.columns,text_auto=True,barmode="group",
      title="Group Bar Plot of Recall Score before Downsample and After Downsample")
fig.update_layout(height=500,
                 width=1000,
                 title_font_size=24,
                 xaxis=dict(
                     title='Algorithms',
                     title_font=dict(size=20),  # Change the size of the x-axis label
                     tickfont=dict(size=18),    # Change the size of the x-axis tick labels
                 ),
                  yaxis=dict(
                      title='Accuracy',
                      title_font=dict(size=20),  # Change the size of the y-axis label
                      tickfont=dict(size=18),    # Change the size of the y-axis tick labels
                  ))
fig.write_image("accuracy.png")
fig.show()

### Naive Bayes

In [None]:
naive_table=pd.DataFrame(columns=["Evaluation_metric","Basic","Undersampling","Oversampling","Hybridsampling"])
naive_table.Evaluation_metric=["recall","accuracy","precision","f1"]
naive_table.Basic=[recall_basic[0],accuracy_basic[0],precision_basic[0],f1_basic[0]]
naive_table.Undersampling=[recall_under[0],accuracy_under[0],precision_under[0],f1_under[0]]
naive_table.Oversampling=[recall_over[0],accuracy_over[0],precision_over[0],f1_over[0]]
naive_table.Hybridsampling=[recall_hybrid[0],accuracy_hybrid[0],precision_hybrid[0],f1_hybrid[0]]

In [None]:
naive_table.to_csv("naive.csv")

In [None]:
naive_table

In [None]:
fig=px.bar(naive_table,x="Evaluation_metric",y=naive_table.columns,text_auto=True,barmode="group",
      title="Group Bar Plot of Different Evaluation Metric for Naive Bayes under Different Scenario")
fig.update_layout(height=500,
                 width=1000,
                 title_font_size=20,
                 xaxis=dict(
                     title='Evslution Metrics',
                     title_font=dict(size=20),  # Change the size of the x-axis label
                     tickfont=dict(size=18),    # Change the size of the x-axis tick labels
                 ),
                  yaxis=dict(
                      title='Score',
                      title_font=dict(size=20),  # Change the size of the y-axis label
                      tickfont=dict(size=18),    # Change the size of the y-axis tick labels
                  ))
fig.write_image("naive.png")
fig.show()

### Logsitic Regression

In [None]:
logistic_table=pd.DataFrame(columns=["Evaluation_metric","Basic","Undersampling","Oversampling","Hybridsampling"])
logistic_table.Evaluation_metric=["recall","accuracy","precision","f1"]
logistic_table.Basic=[recall_basic[1],accuracy_basic[1],precision_basic[1],f1_basic[1]]
logistic_table.Undersampling=[recall_under[1],accuracy_under[1],precision_under[1],f1_under[1]]
logistic_table.Oversampling=[recall_over[1],accuracy_over[1],precision_over[1],f1_over[1]]
logistic_table.Hybridsampling=[recall_hybrid[1],accuracy_hybrid[1],precision_hybrid[1],f1_hybrid[1]]

In [None]:
logistic_table.to_csv("logistic.csv")

In [None]:
logistic_table

In [None]:
fig=px.bar(logistic_table,x="Evaluation_metric",y=logistic_table.columns,text_auto=True,barmode="group",
      title="Group Bar Plot of Different Evaluation Metric for Logistic Regression under Different Scenario")
fig.update_layout(height=500,
                 width=1000,
                 title_font_size=20,
                 xaxis=dict(
                     title='Evslution Metrics',
                     title_font=dict(size=20),  # Change the size of the x-axis label
                     tickfont=dict(size=18),    # Change the size of the x-axis tick labels
                 ),
                  yaxis=dict(
                      title='Score',
                      title_font=dict(size=20),  # Change the size of the y-axis label
                      tickfont=dict(size=18),    # Change the size of the y-axis tick labels
                  ))
fig.write_image("logistic.png")
fig.show()

### Decision Tree

In [None]:
dt_table=pd.DataFrame(columns=["Evaluation_metric","Basic","Undersampling","Oversampling","Hybridsampling"])
dt_table.Evaluation_metric=["recall","accuracy","precision","f1"]
dt_table.Basic=[recall_basic[2],accuracy_basic[2],precision_basic[2],f1_basic[2]]
dt_table.Undersampling=[recall_under[2],accuracy_under[2],precision_under[2],f1_under[2]]
dt_table.Oversampling=[recall_over[2],accuracy_over[2],precision_over[2],f1_over[2]]
dt_table.Hybridsampling=[recall_hybrid[2],accuracy_hybrid[2],precision_hybrid[2],f1_hybrid[2]]

In [None]:
dt_table.to_csv("dt.csv")

In [None]:
dt_table

In [None]:
fig=px.bar(dt_table,x="Evaluation_metric",y=logistic_table.columns,text_auto=True,barmode="group",
      title="Group Bar Plot of Different Evaluation Metric for Decision Tree under Different Scenario")
fig.update_layout(height=500,
                 width=1000,
                 title_font_size=20,
                 xaxis=dict(
                     title='Evslution Metrics',
                     title_font=dict(size=20),  # Change the size of the x-axis label
                     tickfont=dict(size=18),    # Change the size of the x-axis tick labels
                 ),
                  yaxis=dict(
                      title='Score',
                      title_font=dict(size=20),  # Change the size of the y-axis label
                      tickfont=dict(size=18),    # Change the size of the y-axis tick labels
                  ))
fig.write_image("decision.png")
fig.show()

### Basic

In [None]:
basic_table=pd.DataFrame(columns=["Evaluation_Metric","Naive_bayes","Logistic_regression","Decision_tree"])
basic_table.Evaluation_Metric=["recall","accuracy","precision","f1"]
basic_table.Naive_bayes=[recall_basic[0],accuracy_basic[0],precision_basic[0],f1_basic[0]]
basic_table.Logistic_regression=[recall_basic[1],accuracy_basic[1],precision_basic[1],f1_basic[1]]
basic_table.Decision_tree=[recall_basic[2],accuracy_basic[2],precision_basic[2],f1_basic[2]]
basic_table.Random_forest=[recall_basic[3],accuracy_basic[3],precision_basic[3],f1_basic[3]]
basic_table

In [None]:
basic_table.to_csv("basic_table.csv")

In [None]:
fig=px.bar(basic_table,x="Evaluation_Metric",y=basic_table.columns,text_auto=True,barmode="group",
      title="Comparision of Different Evaluation Metric for Different Models")
fig.update_layout(height=500,
                 width=1000,
                 title_font_size=20,
                 xaxis=dict(
                     title='Evalution Metrics',
                     title_font=dict(size=20),  # Change the size of the x-axis label
                     tickfont=dict(size=18),    # Change the size of the x-axis tick labels
                 ),
                  yaxis=dict(
                      title='Score',
                      title_font=dict(size=20),  # Change the size of the y-axis label
                      tickfont=dict(size=18),    # Change the size of the y-axis tick labels
                  ))
fig.write_image("basic_table.png")
fig.show()

### Undersampling

In [None]:
undersampling_table=pd.DataFrame(columns=["Evaluation_Metric","Naive_bayes","Logistic_regression","Decision_tree"])
undersampling_table.Evaluation_Metric=["recall","accuracy","precision","f1"]
undersampling_table.Naive_bayes=[recall_under[0],accuracy_under[0],precision_under[0],f1_under[0]]
undersampling_table.Logistic_regression=[recall_under[1],accuracy_under[1],precision_under[1],f1_under[1]]
undersampling_table.Decision_tree=[recall_under[2],accuracy_under[2],precision_under[2],f1_under[2]]
undersampling_table.Random_forest=[recall_under[3],accuracy_under[3],precision_under[3],f1_under[3]]
undersampling_table

In [None]:
undersampling_table.to_csv("undersampling_table.csv")

In [None]:
fig=px.bar(undersampling_table,x="Evaluation_Metric",y=undersampling_table.columns,text_auto=True,barmode="group",
      title="Comparision of Different Evaluation Metric for Different Models (Undersampling)")
fig.update_layout(height=500,
                 width=1000,
                 title_font_size=20,
                 xaxis=dict(
                     title='Evalution Metrics',
                     title_font=dict(size=20),  # Change the size of the x-axis label
                     tickfont=dict(size=18),    # Change the size of the x-axis tick labels
                 ),
                  yaxis=dict(
                      title='Score',
                      title_font=dict(size=20),  # Change the size of the y-axis label
                      tickfont=dict(size=18),    # Change the size of the y-axis tick labels
                  ))
fig.write_image("undersampling_table.png")
fig.show()

### Over Sampling

In [None]:
oversampling_table=pd.DataFrame(columns=["Evaluation_Metric","Naive_bayes","Logistic_regression","Decision_tree"])
oversampling_table.Evaluation_Metric=["recall","accuracy","precision","f1"]
oversampling_table.Naive_bayes=[recall_over[0],accuracy_over[0],precision_over[0],f1_over[0]]
oversampling_table.Logistic_regression=[recall_over[1],accuracy_over[1],precision_over[1],f1_over[1]]
oversampling_table.Decision_tree=[recall_over[2],accuracy_over[2],precision_over[2],f1_over[2]]
oversampling_table.Random_forest=[recall_over[3],accuracy_over[3],precision_over[3],f1_over[3]]
oversampling_table

In [None]:
oversampling_table.to_csv("oversampling_table.csv")

In [None]:
fig=px.bar(oversampling_table,x="Evaluation_Metric",y=oversampling_table.columns,text_auto=True,barmode="group",
      title="Comparision of Different Evaluation Metric for Different Models (Oversampling)")
fig.update_layout(height=500,
                 width=1000,
                 title_font_size=20,
                 xaxis=dict(
                     title='Evalution Metrics',
                     title_font=dict(size=20),  # Change the size of the x-axis label
                     tickfont=dict(size=18),    # Change the size of the x-axis tick labels
                 ),
                  yaxis=dict(
                      title='Score',
                      title_font=dict(size=20),  # Change the size of the y-axis label
                      tickfont=dict(size=18),    # Change the size of the y-axis tick labels
                  ))
fig.write_image("oversampling_table.png")
fig.show()

### Hybrid Sampling

In [None]:
hybridsampling_table=pd.DataFrame(columns=["Evaluation_Metric","Naive_bayes","Logistic_regression","Decision_tree"])
hybridsampling_table.Evaluation_Metric=["recall","accuracy","precision","f1"]
hybridsampling_table.Naive_bayes=[recall_hybrid[0],accuracy_hybrid[0],precision_hybrid[0],f1_hybrid[0]]
hybridsampling_table.Logistic_regression=[recall_hybrid[1],accuracy_hybrid[1],precision_hybrid[1],f1_hybrid[1]]
hybridsampling_table.Decision_tree=[recall_hybrid[2],accuracy_hybrid[2],precision_hybrid[2],f1_hybrid[2]]
hybridsampling_table.Random_forest=[recall_hybrid[3],accuracy_hybrid[3],precision_hybrid[3],f1_hybrid[3]]
hybridsampling_table

In [None]:
hybridsampling_table.to_csv("hybridsampling_table.csv")

In [None]:
fig=px.bar(hybridsampling_table,x="Evaluation_Metric",y=hybridsampling_table.columns,text_auto=True,barmode="group",
      title="Comparision of Different Evaluation Metric for Different Models (Hybridsampling)")
fig.update_layout(height=500,
                 width=1000,
                 title_font_size=20,
                 xaxis=dict(
                     title='Evalution Metrics',
                     title_font=dict(size=20),  # Change the size of the x-axis label
                     tickfont=dict(size=18),    # Change the size of the x-axis tick labels
                 ),
                  yaxis=dict(
                      title='Score',
                      title_font=dict(size=20),  # Change the size of the y-axis label
                      tickfont=dict(size=18),    # Change the size of the y-axis tick labels
                  ))
fig.write_image("hybridsampling_table.png")
fig.show()

#### Under sampling

def undersampling(X,y):
    
    #resetting the index of both X and y
    X.reset_index(drop=True,inplace=True)
    y.reset_index(drop=True,inplace=True)

    # creating a dataframe of counts corresponding to each category
    temp_df=y.value_counts().reset_index()
    temp_df.columns=[y.name,"Count"]
    
    #finding the majority label, minority label, majority count minority count
    majority_label=temp_df[y.name][temp_df.Count.idxmax()]
    majority_count=temp_df["Count"][temp_df.Count.idxmax()]
    minority_label=temp_df[y.name][temp_df.Count.idxmin()]
    minority_count=temp_df["Count"][temp_df.Count.idxmax()]
    
    #creating subsets of dataset based on the majority label and minority label
    X_minor=X[y==minority_label].reset_index(drop=True)
    X_major=X[y==majority_label].reset_index(drop=True)
    
    #finding the mean of majority category
    mean=X_major.mean(axis=0).values
    
    #calculating distance of every minority class point from the mean of majority class
    distance=np.array([])
    for i in range(X_minor.shape[0]):
        d=np.sqrt(sum((X_minor.iloc[i].values-mean)**2))
        distance=np.append(distance,d)
    #finding the indices of minority points according to the distance (in ascending order)
    minority_indices=np.argsort(distance)
    
    #reordering the X_minor according to the minority index
    X_minor=X_minor.iloc[minority_indices]
    
    # finding number of points to be dropped
    n= np.int16(X_major.shape[0]/X_minor.shape[0])
    
    # Find the indices to drop from X_major based on the 4 closest majority points for each minority point
    for i in range(X_minor.shape[0]):
        while(X_major.shape[0]>X_minor.shape[0]):
            distance=np.array([])
            for j in range(X_major.shape[0]):
                d=np.sqrt(sum((X_major.iloc[j].values-X_minor.iloc[i].values)**2))
                distance=np.append(distance,d)

            indices=np.argsort(distance)[0:n]

            # Drop the 4 points from X_major for this iteration
            X_major.drop(index=indices, axis=0, inplace=True)
            X_major.reset_index(drop=True,inplace=True)
    
    #Creating new pandas series of the size of X_major and X_minor 
    y_major=pd.Series(np.repeat(majority_label,X_major.shape[0]))
    y_minor=pd.Series(np.repeat(minority_label,X_minor.shape[0]))
    
    #concatinating X_major and X_minor additionally y_major and y_minor
    X_new=pd.concat([X_major,X_minor],axis=0).reset_index(drop=True)
    y_new=pd.concat([y_major,y_minor],axis=0).reset_index(drop=True)
    y_new.name=y.name
    
    #concatinating the X_new and y_new so that i can can suffle the row along with their corresponding labels
    X_y=pd.concat([X_new,y_new],axis=1)
    
    #suffeling the dataframe rows
    X_y=X_y.sample(frac=1.0, random_state=42).reset_index(drop=True)
    
    #splitting the X_y dataframe and feature matrix and target variable 
    columns_list=X_y.columns.to_list()
    X = X_y[columns_list[:-1]]
    y = X_y[columns_list[-1]]
    

    return X,y

def undersampling(X,y):
    
    #resetting the index of both X and y
    X.reset_index(drop=True,inplace=True)
    y.reset_index(drop=True,inplace=True)

    # creating a dataframe of counts corresponding to each category
    temp_df=y.value_counts().reset_index()
    temp_df.columns=[y.name,"Count"]
    
    #finding the majority label, minority label, majority count minority count
    majority_label=temp_df[y.name][temp_df.Count.idxmax()]
    majority_count=temp_df["Count"][temp_df.Count.idxmax()]
    minority_label=temp_df[y.name][temp_df.Count.idxmin()]
    minority_count=temp_df["Count"][temp_df.Count.idxmax()]
    
    #creating subsets of dataset based on the majority label and minority label
    X_minor=X[y==minority_label].reset_index(drop=True)
    X_major=X[y==majority_label].reset_index(drop=True)
    
    #finding the mean of majority category
    mean=X_major.mean(axis=0).values
    
    #calculating distance of every minority class point from the mean of majority class
    distance=np.array([])
    for i in range(X_minor.shape[0]):
        d=np.sqrt(sum((X_minor.iloc[i].values-mean)**2))
        distance=np.append(distance,d)
    #finding the indices of minority points according to the distance (in ascending order)
    minority_indices=np.argsort(distance)
    
    #reordering the X_minor according to the minority index
    X_minor=X_minor.iloc[minority_indices]
    
    # finding number of points to be dropped
    n= np.int16(X_major.shape[0]/X_minor.shape[0])
    
    # Find the indices to drop from X_major based on the 4 closest majority points for each minority point
    for i in range(X_minor.shape[0]):
        distance=np.array([])
        while(X_major.shape[0]>X_minor.shape[0]):
            for j in range(X_major.shape[0]):
                d=np.sqrt(sum((X_major.iloc[j].values-X_minor.iloc[i].values)**2))
                distance=np.append(distance,d)

            indices=np.argsort(distance)[0:n]

            # Drop the 4 points from X_major for this iteration
            X_major.drop(index=indices, axis=0, inplace=True)
            X_major.reset_index(drop=True,inplace=True)
    
    #Creating new pandas series of the size of X_major and X_minor 
    y_major=pd.Series(np.repeat(majority_label,X_major.shape[0]))
    y_minor=pd.Series(np.repeat(minority_label,X_minor.shape[0]))
    
    #concatinating X_major and X_minor additionally y_major and y_minor
    X_new=pd.concat([X_major,X_minor],axis=0).reset_index(drop=True)
    y_new=pd.concat([y_major,y_minor],axis=0).reset_index(drop=True)
    y_new.name=y.name
    
    #concatinating the X_new and y_new so that i can can suffle the row along with their corresponding labels
    X_y=pd.concat([X_new,y_new],axis=1)
    
    #suffeling the dataframe rows
    X_y=X_y.sample(frac=1.0, random_state=42).reset_index(drop=True)
    
    #splitting the X_y dataframe and feature matrix and target variable 
    columns_list=X_y.columns.to_list()
    X = X_y[columns_list[:-1]]
    y = X_y[columns_list[-1]]
    

    return X,y

import numpy as np
import pandas as pd

def undersampling(X, y):
    
    # resetting the index of X and y both
    X.reset_index(drop=True,inplace=True)
    y.reset_index(drop=True,inplace=True)
    
    # Finding the majority and minority class labels
    majority_label = y.value_counts().idxmax()
    minority_label = y.value_counts().idxmin()

    # Splitting the dataset into majority and minority subsets
    X_major = X[y == majority_label]
    X_minor = X[y == minority_label]

    # Finding the mean of the majority class
    mean = X_major.mean(axis=0)

    # Calculating the distances of minority points from the mean of the majority class
    distances = np.sqrt(((X_minor - mean) ** 2).sum(axis=1))

    # Sorting the minority points based on distances (in ascending order)
    X_minor_sorted = X_minor.iloc[np.argsort(distances)]

    # Create a list to store the indices of points to be dropped from X_major
    indices_to_drop = []

    # Iterate over each minority point
    for i in range(X_minor_sorted.shape[0]):
        distances = np.sqrt(((X_major - X_minor_sorted.iloc[i]) ** 2).sum(axis=1))
        # Find the indices of the 4 closest majority points
        indices = distances.argsort()[:4]
        indices_to_drop.extend(indices)

    # Drop the points from X_major
    X_major = X_major.drop(index=indices_to_drop)

    # Creating new series for y_major and y_minor
    y_major = pd.Series(np.repeat(majority_label, X_major.shape[0]))
    y_minor = pd.Series(np.repeat(minority_label, X_minor_sorted.shape[0]))

    # Concatenate the majority and minority dataframes
    X_new = pd.concat([X_major, X_minor_sorted], axis=0).reset_index(drop=True)
    y_new = pd.concat([y_major, y_minor], axis=0).reset_index(drop=True)

    # Create a new dataframe by concatenating X_new and y_new
    X_y = pd.concat([X_new, y_new], axis=1)
    
    # Shuffle the rows of the dataframe
    X_y = X_y.sample(frac=1.0, random_state=42).reset_index(drop=True)

    # Splitting X_y back into X and y
    X = X_y.iloc[:, :-1]
    y = X_y.iloc[:, -1]

    return X, y


def undersampling(X,y):
    
    #resetting the index of both X and y
    X.reset_index(drop=True,inplace=True)
    y.reset_index(drop=True,inplace=True)

    # creating a dataframe of counts corresponding to each category
    temp_df=y.value_counts().reset_index()
    temp_df.columns=[y.name,"Count"]
    
    #finding the majority label, minority label, majority count minority count
    majority_label=temp_df[y.name][temp_df.Count.idxmax()]
    majority_count=temp_df["Count"][temp_df.Count.idxmax()]
    minority_label=temp_df[y.name][temp_df.Count.idxmin()]
    minority_count=temp_df["Count"][temp_df.Count.idxmax()]
    
    #creating subsets of dataset based on the majority label and minority label
    X_minor=X[y==minority_label].reset_index(drop=True)
    X_major=X[y==majority_label].reset_index(drop=True)
    
    #finding the mean of majority category
    mean=X_major.mean(axis=0).values
    
    #calculating distance of every minority class point from the mean of majority class
    distance=np.array([])
    for i in range(X_minor.shape[0]):
        d=np.sqrt(sum((X_minor.iloc[i].values-mean)**2))
        distance=np.append(distance,d)
    #finding the indices of minority points according to the distance (in ascending order)
    minority_indices=np.argsort(distance)
    
    #reordering the X_minor according to the minority index
    X_minor=X_minor.iloc[minority_indices]
    
    # Find the indices to drop from X_major based on the 4 closest majority points for each minority point
    for i in range(X_minor.shape[0]):
        distance=np.array([])
        for j in range(X_major.shape[0]):
            d=np.sqrt(sum((X_major.iloc[j].values-X_minor.iloc[i].values)**2))
            distance=np.append(distance,d)
        
        indices=np.argsort(distance)[0:4]
    
        # Drop the 4 points from X_major for this iteration
        X_major.drop(index=indices, axis=0, inplace=True)
        X_major.reset_index(drop=True,inplace=True)
    
    #Creating new pandas series of the size of X_major and X_minor 
    y_major=pd.Series(np.repeat(majority_label,X_major.shape[0]))
    y_minor=pd.Series(np.repeat(minority_label,X_minor.shape[0]))
    
    #concatinating X_major and X_minor additionally y_major and y_minor
    X_new=pd.concat([X_major,X_minor],axis=0).reset_index(drop=True)
    y_new=pd.concat([y_major,y_minor],axis=0).reset_index(drop=True)
    y_new.name=y.name
    
    #concatinating the X_new and y_new so that i can can suffle the row along with their corresponding labels
    X_y=pd.concat([X_new,y_new],axis=1)
    
    #suffeling the dataframe rows
    X_y=X_y.sample(frac=1.0, random_state=42).reset_index(drop=True)
    
    #splitting the X_y dataframe and feature matrix and target variable 
    columns_list=X_y.columns.to_list()
    X = X_y[columns_list[:-1]]
    y = X_y[columns_list[-1]]
    

    return X,y
    
    

def undersampling(X,y,us_ratio=0.5,os_ratio=1,n=4,m=4):
    
    #resetting the index of both X and y
    X.reset_index(drop=True,inplace=True)
    y.reset_index(drop=True,inplace=True)

    # creating a dataframe of counts corresponding to each category
    temp_df=y.value_counts().reset_index()
    temp_df.columns=[y.name,"Count"]
    
    #finding the majority label, minority label, majority count minority count
    majority_label=temp_df[y.name][temp_df.Count.idxmax()]
    majority_count=temp_df["Count"][temp_df.Count.idxmax()]
    minority_label=temp_df[y.name][temp_df.Count.idxmin()]
    minority_count=temp_df["Count"][temp_df.Count.idxmax()]
    
    #creating subsets of dataset based on the majority label and minority label
    X_minor=X[y==minority_label].reset_index(drop=True)
    X_major=X[y==majority_label].reset_index(drop=True)
    
    # Oversampling
    X_oversample=pd.DataFrame(columns=X_minor.columns)
    
    # Number of rows to calculate the mean
    window_size = 4

    # Loop through the original DataFrame and calculate the mean for each window
    for i in range(len(X_minor) - window_size + 1):
        window = X_minor.iloc[i:i+window_size]
        mean_row = window.mean()
        X_oversample = X_oversample.append(mean_row, ignore_index=True)
        
    # UnderSample
    
    #finding the mean of majority category
    mean=X_major.mean(axis=0).values
    
    #calculating distance of every minority class point from the mean of majority class
    distance=np.array([])
    for i in range(X_minor.shape[0]):
        d=np.sqrt(sum((X_minor.iloc[i].values-mean)**2))
        distance=np.append(distance,d)
    #finding the indices of minority points according to the distance (in ascending order)
    minority_indices=np.argsort(distance)
    
    #reordering the X_minor according to the minority index
    X_minor=X_minor.iloc[minority_indices]
    
    # finding number of points to be dropped
    #n=np.int16(X_major.shape[0]/X_minor.shape[0])
    
    #n_under=int(us_ratio*len(X_minor))
    #X_under=X_minor.iloc[:n_under,:]
    
    
    # Find the indices to drop from X_major based on the 4 closest majority points for each minority point
    for i in range(X_minor.shape[0]):
        while(X_major.shape[0]>(X_minor.shape[0]+X_oversample.shape[0])):
            distance=np.array([])
            for j in range(X_major.shape[0]):
                d=np.sqrt(sum((X_major.iloc[j].values-X_minor.iloc[i].values)**2))
                distance=np.append(distance,d)

            indices=np.argsort(distance)[0:n]

            # Drop the 4 points from X_major for this iteration
            X_major.drop(index=indices, axis=0, inplace=True)
            X_major.reset_index(drop=True,inplace=True)
    
    #Creating new pandas series of the size of X_major and X_minor 
    y_major=pd.Series(np.repeat(majority_label,X_major.shape[0]))
    y_minor=pd.Series(np.repeat(minority_label,(X_minor.shape[0]+X_oversample.shape[0])))
    
    #concatinating X_major and X_minor additionally y_major and y_minor
    X_new=pd.concat([X_major,X_minor,X_oversample],axis=0).reset_index(drop=True)
    y_new=pd.concat([y_major,y_minor],axis=0).reset_index(drop=True)
    y_new.name=y.name
    
    #concatinating the X_new and y_new so that i can can suffle the row along with their corresponding labels
    X_y=pd.concat([X_new,y_new],axis=1)
    
    #suffeling the dataframe rows
    X_y=X_y.sample(frac=1.0, random_state=42).reset_index(drop=True)
    
    #splitting the X_y dataframe and feature matrix and target variable 
    columns_list=X_y.columns.to_list()
    X = X_y[columns_list[:-1]]
    y = X_y[columns_list[-1]]
    

    return X,y
    