In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings

# NLP Module
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize, FreqDist
import string
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Classification model
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,ConfusionMatrixDisplay,\
                            precision_score,recall_score,f1_score,roc_auc_score, roc_curve
from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# Preprocessing models
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from category_encoders.binary import BinaryEncoder
from imblearn.combine import SMOTETomek

# Hyperparameters tuning models
from hyperopt import tpe,hp,Trials,space_eval
from hyperopt.fmin import fmin
from hyperopt.pyll import scope

#### Reading a CSV file using the pd.read_csv() function. 
* However, the file path contains backslashes (\) which can cause issues. It's recommended to either use forward slashes (/) or escape the backslashes (\\) in the file path to avoid any problems.

In [48]:
df = pd.read_csv('Complaints.csv')
df.drop('Unnamed: 0', axis=1, inplace=True)
df.shape

(768358, 18)

#### Calculates the percentage of missing values in each column of the DataFrame df and sorts them in descending order.

In [49]:
missing = df.isnull().sum().div(df.shape[0]).mul(100).to_frame().sort_values(by=0, ascending=False)
missing

Unnamed: 0,0
Tags,85.881191
Consumer complaint narrative,78.651358
Company public response,74.528931
Consumer consent provided?,61.23721
Sub-issue,59.265342
Sub-product,30.605525
State,0.735334
ZIP code,0.50198
Date sent to company,0.0
Consumer disputed?,0.0


#### Drop unnecessary columns from the DataFrame

In [50]:
df.columns
drop_columns = ['Tags','Consumer complaint narrative','Company public response',
                'Consumer consent provided?','Sub-issue','Sub-product','Complaint ID','ZIP code']
df.drop(drop_columns, axis=1,inplace=True)

## Feature extraction

In [51]:
df[['Date received','Date sent to company']].head()

Unnamed: 0,Date received,Date sent to company
0,2015-01-04,2015-01-04
1,2013-09-04,2013-09-03
2,2014-06-10,2014-06-10
3,2014-01-08,2014-01-08
4,2014-09-11,2014-09-18


#### Calculate the number of days it took to forward a complaint

In [52]:
df['days_to_forward_complaint'] = pd.to_datetime(df['Date sent to company'])-pd.to_datetime(df['Date received'])
df['days_to_forward_complaint'] = df['days_to_forward_complaint'].dt.days
df.head(3)

Unnamed: 0,Date received,Product,Issue,Company,State,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,days_to_forward_complaint
0,2015-01-04,Credit reporting,Incorrect information on credit report,Experian Information Solutions Inc.,TX,Web,2015-01-04,Closed with explanation,Yes,No,0
1,2013-09-04,Credit card,Credit determination,"CITIBANK, N.A.",AZ,Web,2013-09-03,Closed with explanation,Yes,No,-1
2,2014-06-10,Debt collection,Disclosure verification of debt,SYNCHRONY FINANCIAL,CO,Web,2014-06-10,Closed with explanation,Yes,No,0


In [53]:
df.drop(['Date received','Date sent to company'], axis=1, inplace=True)

#### For models to reduce computation time we can use sample of the data for the model

In [54]:
df1 = df.groupby('Consumer disputed?').sample(n=50000)
df1.reset_index(inplace=True)
df1.shape

(100000, 10)

## Text processing

##### For Vectorization
* TFIDF
* CountVectorizer
* NLTK/Scipy Library
* Pretrained Glove

##### Steps for text processing
* Remove punctuation
* Remove Stop Words
* Lower Casing
* Tokenization
* Stemming/Lemmatization


##### Note
* `Issue` column has text which has to be preprocessed.
* The text need to be transformed into vectors as the algorithm will be able to make predictions. In this case, it will be used the Term Frequency-Inverse Document Frequency (TFIDF) weight to evaluate how import a word is to a document in a collection of documents.
* After removing the punctuation and lower casing the words, the importance of the word is determined in terms of the frequency.

In [55]:
# list of stopword which will be remmoved
stopwords_list = stopwords.words('english')+list(string.punctuation)

* In the `process_text` function, you can improve the readability of the code by using a list comprehension instead of two separate loops. This will make the code more concise. 

* In the `concat_strings` function, you can use the `join()` method to concatenate the `words` in the list. This method is more efficient than concatenating strings using the `+` operator inside a loop.

* In the `lemmatizer_concat` function, there is a small issue. The lemmatization is being applied to the original `words_list` instead of the filtered `list_of_words`. Replace `words_list` with `list_of_words` in the for loop. You can try that.

In [56]:
from typing import List

def process_text(text: str) -> List[str]:
    """
    Tokenizes the input text, removes stopwords and non-alphabetic words.

    Args:
        text (str): The input text to be processed.

    Returns:
        list: A list of processed tokens.
    """

    tokens = nltk.word_tokenize(text)
    stopwords_removed = [token.lower() for token in tokens if token.lower() not in stopwords_list]
    stopwords_removed = [word for word in stopwords_removed if word.isalpha()]

    return stopwords_removed

def concat_strings(words: List[str]) -> str:
    """
    Concatenates a list of words into a single string.

    Args:
        words (list): The list of words to be concatenated.

    Returns:
        str: The concatenated string.
    """

    return ' '.join(words)

lemmatizer = WordNetLemmatizer()

def lemmatizer_concat(words: List[str]) -> str:
    """
    Lemmatizes each word in the given list and concatenates them into a single string.

    Args:
        words (list): The list of words to be lemmatized and concatenated.

    Returns:
        str: The lemmatized and concatenated string.
    """

    words = [word for word in words if word is not np.nan]
    lemmatized_list = []
    for word in words:
        lemmatized_list.append(lemmatizer.lemmatize(word))

    return concat_strings(lemmatized_list)

#### Prepare data with text processing


In [59]:
nltk.download('punkt')
nltk.download('wordnet')

for i in range(len(df1)):
    
    # Iterate through all the rows and extract each 'Issue'
    text = process_text(df1['Issue'].loc[i])
    final_texts = lemmatizer_concat(text)
    
    # Change the 'Issue' column into the processed text
    df1['Issue'].loc[i] = final_texts
    if i % 5000 == 0:
        print(f'Processed Row Number {i}')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sheip\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sheip\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['Issue'].loc[i] = final_texts


Processed Row Number 0
Processed Row Number 5000
Processed Row Number 10000
Processed Row Number 15000
Processed Row Number 20000
Processed Row Number 25000
Processed Row Number 30000
Processed Row Number 35000
Processed Row Number 40000
Processed Row Number 45000
Processed Row Number 50000
Processed Row Number 55000
Processed Row Number 60000
Processed Row Number 65000
Processed Row Number 70000
Processed Row Number 75000
Processed Row Number 80000
Processed Row Number 85000
Processed Row Number 90000
Processed Row Number 95000


#### Using the TfidfVectorizer from scikit-learn to transform the 'Issue' column of the DataFrame df1 into a matrix of TF-IDF features.

In [60]:
# Import the necessary libraries
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TfidfVectorizer object
tfidv = TfidfVectorizer(max_features=None, strip_accents='unicode', analyzer='word', ngram_range=(1, 2))

# Fit and transform the 'Issue' column of DataFrame 'df1'
df_vect = tfidv.fit_transform(df1['Issue'])

# Get the feature names
feature_names = tfidv.get_feature_names_out()

### Data Processing

Concat old data withvectorized data from Issue column

In [61]:
df1 = pd.concat([df1, pd.DataFrame(df_vect.toarray())], axis=1)

In [62]:
df1.drop(['Issue','index'], axis=1, inplace=True)

In [63]:
df1

Unnamed: 0,Product,Company,State,Submitted via,Company response to consumer,Timely response?,Consumer disputed?,days_to_forward_complaint,0,1,...,299,300,301,302,303,304,305,306,307,308
0,Mortgage,"BANK OF AMERICA, NATIONAL ASSOCIATION",AZ,Fax,Closed with explanation,Yes,No,7,0.000000,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
1,Mortgage,FIFTH THIRD FINANCIAL CORPORATION,MI,Web,Closed with non-monetary relief,Yes,No,0,0.299569,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2,Debt collection,"Credit Protection Association, L.P.",TX,Phone,Closed with non-monetary relief,Yes,No,6,0.000000,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
3,Debt collection,"Pinnacle Credit Services, LLC",NC,Web,Closed with non-monetary relief,Yes,No,0,0.000000,0.0,...,0.0,0.0,0.469132,0.469132,0.0,0.0,0.0,0.0,0.0,0.0
4,Mortgage,Ocwen Financial Corporation,IL,Referral,Closed with non-monetary relief,Yes,No,5,0.000000,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,Mortgage,"LD Holdings Group, LLC",CA,Web,Closed with explanation,Yes,Yes,3,0.299569,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
99996,Mortgage,"Dovenmuehle Mortgage, Inc.",TX,Web,Closed with explanation,Yes,Yes,0,0.299569,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
99997,Credit card,HSBC NORTH AMERICA HOLDINGS INC.,VT,Web,Closed with explanation,Yes,Yes,0,0.000000,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
99998,Credit card,WELLS FARGO & COMPANY,FL,Web,Closed with explanation,No,Yes,0,0.000000,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


In [64]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Columns: 317 entries, Product to 308
dtypes: float64(309), int64(1), object(7)
memory usage: 241.9+ MB


In [65]:
# Import the necessary libraries
from sklearn.model_selection import train_test_split

# Split the data into features (X) and target variable (y)
X = df1.drop(['Consumer disputed?'], axis=1)
y = df1['Consumer disputed?']


In [66]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Columns: 316 entries, Product to 308
dtypes: float64(309), int64(1), object(6)
memory usage: 241.1+ MB


In [67]:
X.shape, y.shape

((100000, 316), (100000,))

In [68]:
X.columns

Index([                     'Product',                      'Company',
                              'State',                'Submitted via',
       'Company response to consumer',             'Timely response?',
          'days_to_forward_complaint',                              0,
                                    1,                              2,
       ...
                                  299,                            300,
                                  301,                            302,
                                  303,                            304,
                                  305,                            306,
                                  307,                            308],
      dtype='object', length=316)

Intialize feature for transformation

In [69]:
X.columns = X.columns.astype(str)

In [70]:
X['Company'].dtype

dtype('O')

-----------------------------

In [71]:
x_columnms = ['Product','Company','State','Submitted via',
            'Company response to consumer','Timely response?',
            'days_to_forward_complaint' ]

X_col = ['Product','Company','State','Submitted via',
            'Company response to consumer','Timely response?']

for i in x_columnms:
    print(i,X[i].dtype)
    
X_test = X[X_col]
X_test

Product object
Company object
State object
Submitted via object
Company response to consumer object
Timely response? object
days_to_forward_complaint int64


Unnamed: 0,Product,Company,State,Submitted via,Company response to consumer,Timely response?
0,Mortgage,"BANK OF AMERICA, NATIONAL ASSOCIATION",AZ,Fax,Closed with explanation,Yes
1,Mortgage,FIFTH THIRD FINANCIAL CORPORATION,MI,Web,Closed with non-monetary relief,Yes
2,Debt collection,"Credit Protection Association, L.P.",TX,Phone,Closed with non-monetary relief,Yes
3,Debt collection,"Pinnacle Credit Services, LLC",NC,Web,Closed with non-monetary relief,Yes
4,Mortgage,Ocwen Financial Corporation,IL,Referral,Closed with non-monetary relief,Yes
...,...,...,...,...,...,...
99995,Mortgage,"LD Holdings Group, LLC",CA,Web,Closed with explanation,Yes
99996,Mortgage,"Dovenmuehle Mortgage, Inc.",TX,Web,Closed with explanation,Yes
99997,Credit card,HSBC NORTH AMERICA HOLDINGS INC.,VT,Web,Closed with explanation,Yes
99998,Credit card,WELLS FARGO & COMPANY,FL,Web,Closed with explanation,No


In [72]:
encoder = OneHotEncoder()
encoded_data = encoder.fit_transform(X_test[['Timely response?']])
pd.DataFrame(encoded_data.toarray())


Unnamed: 0,0,1
0,0.0,1.0
1,0.0,1.0
2,0.0,1.0
3,0.0,1.0
4,0.0,1.0
...,...,...
99995,0.0,1.0
99996,0.0,1.0
99997,0.0,1.0
99998,1.0,0.0


In [73]:
from category_encoders import BinaryEncoder

encoder = BinaryEncoder()

df_encoded = encoder.fit_transform(X_test[['Product','Timely response?','Submitted via','State','Company response to consumer','Company']])
pd.DataFrame(df_encoded).astype('float64').dtypes

Product_0                         float64
Product_1                         float64
Product_2                         float64
Product_3                         float64
Timely response?_0                float64
Timely response?_1                float64
Submitted via_0                   float64
Submitted via_1                   float64
Submitted via_2                   float64
State_0                           float64
State_1                           float64
State_2                           float64
State_3                           float64
State_4                           float64
State_5                           float64
Company response to consumer_0    float64
Company response to consumer_1    float64
Company response to consumer_2    float64
Company_0                         float64
Company_1                         float64
Company_2                         float64
Company_3                         float64
Company_4                         float64
Company_5                         

------------------------

In [74]:
# List of binary features for encoding
binary_features = ['Product','Timely response?','Submitted via','State','Company response to consumer','Company']

Create column tansformer for transformation

In [75]:
binary_encoder_pipeline = Pipeline(steps=[
    ('SimpleImputer', SimpleImputer(strategy='most_frequent')),
    ('BinaryEncoder', BinaryEncoder())                               
])

preprocessor = ColumnTransformer(
    [
        ('binary_encoder_pipeline', binary_encoder_pipeline, binary_features)
    ]
,remainder='passthrough'
)

In [76]:
X = preprocessor.fit_transform(X)

In [77]:
X_df = pd.DataFrame(X)
X_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,330,331,332,333,334,335,336,337,338,339
0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.469132,0.469132,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
99996,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
99997,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
99998,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


In [78]:
np.array(X).dtype

dtype('float64')

In [79]:
X_df.columns

RangeIndex(start=0, stop=340, step=1)

In [80]:
y = np.where(y.values == 'Yes', 0, 1)

In [81]:
y

array([1, 1, 1, ..., 0, 0, 0])

## Handling Imbalanced Dataset

* Synthetic Minority oversampling Technique or SMOTE is another technique to oversample the minority class, duplicate the minority dataset.

* SMOTE is one of the famous oversampling techniques and is very effective in handling class imbalance. Combine SMOTE to some undersampling technique(ENN, Tomek) to increase the effectiveness of handling the minority class.

In [82]:
smt = SMOTETomek(random_state=42, sampling_strategy='minority', n_jobs=-1)
X_res, y_res = smt.fit_resample(X_df,y)



In [42]:
X_res.shape, y_res.shape

((93086, 340), (93086,))

## Model Selection


In [83]:
from typing import Tuple

def evaluate_clf(true: List[int], prediction: List[int]) -> Tuple[float, float, float, float, float]:
    """
    Evaluate the performance of a classification model.

    Args:
        true (List[int]): List of true labels.
        prediction (List[int]): List of predicted labels.

    Returns:
        Tuple[float, float, float, float, float]: Tuple containing accuracy, F1 score,
        precision, recall, and ROC AUC score.
    """
    acc = accuracy_score(true, prediction)
    f1 = f1_score(true, prediction)
    precision = precision_score(true, prediction)
    recall = recall_score(true, prediction)
    roc_auc = roc_auc_score(true, prediction)
    return acc, f1, precision, recall, roc_auc


In [84]:
models = {
    'Random Forest': RandomForestClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'Logistic Regression': LogisticRegression(),
    'K-Neighbor classification': KNeighborsClassifier(),
    'XGBClassifier': XGBClassifier(),
    'CatBoost Classifier': CatBoostClassifier(verbose=False),
    'AdaBoost Classifier': AdaBoostClassifier()
}

In [85]:
from typing import Dict, Any

def evaluation_models(X: Any, y: Any, models: Dict[str, Any]) -> pd.DataFrame:
    """
    Evaluate multiple models on training and testing data.
    
    Args:
        X (Any): Input features.
        y (Any): Target variable.
        models (Dict[str, Any]): Dictionary of models to evaluate.
        
    Returns:
        pd.DataFrame: Evaluation report containing model names and accuracy scores.
    """
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    models_list = []
    accuracy_list = []
    auc = []
    
    for i in range(len(list(models))):
        # Fit the model on the training data
        model = list(models.values())[i]
        model.fit(X_train, y_train)
        
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)
        
        # Evaluate the model on the training data
        model_train_accuracy, model_train_f1, model_train_precision, model_train_recall, model_train_rocauc_score = evaluate_clf(y_train, y_train_pred)
        
        # Evaluate the model on the testing data
        model_test_accuracy, model_test_f1, model_test_precision, model_test_recall, model_test_rocauc_score = evaluate_clf(y_test, y_test_pred)
        
        # Print the model name
        print(list(models.keys())[i])
        models_list.append(list(models.keys())[i])
        
        # Print evaluation results on the training data
        print('Model Evaluation on Training Data')
        print('- Accuracy Score:', model_train_accuracy)
        print('- F1 Score:', model_train_f1)
        print('- Precision Score:', model_train_precision)
        print('- Recall Score:', model_train_recall)
        print('- ROC AUC Score:', model_train_rocauc_score)
        
        print('------------------------------------------------------------------------------------------------')
        
        # Print evaluation results on the testing data
        print('Model Evaluation on Testing Data')
        print('- Accuracy Score:', model_test_accuracy)
        print('- F1 Score:', model_test_f1)
        print('- Precision Score:', model_test_precision)
        print('- Recall Score:', model_test_recall)
        print('- ROC AUC Score:', model_test_rocauc_score)
        
        auc.append(model_test_rocauc_score)
        
        print('='*35)
        print()
        
    report = pd.DataFrame(list(zip(models_list, accuracy_list)), columns=['Model Name', 'Accuracy']).sort_values(by=['Accuracy'], ascending=False)
    
    return report


In [86]:
report = evaluation_models(X=X_res, y=y_res, models = models)

Random Forest
Model Evaluation on Training Data
- Accuracy Score: 0.8708950116087125
- F1 Score: 0.868658183605483
- Precision Score: 0.8838876385763107
- Recall Score: 0.8539446487531206
- ROC AUC Score: 0.8708934193846118
------------------------------------------------------------------------------------------------
Model Evaluation on Testing Data
- Accuracy Score: 0.579311825648183
- F1 Score: 0.571912383241383
- Precision Score: 0.5823784625653576
- Recall Score: 0.5618158403090792
- ROC AUC Score: 0.5793184023798644

Decision Tree
Model Evaluation on Training Data
- Accuracy Score: 0.8709084320856764
- F1 Score: 0.8636357192475085
- Precision Score: 0.9150994412065132
- Recall Score: 0.8176522696158699
- ROC AUC Score: 0.8709034294939253
------------------------------------------------------------------------------------------------
Model Evaluation on Testing Data
- Accuracy Score: 0.5591282409147029
- F1 Score: 0.5365908706200982
- Precision Score: 0.5657346817370613
- Recall 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression
Model Evaluation on Training Data
- Accuracy Score: 0.5898568035107967
- F1 Score: 0.5771215874026209
- Precision Score: 0.5955563171121773
- Recall Score: 0.5597938421066759
- ROC AUC Score: 0.5898539795611211
------------------------------------------------------------------------------------------------
Model Evaluation on Testing Data
- Accuracy Score: 0.5869343496698696
- F1 Score: 0.5728559533721898
- Precision Score: 0.5933080372542255
- Recall Score: 0.5537669027688346
- ROC AUC Score: 0.586946817295705

K-Neighbor classification
Model Evaluation on Training Data
- Accuracy Score: 0.701434648987425
- F1 Score: 0.6989621250050744
- Precision Score: 0.7047314996725605
- Recall Score: 0.6932864467291225
- ROC AUC Score: 0.701433883590004
------------------------------------------------------------------------------------------------
Model Evaluation on Testing Data
- Accuracy Score: 0.5569810510494391
- F1 Score: 0.549828178694158
- Precision Score: 0.559068219