In [22]:
import pandas as pd
import numpy as np
import logging
import matplotlib.pyplot as plt

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.metrics import accuracy_score , f1_score , precision_score , recall_score , roc_auc_score , roc_curve

In [2]:
logging.basicConfig(
    
    level = logging.INFO , 
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt = '%Y-%m-%d %H:%M:%S' ,  
    filename = 'model_building.log',
    filemode = 'w'

)

logging.info('Logging has been configured successfully')

In [3]:
try :

    df = pd.read_csv('/Users/sarthaksharna/EZTollGuard/data/cleaned_data.csv')

    logging.info('Data has been loaded successfully')
    
except FileNotFoundError as e:

    logging.error(f'File not found : {e}')
    logging.error('Unable to load data , please check file path')

except Exception as e:

    logging.error(f'Unexpected Error : {e}')

In [4]:
df.head()

Unnamed: 0,Vehicle_Type,TollBoothID,Lane_Type,Vehicle_Dimensions,Transaction_Amount,Amount_paid,Vehicle_Speed,Fraud_indicator,day,month,DayOfWeek,is_Weekend,State_code
0,Bus,A-101,Express,Large,350,120,65,Fraud,6,1,4,0,KA
1,Car,B-102,Regular,Small,120,100,78,Fraud,7,1,5,1,KA
2,Truck,C-103,Regular,Large,350,120,92,Fraud,9,1,0,0,KA
3,Van,B-102,Express,Medium,140,100,60,Fraud,10,1,1,0,KA
4,Sedan,A-101,Regular,Medium,160,100,105,Fraud,11,1,2,0,KA


<!-- --- -->

In [5]:
le = LabelEncoder()

df['Fraud_indicator'] = le.fit_transform(df['Fraud_indicator'])

<!-- -- -->

In [6]:
df.shape

(4444, 13)

In [7]:
logging.info('Train - test split started !')

try :

    X = df.drop('Fraud_indicator' , axis = 1)

    y = df['Fraud_indicator']


    X_train , X_test , y_train  , y_test = train_test_split(X , y , test_size = 0.2 , random_state = 42 , stratify = y)


    logging.info('Train - test split successfull')
    

except Exception as e :

    logging.error(f'Unexpected Error : {e}')



In [8]:
X_train.shape , y_train.shape , X_test.shape , y_test.shape

((3555, 12), (3555,), (889, 12), (889,))

In [9]:
try :

    num_cols = X.select_dtypes(exclude = 'object').columns

    cat_cols = X.select_dtypes(include = 'object').columns

    preprocessor_scaled = ColumnTransformer (
        transformers = [
            ('num' , StandardScaler() , num_cols) , 
            ('cat' , OneHotEncoder(drop='first' , handle_unknown='ignore') , cat_cols)
        ]
    )

    preprocessor_unscaled = ColumnTransformer (
        transformers = [
            ('num' , 'passthrough' , num_cols) , 
            ('cat' , OneHotEncoder(drop='first' , handle_unknown='ignore') , cat_cols)
        ]
    )

    logging.info('Preprocessors has been created successfully')
    

except Exception as e:

    logging.error(f'Unexpected Error : {e}')


In [13]:
models_scaled = {

    'Logistic Regression' : LogisticRegression() , 
    'KNN' : KNeighborsClassifier() ,
    'SVC' : SVC() ,
    'Gaussian NB' : GaussianNB()
}


models_unscaled = {

    'Decision Tree' : DecisionTreeClassifier() , 
    'Random Forest' : RandomForestClassifier() , 
    'Gradient Boosting' : GradientBoostingClassifier() , 
    'XGBoost' : XGBClassifier() , 
    'CatBoost' : CatBoostClassifier(verbose = 0) , 
    'AdaBoost' : AdaBoostClassifier()
    
}

In [11]:

for name , clf in models_scaled.items() :

    try :

        pipeline = Pipeline(
            [
                ('preprocessor_scaled' , preprocessor_scaled) , 
                ('classifier' , clf)

            ]
        )

        pipeline.fit(X_train , y_train)

        y_pred_train = pipeline.predict(X_train)

        y_pred_test = pipeline.predict(X_test)

        
        print(f"\n {name} Performance on Training Data: " '\n')

        print(f"Accuracy: {accuracy_score(y_train, y_pred_train):.4f}")

        print(f"Precision: {precision_score(y_train, y_pred_train):.4f}")

        print(f"Recall: {recall_score(y_train, y_pred_train):.4f}")

        print(f"F1 Score: {f1_score(y_train, y_pred_train):.4f}")


        print(f"\n {name} Performance on Test Data: " '\n')

        print(f"Accuracy: {accuracy_score(y_test, y_pred_test):.4f}")

        print(f"Precision: {precision_score(y_test, y_pred_test):.4f}")

        print(f"Recall: {recall_score(y_test, y_pred_test):.4f}")

        print(f"F1 Score: {f1_score(y_test, y_pred_test):.4f}")


        print('=='*50)
        print('\n')
        


        logging.info(f'{name} has been trained successfully')
        

    except Exception as e:

        logging.error(f'Unexpected Error : {e}')



 Logistic Regression Performance on Training Data: 

Accuracy: 0.9657
Precision: 0.9578
Recall: 1.0000
F1 Score: 0.9784

 Logistic Regression Performance on Test Data: 

Accuracy: 0.9573
Precision: 0.9479
Recall: 1.0000
F1 Score: 0.9733



 KNN Performance on Training Data: 

Accuracy: 0.8920
Precision: 0.8914
Recall: 0.9809
F1 Score: 0.9340

 KNN Performance on Test Data: 

Accuracy: 0.8324
Precision: 0.8459
Recall: 0.9595
F1 Score: 0.8991



 SVC Performance on Training Data: 

Accuracy: 0.9474
Precision: 0.9367
Recall: 1.0000
F1 Score: 0.9673

 SVC Performance on Test Data: 

Accuracy: 0.9280
Precision: 0.9153
Recall: 1.0000
F1 Score: 0.9558



 Gaussian NB Performance on Training Data: 

Accuracy: 0.4453
Precision: 0.9727
Recall: 0.2961
F1 Score: 0.4540

 Gaussian NB Performance on Test Data: 

Accuracy: 0.4409
Precision: 0.9493
Recall: 0.2977
F1 Score: 0.4532




In [14]:
for name , clf in models_unscaled.items() :

    try :

        pipeline_2 = Pipeline(

            [
                ('preprocessor_unscaled' , preprocessor_unscaled) , 
                ('classifier' , clf)
            ]
        )

        pipeline_2.fit(X_train , y_train)

        y_train_pred = pipeline_2.predict(X_train)

        y_test_pred = pipeline_2.predict(X_test)

        
        print(f"\n {name} Performance on Training Data: " '\n')

        print(f"Accuracy: {accuracy_score(y_train, y_train_pred):.4f}")

        print(f"Precision: {precision_score(y_train, y_train_pred):.4f}")

        print(f"Recall: {recall_score(y_train, y_train_pred):.4f}")

        print(f"F1 Score: {f1_score(y_train, y_train_pred):.4f}")


        print(f"\n {name} Performance on Test Data: " '\n')

        print(f"Accuracy: {accuracy_score(y_test, y_test_pred):.4f}")

        print(f"Precision: {precision_score(y_test, y_test_pred):.4f}")

        print(f"Recall: {recall_score(y_test, y_test_pred):.4f}")

        print(f"F1 Score: {f1_score(y_test, y_test_pred):.4f}")


        print('=='*50)
        print('\n')
        


        logging.info(f'{name} has been trained successfully')

    

    except Exception as e :

        logging.error(f'Unexpected Error : {e}')


 Decision Tree Performance on Training Data: 

Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000

 Decision Tree Performance on Test Data: 

Accuracy: 0.9933
Precision: 0.9928
Recall: 0.9986
F1 Score: 0.9957



 Random Forest Performance on Training Data: 

Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000

 Random Forest Performance on Test Data: 

Accuracy: 0.9753
Precision: 0.9692
Recall: 1.0000
F1 Score: 0.9844



 Gradient Boosting Performance on Training Data: 

Accuracy: 0.9952
Precision: 0.9939
Recall: 1.0000
F1 Score: 0.9969

 Gradient Boosting Performance on Test Data: 

Accuracy: 0.9876
Precision: 0.9844
Recall: 1.0000
F1 Score: 0.9921



 XGBoost Performance on Training Data: 

Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000

 XGBoost Performance on Test Data: 

Accuracy: 0.9944
Precision: 0.9928
Recall: 1.0000
F1 Score: 0.9964



 CatBoost Performance on Training Data: 

Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000

In [19]:
pipeline_cb = Pipeline([
    ('preprocessor_unscaled', preprocessor_unscaled),
    ('classifier', CatBoostClassifier(verbose=0, random_state=42))
])

pipeline_cb.fit(X_train, y_train)

In [25]:
import joblib

# Save
joblib.dump(pipeline_cb, 'catboost_pipeline.pkl')

['catboost_pipeline.pkl']