# 🚧 EZTollGuard

<!-- ------------- -->

# 🧠 Problem Statement

With the rise of digital toll collection via FASTag, toll operators are facing increasing cases of fraudulent transactions — such as vehicle misclassification, lane misuse, underpayment, and speed bypassing — which lead to revenue loss and operational inefficiencies.

<!-- ---- -->

# 🎯 Objective

To develop a machine learning model that detects potential FASTag fraud in real time by analyzing transaction, vehicle, and temporal data — enabling toll operators to reduce revenue leakage and improve toll security.

<!-- --- -->

# DATA CLEANING 

<!-- -- -->

IMPORTING LIBRARIES

In [199]:
import os

import pandas as pd

<!-- --- -->

READING THE DATASET

In [200]:
# Location of the raw FASTag transactions CSV. Set the FASTAG_DATA_PATH
# environment variable to point at your own copy of the file; when it is not
# set, the original author's local path is used so existing runs still work.
DATA_PATH = os.environ.get(
    'FASTAG_DATA_PATH',
    '/Users/sarthaksharna/Downloads/FastagFraudDetection.csv',
)
df = pd.read_csv(DATA_PATH)

In [201]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,Transaction_ID,Timestamp,Vehicle_Type,FastagID,TollBoothID,Lane_Type,Vehicle_Dimensions,Transaction_Amount,Amount_paid,Geographical_Location,Vehicle_Speed,Vehicle_Plate_Number,Fraud_indicator
0,1,1/6/2023 11:20,Bus,FTG-001-ABC-121,A-101,Express,Large,350,120,"13.059816123454882, 77.77068662374292",65,KA11AB1234,Fraud
1,2,1/7/2023 14:55,Car,FTG-002-XYZ-451,B-102,Regular,Small,120,100,"13.059816123454882, 77.77068662374292",78,KA66CD5678,Fraud
2,3,1/8/2023 18:25,Motorcycle,,D-104,Regular,Small,0,0,"13.059816123454882, 77.77068662374292",53,KA88EF9012,Not Fraud
3,4,1/9/2023 2:05,Truck,FTG-044-LMN-322,C-103,Regular,Large,350,120,"13.059816123454882, 77.77068662374292",92,KA11GH3456,Fraud
4,5,1/10/2023 6:35,Van,FTG-505-DEF-652,B-102,Express,Medium,140,100,"13.059816123454882, 77.77068662374292",60,KA44IJ6789,Fraud


In [202]:
# (rows, columns) of the frame as loaded, before any cleaning.
df.shape

(5000, 13)

<!-- ----- -->

GETTING THE PERCENTAGE AND ABSOLUTE NUMBER OF MISSING VALUES

In [203]:
# Share of missing values per column, expressed as a percentage.
missing_percent = df.isnull().mean() * 100

# Sort while the values are still numeric: sorting the string form orders them
# lexicographically (e.g. '9.5' would rank above '10.98'). A stable sort keeps
# tied columns in their original order. Rounding removes float noise such as
# 10.979999999999999 before the '%' suffix is appended for display.
missing_percent = (
    missing_percent
    .sort_values(ascending=False, kind='stable')
    .round(2)
    .astype(str)
    + '%'
)

print("Percentage of missing values in each column : ", '\n')

missing_percent



Percentage of missing values in each column :  



FastagID                 10.979999999999999%
Transaction_ID                          0.0%
Timestamp                               0.0%
Vehicle_Type                            0.0%
TollBoothID                             0.0%
Lane_Type                               0.0%
Vehicle_Dimensions                      0.0%
Transaction_Amount                      0.0%
Amount_paid                             0.0%
Geographical_Location                   0.0%
Vehicle_Speed                           0.0%
Vehicle_Plate_Number                    0.0%
Fraud_indicator                         0.0%
dtype: object

In [204]:
# Absolute number of rows that have no FastagID recorded.
print("Number of missing values in FastagID : ", '\n')

print(df['FastagID'].isnull().sum())

Number of missing values FatagID :  

549


<!-- --------------- -->

DROPPING ROWS WITH MISSING VALUES AS THERE ARE ONLY 549

In [205]:
# Remove every row whose FastagID is null. The frame is modified in place, so
# the original rows are no longer available after this cell runs.
df.dropna(subset=['FastagID'], inplace=True)

<!-- ---- -->

DATASET EXPLORATION

In [206]:
# Print each column's dtype and non-null count for the frame as it stands
# after the rows with a null FastagID were dropped.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4451 entries, 0 to 4999
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Transaction_ID         4451 non-null   int64 
 1   Timestamp              4451 non-null   object
 2   Vehicle_Type           4451 non-null   object
 3   FastagID               4451 non-null   object
 4   TollBoothID            4451 non-null   object
 5   Lane_Type              4451 non-null   object
 6   Vehicle_Dimensions     4451 non-null   object
 7   Transaction_Amount     4451 non-null   int64 
 8   Amount_paid            4451 non-null   int64 
 9   Geographical_Location  4451 non-null   object
 10  Vehicle_Speed          4451 non-null   int64 
 11  Vehicle_Plate_Number   4451 non-null   object
 12  Fraud_indicator        4451 non-null   object
dtypes: int64(4), object(9)
memory usage: 486.8+ KB


In [207]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,Transaction_ID,Transaction_Amount,Amount_paid,Vehicle_Speed
count,4451.0,4451.0,4451.0,4451.0
mean,2466.22714,180.927881,158.684565,67.884745
std,1428.941144,103.004437,99.857565,16.632295
min,1.0,0.0,0.0,10.0
25%,1254.5,110.0,100.0,55.0
50%,2405.0,140.0,120.0,67.0
75%,3702.5,300.0,180.0,82.0
max,5000.0,350.0,350.0,118.0


<!-- --- -->

CHECKING DUPLICATES

In [208]:
# Flag rows that exactly repeat an earlier row across every column, then show them.
duplicate_mask = df.duplicated()
df.loc[duplicate_mask]

# No duplicates found

Unnamed: 0,Transaction_ID,Timestamp,Vehicle_Type,FastagID,TollBoothID,Lane_Type,Vehicle_Dimensions,Transaction_Amount,Amount_paid,Geographical_Location,Vehicle_Speed,Vehicle_Plate_Number,Fraud_indicator


<!-- --- -->

In [209]:
# Column names of the current frame as a plain Python list.
df.columns.tolist()

['Transaction_ID',
 'Timestamp',
 'Vehicle_Type',
 'FastagID',
 'TollBoothID',
 'Lane_Type',
 'Vehicle_Dimensions',
 'Transaction_Amount',
 'Amount_paid',
 'Geographical_Location',
 'Vehicle_Speed',
 'Vehicle_Plate_Number',
 'Fraud_indicator']

<!-- ---- -->

CATEGORICAL COLUMNS WITH DISCRETE VALUES

In [210]:
# Every object-dtype column except FastagID, Vehicle_Plate_Number and Timestamp.
skip_cols = ('FastagID', 'Vehicle_Plate_Number', 'Timestamp')
value_counts = [
    name
    for name in df.select_dtypes(include='object').columns
    if name not in skip_cols
]

# Print the frequency table of each selected column, followed by a separator line.
for col in value_counts:
    counts = df[col].value_counts()
    print(counts, '\n')
    print('------------------')

Vehicle_Type
Bus           716
Car           714
Truck         714
Van           714
Sedan         714
SUV           714
Motorcycle    165
Name: count, dtype: int64 

------------------
TollBoothID
B-102    1432
A-101    1428
C-103    1426
D-106     165
Name: count, dtype: int64 

------------------
Lane_Type
Regular    2309
Express    2142
Name: count, dtype: int64 

------------------
Vehicle_Dimensions
Large     2144
Medium    1428
Small      879
Name: count, dtype: int64 

------------------
Geographical_Location
12.84197701525119, 77.67547528176169     927
12.936687032945434, 77.53113977439017    927
13.21331620748757, 77.55413526894684     880
13.042660878688794, 77.47580097259879    861
13.059816123454882, 77.77068662374292    856
Name: count, dtype: int64 

------------------
Fraud_indicator
Not Fraud    3468
Fraud         983
Name: count, dtype: int64 

------------------


<!-- ---- -->

# FEATURE ENGINEERING

<!-- - -->

EXTRACTING RELEVANT FEATURES

In [211]:
# Parse the raw timestamp strings. The format is stated explicitly
# (month/day/year hour:minute, e.g. '1/6/2023 11:20' is 6 January) so the
# result does not depend on pandas' version-specific format inference for
# ambiguous day/month values.
df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%m/%d/%Y %H:%M')

# Calendar features derived from the parsed timestamp.
df['day'] = df['Timestamp'].dt.day
df['month'] = df['Timestamp'].dt.month
df['year'] = df['Timestamp'].dt.year
df['DayOfWeek'] = df['Timestamp'].dt.dayofweek  # Monday = 0, Tuesday = 1, ... Sunday = 6

# 1 when the day is Saturday (5) or Sunday (6), else 0. Computed with a
# vectorized membership test instead of a per-row Python lambda.
df['is_Weekend'] = df['DayOfWeek'].isin([5, 6]).astype('int64')

In [212]:
# Keep the first two characters of the plate number (e.g. 'KA' from
# 'KA11AB1234') in a new State_code column.
df['State_code'] = df['Vehicle_Plate_Number'].str.slice(stop=2)

<!-- ----- -->

DROPPING IRRELEVANT COLUMNS

In [213]:
# Columns to discard. Timestamp has already been expanded into the
# day / month / DayOfWeek / is_Weekend features, and State_code has been
# taken from Vehicle_Plate_Number.
columns_to_drop = [
    'Transaction_ID',
    'Vehicle_Plate_Number',
    'year',
    'Timestamp',
    'Geographical_Location',
    'FastagID',
]
# `columns=` already targets the column axis, so no separate axis argument is
# needed. The frame is modified in place.
df.drop(columns=columns_to_drop, inplace=True)

<!-- --------- -->

CHECKING EACH TYPE OF COLUMN

In [214]:
# Split the remaining columns by dtype: object columns are reported as
# categorical, all other dtypes as numerical.
categorical_col = list(df.select_dtypes(include='object').columns)
numerical_col = list(df.select_dtypes(exclude='object').columns)

total_columns = len(df.columns)
print('There are total :' , total_columns , 'columns\n')

print(f'There are total {len(categorical_col)} categorical columns : ' , categorical_col , '\n')

print(f'There are total {len(numerical_col)} numerical columns : '  , numerical_col)

There are total : 13 columns

There are total 6 categorical columns :  ['Vehicle_Type', 'TollBoothID', 'Lane_Type', 'Vehicle_Dimensions', 'Fraud_indicator', 'State_code'] 

There are total 7 numerical columns :  ['Transaction_Amount', 'Amount_paid', 'Vehicle_Speed', 'day', 'month', 'DayOfWeek', 'is_Weekend']


<!-- -------- -->

CHECKING FOR DUPLICATES AFTER REMOVAL OF VEHICLE PLATE NUMBER AND REMOVING IF ANY

In [215]:
# Rows that repeat an earlier row now that the dropped columns no longer
# distinguish them.
duplicate_mask = df.duplicated()
df.loc[duplicate_mask]

Unnamed: 0,Vehicle_Type,TollBoothID,Lane_Type,Vehicle_Dimensions,Transaction_Amount,Amount_paid,Vehicle_Speed,Fraud_indicator,day,month,DayOfWeek,is_Weekend,State_code
1461,Sedan,A-101,Regular,Medium,120,120,81,Not Fraud,15,5,0,0,KA
1478,Car,A-101,Express,Small,100,100,45,Not Fraud,1,6,3,0,KA
1491,Bus,C-103,Regular,Large,340,340,82,Not Fraud,14,6,2,0,KA
2202,Van,B-102,Express,Medium,125,125,61,Not Fraud,11,9,0,0,GA
2529,Car,A-101,Regular,Small,120,120,82,Not Fraud,6,1,4,0,KA
2606,Car,A-101,Express,Small,120,120,48,Not Fraud,5,5,4,0,KA
4261,Van,B-102,Regular,Medium,120,120,43,Not Fraud,23,10,0,0,KA


In [216]:
# Number of rows flagged by duplicated(), i.e. repeats of an earlier identical row.
duplicate_count = df.duplicated().sum()
print("There are :" , duplicate_count , "duplicated rows now")

There are : 7 duplicated rows now


In [217]:
# Drop rows that repeat an earlier row, keeping the first occurrence. The frame
# is modified in place.
# NOTE(review): the duplicate check on the full frame found none, so these rows
# only became identical after Transaction_ID, Timestamp, FastagID and the other
# dropped columns were removed. Confirm that discarding them is intended.
df.drop_duplicates(inplace=True)

<!-- ---- -->

SHAPE OF DATA AFTER REMOVING MISSING VALUES AS WELL AS DUPLICATES

In [218]:
# (rows, columns) after dropping null-FastagID rows, unused columns and duplicate rows.
df.shape

(4444, 13)

<!-- -------------- -->

COLUMNS

In [219]:
# Column names that remain in the cleaned frame.
final_columns = df.columns.tolist()
print('Final set of columns : ', final_columns)

Final set of columns :  ['Vehicle_Type', 'TollBoothID', 'Lane_Type', 'Vehicle_Dimensions', 'Transaction_Amount', 'Amount_paid', 'Vehicle_Speed', 'Fraud_indicator', 'day', 'month', 'DayOfWeek', 'is_Weekend', 'State_code']


<!-- ---- -->

FINAL CLEANED DATASET

In [220]:
# Show the first two rows of the cleaned frame.
df.head(n=2)

Unnamed: 0,Vehicle_Type,TollBoothID,Lane_Type,Vehicle_Dimensions,Transaction_Amount,Amount_paid,Vehicle_Speed,Fraud_indicator,day,month,DayOfWeek,is_Weekend,State_code
0,Bus,A-101,Express,Large,350,120,65,Fraud,6,1,4,0,KA
1,Car,B-102,Regular,Small,120,100,78,Fraud,7,1,5,1,KA


In [223]:
# Final (rows, columns) check immediately before the frame is written to disk.
df.shape

(4444, 13)

In [None]:
# Write the cleaned frame to cleaned_data.csv in the working directory,
# leaving out the index column.
df.to_csv(path_or_buf='cleaned_data.csv', index=False)