<center><h1><b>FEATURE ENGINEERING</b></h1></center>

In [79]:
# importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns 
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from category_encoders import TargetEncoder
from sklearn.model_selection import train_test_split

In [80]:
data = pd.read_csv(r"data\FastagFraudDetection.csv")

In [81]:
data.columns

Index(['Transaction_ID', 'Timestamp', 'Vehicle_Type', 'FastagID',
       'TollBoothID', 'Lane_Type', 'Vehicle_Dimensions', 'Transaction_Amount',
       'Amount_paid', 'Geographical_Location', 'Vehicle_Speed',
       'Vehicle_Plate_Number', 'Fraud_indicator'],
      dtype='object')

In [82]:
# Dropping unique identfier features and Timestamp
feature_drop_list = ['Transaction_ID','Timestamp', 'FastagID', 'Vehicle_Plate_Number']

In [83]:
data.drop(feature_drop_list, axis = 1, inplace = True)

In [84]:
data.head()

Unnamed: 0,Vehicle_Type,TollBoothID,Lane_Type,Vehicle_Dimensions,Transaction_Amount,Amount_paid,Geographical_Location,Vehicle_Speed,Fraud_indicator
0,Bus,A-101,Express,Large,350,120,"13.059816123454882, 77.77068662374292",65,Fraud
1,Car,B-102,Regular,Small,120,100,"13.059816123454882, 77.77068662374292",78,Fraud
2,Motorcycle,D-104,Regular,Small,0,0,"13.059816123454882, 77.77068662374292",53,Not Fraud
3,Truck,C-103,Regular,Large,350,120,"13.059816123454882, 77.77068662374292",92,Fraud
4,Van,B-102,Express,Medium,140,100,"13.059816123454882, 77.77068662374292",60,Fraud


In [85]:
label_encoder = LabelEncoder()

In [86]:
data["Fraud_indicator"] = label_encoder.fit_transform(data["Fraud_indicator"])

In [87]:
data.head()

Unnamed: 0,Vehicle_Type,TollBoothID,Lane_Type,Vehicle_Dimensions,Transaction_Amount,Amount_paid,Geographical_Location,Vehicle_Speed,Fraud_indicator
0,Bus,A-101,Express,Large,350,120,"13.059816123454882, 77.77068662374292",65,0
1,Car,B-102,Regular,Small,120,100,"13.059816123454882, 77.77068662374292",78,0
2,Motorcycle,D-104,Regular,Small,0,0,"13.059816123454882, 77.77068662374292",53,1
3,Truck,C-103,Regular,Large,350,120,"13.059816123454882, 77.77068662374292",92,0
4,Van,B-102,Express,Medium,140,100,"13.059816123454882, 77.77068662374292",60,0


In [88]:
X = data.drop('Fraud_indicator', axis = 1)
y = data["Fraud_indicator"]

In [89]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)

In [90]:
train_data = X_train.copy()
train_data["Fraud_indicator"] = y_train.copy()
test_data = X_test.copy()
test_data["Fraud_indicator"] = y_test.copy()

In [91]:
# categorical features
encoding_features = [feature for feature in train_data.columns if train_data[feature].dtypes == 'O']
encoding_features

['Vehicle_Type',
 'TollBoothID',
 'Lane_Type',
 'Vehicle_Dimensions',
 'Geographical_Location']

In [92]:
# types of encoding
target_encoding_features = ['Vehicle_Type', 'TollBoothID', 'Geographical_Location']
label_encoding_features = ['Lane_Type']
ordinal_encoding_features = ['Vehicle_Dimensions']

In [93]:
target_encoder = TargetEncoder(cols = target_encoding_features)
X_train[target_encoding_features] = target_encoder.fit_transform(X_train[target_encoding_features], train_data["Fraud_indicator"])

In [94]:
X_train['Lane_Type'] = label_encoder.fit_transform(X_train['Lane_Type'])

In [95]:
Vehicle_Dimensions = {'Large' : 3, 'Medium' : 2, 'Small' : 1}

In [96]:
X_train["Vehicle_Dimensions"] = X_train["Vehicle_Dimensions"].apply(lambda x: Vehicle_Dimensions[x])

In [97]:
X_train.head()

Unnamed: 0,Vehicle_Type,TollBoothID,Lane_Type,Vehicle_Dimensions,Transaction_Amount,Amount_paid,Geographical_Location,Vehicle_Speed
4227,0.79021,0.811518,1,2,110,110,0.834532,44
4676,0.744463,0.74569,0,3,145,145,0.866667,61
800,1.0,1.0,1,1,0,0,0.816813,45
3671,1.0,1.0,1,1,0,0,0.777778,96
4193,0.744463,0.74569,1,3,140,140,0.834532,74


In [98]:
X_train["Amount_Frauded"] = X_train["Transaction_Amount"] - X_train["Amount_paid"]
X_train.drop(["Transaction_Amount", "Amount_paid"], axis = 1, inplace = True)

In [99]:
# scaling
scaler = MinMaxScaler()

In [100]:
scaler.fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train), columns = X_train.columns)

In [101]:
X_train

Unnamed: 0,Vehicle_Type,TollBoothID,Lane_Type,Vehicle_Dimensions,Geographical_Location,Vehicle_Speed,Amount_Frauded
0,0.179021,0.258852,1.0,0.5,0.736690,0.314815,0.216216
1,0.000000,0.000000,0.0,1.0,1.000000,0.472222,0.216216
2,1.000000,1.000000,1.0,0.0,0.591496,0.324074,0.216216
3,1.000000,1.000000,1.0,0.0,0.271639,0.796296,0.216216
4,0.000000,0.000000,1.0,1.0,0.736690,0.592593,0.216216
...,...,...,...,...,...,...,...
3995,0.344363,0.258852,0.0,0.0,0.736690,0.527778,0.216216
3996,0.018234,0.000000,0.0,0.5,0.000000,0.472222,0.216216
3997,0.018234,0.000000,0.0,0.5,0.591496,0.388889,0.216216
3998,0.179021,0.258852,0.0,0.5,0.271639,0.527778,0.270270


<center><h1><b>FEATURE SELECTION</b></h1></center>

In [102]:
# Lasso regularization
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

In [103]:
feature_selection = SelectFromModel(Lasso(alpha=0.005, random_state=42))
feature_selection.fit(X_train, y_train)

In [104]:
feature_selection.get_support()

array([ True, False, False,  True,  True, False,  True])

In [107]:
print(f"Selected features are: {[feature for feature in X_train.columns[feature_selection.get_support()]]}")

Selected features are: ['Vehicle_Type', 'Vehicle_Dimensions', 'Geographical_Location', 'Amount_Frauded']


In [108]:
X_train = X_train[X_train.columns[feature_selection.get_support()]]

In [109]:
X_train

Unnamed: 0,Vehicle_Type,Vehicle_Dimensions,Geographical_Location,Amount_Frauded
0,0.179021,0.5,0.736690,0.216216
1,0.000000,1.0,1.000000,0.216216
2,1.000000,0.0,0.591496,0.216216
3,1.000000,0.0,0.271639,0.216216
4,0.000000,1.0,0.736690,0.216216
...,...,...,...,...
3995,0.344363,0.0,0.736690,0.216216
3996,0.018234,0.5,0.000000,0.216216
3997,0.018234,0.5,0.591496,0.216216
3998,0.179021,0.5,0.271639,0.270270


In [110]:
selected_features = X_train.columns

In [111]:
X_test["Amount_Frauded"] = X_test["Transaction_Amount"] - X_test["Amount_paid"]
X_test.drop(["Transaction_Amount", "Amount_paid"], axis = 1, inplace = True)

In [112]:
X_test[target_encoding_features] = target_encoder.transform(X_test[target_encoding_features], test_data["Fraud_indicator"])
X_test['Lane_Type'] = label_encoder.fit_transform(X_test['Lane_Type'])
X_test['Vehicle_Dimensions'] = X_test['Vehicle_Dimensions'].apply(lambda x: Vehicle_Dimensions[x]) 

In [113]:
X_test = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns)

In [114]:
X_test = X_test[selected_features]

In [76]:
X_test

Unnamed: 0,Vehicle_Type,Vehicle_Dimensions,Geographical_Location,Amount_Frauded
0,0.164413,1.0,0.736690,0.216216
1,1.000000,0.0,0.000000,0.216216
2,0.000000,1.0,0.000000,0.216216
3,0.179021,0.5,0.271639,0.216216
4,0.179021,0.5,0.591496,0.216216
...,...,...,...,...
995,0.000000,1.0,1.000000,0.216216
996,0.164413,1.0,1.000000,0.216216
997,0.052780,1.0,0.591496,0.216216
998,0.344363,0.0,0.000000,0.216216


In [116]:
# exporting as csv(s)
csv_files = {
    'X_train' : X_train,
    'X_test' : X_test,
    'y_train' : y_train,
    'y_test' : y_test 
}

for name, df in csv_files.items():
    df.to_csv(r"data\\{}.csv".format(name), index = None)