In [None]:
# Location of the training CSV, given relative to the working directory.
file_path = "fraudTrain.csv"

In [None]:
import pandas as pd
# Load the raw transactions into the working frame used by every later cell.
df = pd.read_csv(file_path)

In [None]:
# (rows, columns) of the freshly loaded frame.
df.shape

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Remove the cardholder descriptor columns; none of them is referenced by any
# later cell.
df = df.drop(columns=['first', 'last', 'gender', 'job', 'street', 'dob'])

In [None]:
# Preview the frame after the descriptor columns were removed.
df.head()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) per numeric column.
df.describe()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Remove the leftover CSV index column and the per-transaction identifier.
# `DataFrame.drop` returns a new frame, so the result must be assigned back;
# otherwise both columns stay in `df` and `trans_num` (a string column) is
# later picked up by the categorical encoder as if it were a feature.
df = df.drop(columns=['Unnamed: 0', 'trans_num'])

In [None]:
import pandas as pd
import numpy as np
from geopy.distance import geodesic
from sklearn.preprocessing import StandardScaler
from category_encoders import TargetEncoder


In [None]:
# Discard rows whose coordinates fall outside the valid ranges. Latitude
# columns are checked against [-90, 90] and longitude columns against
# [-180, 180]. `between` is inclusive and evaluates to False for NaN, so rows
# with a missing coordinate are removed as well. Rows with a negative
# population are removed in the same pass.
valid_lat = (-90, 90)
valid_long = (-180, 180)
df = df[
    df['lat'].between(*valid_lat)
    & df['long'].between(*valid_long)
    & df['merch_lat'].between(*valid_lat)
    & df['merch_long'].between(*valid_long)
    & (df['city_pop'] >= 0)
].copy()  # independent frame, so the column writes below never target a view

# Cap the upper tail of each skewed column at its own 99th percentile. The
# percentile is computed once per column, on the validated rows only.
for col in ['amt', 'city_pop']:
    upper_limit = df[col].quantile(0.99)
    df[col] = df[col].clip(upper=upper_limit)

In [None]:
# Turn the transaction timestamp strings into pandas datetimes.
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])

# Cross-check against the epoch-seconds column: collect the rows where the two
# timestamps differ by more than 60 seconds and report how many there are.
df['unix_time_converted'] = pd.to_datetime(df['unix_time'], unit='s')
gap_seconds = (df['trans_date_trans_time'] - df['unix_time_converted']).dt.total_seconds()
inconsistent = df[gap_seconds.abs() > 60]
if not inconsistent.empty:
    print(f"Found {len(inconsistent)} inconsistent timestamps. Consider dropping or correcting.")

# The epoch column and its datetime helper are removed unconditionally,
# whatever the check above reported.
df = df.drop(columns=['unix_time', 'unix_time_converted'])

In [None]:
# Customer-to-merchant distance in kilometres.
def haversine_distance(row):
    """Return the distance in km between a row's customer and merchant points.

    Despite the function name, the value is produced by geopy's ``geodesic``
    (an ellipsoidal-earth distance), not by the spherical haversine formula.

    Parameters
    ----------
    row : mapping
        Must provide ``lat``/``long`` (customer) and ``merch_lat``/``merch_long``
        (merchant), in degrees.

    Returns
    -------
    The ``kilometers`` attribute of the computed geodesic distance.
    """
    return geodesic(
        (row['lat'], row['long']),
        (row['merch_lat'], row['merch_long']),
    ).kilometers

# One distance per transaction; `apply` with axis='columns' calls the helper
# once for every row.
df['distance_km'] = df.apply(haversine_distance, axis='columns')

# Binary indicator: 1 when customer and merchant are more than 100 km apart.
df['large_distance'] = df['distance_km'].gt(100).astype(int)

In [None]:
# Target-encode the high-cardinality columns against the fraud label and store
# the results next to the original columns (the originals are kept).
# NOTE(review): the encoder is fitted on the full frame, i.e. before the
# train/test split performed later in the notebook, so label information from
# the future test rows is baked into these features.
encoder = TargetEncoder(cols=['merchant', 'city', 'state', 'zip'])
df[['merchant_encoded', 'city_encoded', 'state_encoded', 'zip_encoded']] = encoder.fit_transform(
    df[['merchant', 'city', 'state', 'zip']], df['is_fraud']
)

# One-hot encode `category` into `cat_*` indicator columns.
# `dtype=int` is requested explicitly: recent pandas versions default to bool
# indicators, and bool columns are not matched by the numeric-only
# `select_dtypes(include=[np.number])` filter applied before modelling, which
# would silently remove them from the feature matrix.
df = pd.get_dummies(df, columns=['category'], prefix='cat', dtype=int)

In [None]:
# log(1 + x) versions of the two skewed columns; the raw columns are kept.
df = df.assign(
    amt_log=np.log1p(df['amt']),
    city_pop_log=np.log1p(df['city_pop']),
)

# Standardize the continuous features in place (zero mean, unit variance).
# NOTE(review): the scaler is fitted on the whole frame, before the train/test
# split done later in the notebook.
numerical_cols = ['amt_log', 'city_pop_log', 'lat', 'long', 'merch_lat', 'merch_long', 'distance_km']
scaler = StandardScaler().fit(df[numerical_cols])
df[numerical_cols] = scaler.transform(df[numerical_cols])

In [None]:
# Per-card spending baseline: the mean `amt` over all rows sharing a `cc_num`,
# broadcast back to every row of that card.
df['avg_amt_per_cc'] = df.groupby('cc_num')['amt'].transform('mean')

# Signed distance of each transaction from its card's mean amount.
df['amt_deviation'] = df['amt'].sub(df['avg_amt_per_cc'])

# 1 when the absolute deviation exceeds the card's mean amount itself.
df['amt_deviation_flag'] = df['amt_deviation'].abs().gt(df['avg_amt_per_cc']).astype(int)

# The `large_distance` flag was already added together with `distance_km`.

In [None]:
# Spot-check the first two rows after feature engineering.
df.head(2)

In [None]:
from sklearn.preprocessing import LabelEncoder
# Single encoder instance that the following cells refit for each column.
le = LabelEncoder()

In [None]:
# Replace the city and state strings with integer codes. `le` is refitted for
# every column, so only the mapping of the last column stays on the encoder.
for cols in ['city', 'state']:
    df[cols] = le.fit_transform(df[cols])

# Convert every one-hot `cat_*` indicator to 0/1 integers. The columns are
# discovered by prefix instead of a hand-written subset, so no indicator is
# left as bool and later discarded by the numeric-only column filter.
dummy_cols = [name for name in df.columns if str(name).startswith('cat_')]
df = df.astype({name: int for name in dummy_cols})

In [None]:
# Spot-check the first two rows after label encoding.
df.head(2)

In [None]:
# Replace merchant names with integer codes; this refits `le`, discarding the
# mapping it learned for whichever column it was fitted on before.
df['merchant'] = le.fit_transform(df['merchant'])

In [None]:
# Spot-check the first two rows after encoding `merchant`.
df.head(2)

In [None]:
# Class balance of the flag that is used as the model target further down.
df['amt_deviation_flag'].value_counts()

In [None]:
from sklearn.preprocessing import StandardScaler
# Scaler instance that the next cell refits per column.
ss = StandardScaler()

In [None]:
# Standardize each listed column on its own; `ss` is refitted per column.
# 'avg_amt_per_cc' used to appear twice in this list, which only repeated the
# same fit/transform on an already standardized column.
# NOTE(review): the duplicate may have been a typo for another column
# (e.g. 'amt_deviation'); confirm which columns were meant to be scaled.
for i in ['city_pop_log', 'avg_amt_per_cc']:
    # fit_transform returns an (n, 1) array; flatten it before storing it as
    # a single column.
    df[i] = ss.fit_transform(df[[i]]).ravel()

In [None]:
# Spot-check the first two rows after rescaling.
df.head(2)

In [None]:
# Target: the amount-deviation flag. Features: every other column of `df`.
# NOTE(review): `x` still contains `amt`, `avg_amt_per_cc` and `amt_deviation`,
# the columns the flag was computed from, as well as `is_fraud`. Confirm that
# `amt_deviation_flag` (rather than `is_fraud`) is the intended label.
y = df['amt_deviation_flag']
x = df.drop(columns=['amt_deviation_flag'])

In [None]:
# Replace the transaction datetime with whole seconds since the Unix epoch so
# it can be used as a numeric feature.
# Subtracting the epoch and floor-dividing by one second yields seconds
# whatever resolution the datetime column is stored in; casting to int64 and
# dividing by 10**9 is only correct for nanosecond-resolution data.
if 'trans_date_trans_time' in x.columns:
    x['trans_date_trans_time'] = (
        (x['trans_date_trans_time'] - pd.Timestamp("1970-01-01")) // pd.Timedelta("1s")
    )

In [None]:
from category_encoders import TargetEncoder

# Any column that is still text at this point gets target-encoded against `y`.
# This rebinds `encoder`, replacing the encoder fitted earlier in the notebook.
categorical_cols = x.select_dtypes(include=['object', 'string']).columns
if not categorical_cols.empty:
    print("Encoding categorical columns:", categorical_cols)
    encoder = TargetEncoder(cols=categorical_cols)
    encoded_values = encoder.fit_transform(x[categorical_cols], y)
    x[categorical_cols] = encoded_values

In [None]:
# Boolean indicator columns are valid model inputs but are not matched by
# `np.number`, so convert them to 0/1 integers instead of losing them.
bool_cols = x.select_dtypes(include=['bool']).columns
x = x.astype({name: int for name in bool_cols})

# Work out which columns are numeric and which are about to be discarded.
numeric_cols = x.select_dtypes(include=[np.number]).columns
non_numeric_cols = x.columns.difference(numeric_cols, sort=False)

# Report the discarded columns *before* filtering; checking afterwards can
# never find anything because the frame is numeric-only by then.
if len(non_numeric_cols) > 0:
    print("Dropping non-numeric columns:", list(non_numeric_cols))

# Keep only the numeric columns.
x = x[numeric_cols]

In [None]:
import tensorflow as tf

In [None]:
# Materialize features as float32 and labels as int32, then wrap both in
# TensorFlow tensors of the matching dtype.
x = tf.convert_to_tensor(x.to_numpy().astype(np.float32), dtype=tf.float32)
y = tf.convert_to_tensor(y.to_numpy().astype(np.int32), dtype=tf.int32)

In [None]:
import tensorflow as tf
import numpy as np

# Unwrap TensorFlow tensors back into NumPy arrays; anything that is not a
# tensor is passed through untouched.
x, y = (arr.numpy() if isinstance(arr, tf.Tensor) else arr for arr in (x, y))

# Show the resulting container types.
print("Type of x after conversion:", type(x))
print("Type of y after conversion:", type(y))

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed makes the split
# repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from keras.layers import Dense,Dropout,BatchNormalization
from keras.models import Sequential

In [None]:
# Feed-forward binary classifier: two hidden ReLU layers (128 and 64 units)
# with batch normalization and dropout, ending in a single sigmoid unit.
# The input width is taken from the number of columns in `x_train`.
model = Sequential()
model.add(Dense(128, input_dim=x_train.shape[1], activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.4))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))

# Print the layer-by-layer overview.
model.summary()

In [None]:
# Binary cross-entropy matches the single sigmoid output; accuracy is the only
# metric tracked during training.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for 10 epochs in mini-batches of 70 rows; no validation data is passed.
model.fit(
    x_train,
    y_train,
    batch_size=70,
    epochs=10,
)

In [None]:
# Sigmoid outputs for the held-out rows; the result is only displayed, it is
# not stored or scored against `y_test`.
model.predict(x_test)

In [50]:
import joblib

# Persist the trained network by pickling it with joblib.
# NOTE(review): this serializes a Keras model through pickle; consider Keras'
# own `model.save(...)` format if the consumers of 'FNN.pkl' can be changed.
try:
    joblib.dump(model, 'FNN.pkl')
    print("Model saved successfully to 'FNN.pkl'.")
except Exception as e:
    # Any failure is reported on stdout and execution continues.
    print(f"Error saving files: {e}")

INFO:tensorflow:Assets written to: ram://3db918fd-5cfc-4ad4-a2a0-35ec19430491/assets
Model saved successfully to 'FNN.pkl'.
