## **Name: Suresh Kumar R**
## **CREDIT CARD FRAUD DETECTION**

In [1]:
import pandas as pd

# Load the training and test datasets
train_df = pd.read_csv('/content/fraudTrain.csv')
test_df = pd.read_csv('/content/fraudTest.csv')

# Explore the data
print(train_df.head())
print(test_df.head())

   Unnamed: 0 trans_date_trans_time            cc_num  \
0           0   2019-01-01 00:00:18  2703186189652095   
1           1   2019-01-01 00:00:44      630423337322   
2           2   2019-01-01 00:00:51    38859492057661   
3           3   2019-01-01 00:01:16  3534093764340240   
4           4   2019-01-01 00:03:06   375534208663984   

                             merchant       category     amt      first  \
0          fraud_Rippin, Kub and Mann       misc_net    4.97   Jennifer   
1     fraud_Heller, Gutmann and Zieme    grocery_pos  107.23  Stephanie   
2                fraud_Lind-Buckridge  entertainment  220.11     Edward   
3  fraud_Kutch, Hermiston and Farrell  gas_transport   45.00     Jeremy   
4                 fraud_Keeling-Crist       misc_pos   41.96      Tyler   

      last gender                        street  ...      lat      long  \
0    Banks      F                561 Perry Cove  ...  36.0788  -81.1781   
1     Gill      F  43039 Riley Greens Suite 393  ...  48

In [2]:
# Drop columns that are not useful or have too many unique values in both datasets
columns_to_drop = ['Unnamed: 0','cc_num','first', 'last', 'street', 'city', 'state', 'zip', 'trans_num','lat','long','city_pop','merch_lat','merch_long','unix_time']
train_df = train_df.drop(columns_to_drop, axis=1)
test_df = test_df.drop(columns_to_drop, axis=1)

In [3]:
#DATA PREPROCESSING DATE-TIME EXTRACTION FROM DD-MM-YYYY FORMAT
def process_dataframes(train_df, test_df):
    def extract_features_and_clean(df):
        # Convert 'trans_date_trans_time' to datetime
        df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'], format='%Y-%m-%d %H:%M:%S')

        # Extract datetime features
        df['hour'] = df['trans_date_trans_time'].dt.hour
        df['day'] = df['trans_date_trans_time'].dt.day
        df['day_of_week'] = df['trans_date_trans_time'].dt.dayofweek
        df['month'] = df['trans_date_trans_time'].dt.month
        df['quarter'] = df['trans_date_trans_time'].dt.quarter

        # Drop unnecessary columns
        df = df.drop(['trans_date_trans_time', 'dob'], axis=1)

        # Clean 'merchant' column
        df['merchant'] = df['merchant'].apply(lambda x: x.replace('fraud_', ''))

        return df

    # Process both train and test dataframes
    train_df = extract_features_and_clean(train_df)
    test_df = extract_features_and_clean(test_df)

    return train_df, test_df

# Assuming train_df and test_df are already defined
train_df, test_df = process_dataframes(train_df, test_df)


In [4]:
# Bin the 'amt' column for both datasets
num_bins = 300
for df in [train_df, test_df]:
    df['amt'] = pd.cut(df['amt'], bins=num_bins, labels=False, right=False)

In [5]:
#MAP CATEGORICAL ATTRIBUTES TO BINARY FOR PREDICTION
def map_gender(df):
    df['gender_M'] = df['gender'].apply(lambda x: 1 if x == 'M' else 0)
    df['gender_F'] = df['gender'].apply(lambda x: 1 if x == 'F' else 0)
    return df.drop(['gender'], axis=1)

train_df = map_gender(train_df)
test_df = map_gender(test_df)


In [6]:
#LABEL ENCODING FOR STRING ATTRIBUTES LIKE MERCHANT,JOB AND CATEGORY OF CREDIT
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
def encode(df):
    df['merchant'] = encoder.fit_transform(df['merchant'])
    df['category'] = encoder.fit_transform(df['category'])
    df['job'] = encoder.fit_transform(df['job'])
    return df

train_df = encode(train_df)
test_df = encode(test_df)

In [7]:
#DATASET AFTER PREPROCESSING
print(train_df.head())
print(test_df.head())

   merchant  category  amt  job  is_fraud  hour  day  day_of_week  month  \
0       514         8    0  370         0     0    1            1      1   
1       241         4    1  428         0     0    1            1      1   
2       390         0    2  307         0     0    1            1      1   
3       360         2    0  328         0     0    1            1      1   
4       297         9    0  116         0     0    1            1      1   

   quarter  gender_M  gender_F  
0        1         0         1  
1        1         0         1  
2        1         1         0  
3        1         1         0  
4        1         1         0  
   merchant  category  amt  job  is_fraud  hour  day  day_of_week  month  \
0       319        10    0  275         0    12   21            6      6   
1       591        10    0  392         0    12   21            6      6   
2       611         5    0  259         0    12   21            6      6   
3       222         9    0  407         0

In [8]:
from sklearn.model_selection import train_test_split

# Split the training data into features and target variable
X_train = train_df.drop('is_fraud', axis=1)
y_train = train_df['is_fraud']

# Split the test data into features and target variable
X_test = test_df.drop('is_fraud', axis=1)
y_test = test_df['is_fraud']


In [9]:
from sklearn.preprocessing import StandardScaler

# Normalize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Train Logistic Regression model
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

# Make predictions
y_pred_lr = lr_model.predict(X_test)

# Evaluate the model
print("Logistic Regression Results:")
print(confusion_matrix(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))
print("Accuracy:", accuracy_score(y_test, y_pred_lr))


Logistic Regression Results:
[[553059    515]
 [  2145      0]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.00      0.00      0.00      2145

    accuracy                           1.00    555719
   macro avg       0.50      0.50      0.50    555719
weighted avg       0.99      1.00      0.99    555719

Accuracy: 0.995213408215303


In [11]:
from sklearn.tree import DecisionTreeClassifier

# Train Decision Tree model
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)

# Make predictions
y_pred_dt = dt_model.predict(X_test)

# Evaluate the model
print("Decision Tree Results:")
print(confusion_matrix(y_test, y_pred_dt))
print(classification_report(y_test, y_pred_dt))
print("Accuracy:", accuracy_score(y_test, y_pred_dt))


Decision Tree Results:
[[550412   3162]
 [  1110   1035]]
              precision    recall  f1-score   support

           0       1.00      0.99      1.00    553574
           1       0.25      0.48      0.33      2145

    accuracy                           0.99    555719
   macro avg       0.62      0.74      0.66    555719
weighted avg       1.00      0.99      0.99    555719

Accuracy: 0.9923126616149529


In [12]:
# Create a DataFrame with the predictions
test_df['is_fraud_pred'] = y_pred_dt

# Save to CSV
test_df[['is_fraud', 'is_fraud_pred']].to_csv('creditcard_test_predictions.csv', index=False)
