In [5]:
##Import Libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, mean_squared_error
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

In [6]:
##Filter warnings
import warnings 
warnings.filterwarnings('ignore')

In [7]:
##Load the dataset and check the top 5 rows
fraud_df = pd.read_csv("fraud_detection_data_final.csv")
fraud_df.head()

Unnamed: 0,amt,trans_hour,category_food_dining,category_gas_transport,category_grocery_net,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,...,day_Saturday,day_Sunday,day_Thursday,day_Tuesday,day_Wednesday,age_46-60,age_61-75,age_< 30,age_> 75,is_fraud
0,134.62,23,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
1,109.95,5,1,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
2,20.17,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,1
3,839.06,22,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,1
4,4.33,10,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0


In [8]:
#Check the datatypes of the columns
fraud_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4729 entries, 0 to 4728
Data columns (total 27 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   amt                      4729 non-null   float64
 1   trans_hour               4729 non-null   int64  
 2   category_food_dining     4729 non-null   int64  
 3   category_gas_transport   4729 non-null   int64  
 4   category_grocery_net     4729 non-null   int64  
 5   category_grocery_pos     4729 non-null   int64  
 6   category_health_fitness  4729 non-null   int64  
 7   category_home            4729 non-null   int64  
 8   category_kids_pets       4729 non-null   int64  
 9   category_misc_net        4729 non-null   int64  
 10  category_misc_pos        4729 non-null   int64  
 11  category_personal_care   4729 non-null   int64  
 12  category_shopping_net    4729 non-null   int64  
 13  category_shopping_pos    4729 non-null   int64  
 14  category_travel         

In [10]:
y = fraud_df['is_fraud']
X = fraud_df.iloc[:, :-1]
gender_arr = X['gender_M']
X.drop(columns=['gender_M'], inplace=True)

In [12]:
X_test, X_val, y_test, y_val, gender_test, gender_val = train_test_split(X, y, gender_arr, random_state=1, test_size=0.3)

In [13]:
log_reg = LogisticRegression(penalty='none', solver='lbfgs', random_state=0, max_iter=500)

In [14]:
log_reg.fit(X_test, y_test)

### Q1. Logistic Regression: Fraud Transactions

What are the percentages of transactions labeled fraudulent for the male and female gender groups respectively?

**Note**: You can reuse the code that you wrote in the previous module or you can go ahead and retrain the logistic regression model on the data

- Use penalty='none', solver='lbfgs', random_state=0, max_iter=500 for the model.

- Use test_size=0.3, random_state = 1 for splitting the data.

In [15]:
genders = list(set(gender_arr))

In [16]:
genders

[0, 1]

In [34]:
## Write your code here
gender_table = []

for gender in genders:
    curr_gender_X_val = X_val[gender_val == gender]
    curr_gender_y_val = y_val[gender_val == gender]

    curr_gender_score = log_reg.score(curr_gender_X_val, curr_gender_y_val)

    # CUurr gender fraudlent.
    curr_gender_fraud = log_reg.predict(curr_gender_X_val)==1
    curr_gender_fraud_percentage = sum(curr_gender_fraud)/len(curr_gender_y_val)
    
    # CUurr gender non fraudlent.
    curr_gender_non_fraud = log_reg.predict(curr_gender_X_val)==0
    curr_gender_non_fraud_percentage = sum(curr_gender_non_fraud)/len(curr_gender_y_val)

    curr_cf = confusion_matrix(curr_gender_y_val, log_reg.predict(curr_gender_X_val))
    curr_fn_count = curr_cf[1,0]
    curr_fp_count = curr_cf[0,1]
    fn_count_percentage = curr_fn_count/sum(curr_gender_y_val)
    fp_count_percentage = curr_fp_count/sum(curr_gender_y_val)
    
    gender_table.append([curr_gender_score, curr_gender_fraud_percentage, curr_gender_non_fraud_percentage, fn_count_percentage, fp_count_percentage])

In [35]:
pd.DataFrame(gender_table, columns=['Accuracy', 'Fraug Percentage', 'Non fraud Percentage', 'FN', 'FP'], index=genders)

Unnamed: 0,Accuracy,Fraug Percentage,Non fraud Percentage,FN,FP
0,0.792633,0.444748,0.555252,0.251429,0.182857
1,0.835277,0.501458,0.498542,0.183562,0.126027


### Q2. Logistic Regression: Non-Fraud Transactions

What are the percentages of fraudulent transactions that were classified as non-fraud for the male and female gender groups respectively?

In [None]:
## Write your code here


### Q3. Logistic Regression

Which of the following statements are true about this model?

- The fraction of fraudulent transactions that were classified as non-fraud is higher for the male gender group compared to the female gender group
- The model classifies a larger fraction of the male gender group transactions as fraudulent transactions compared to the female gender group dataset
- There is a higher chance of incorrectly classifying a fraudulent transaction as non-fraud by a female customer compared to a fraudulent transaction by a male customer
- There is a higher chance of correctly classifying a fraudulent transaction as non-fraud by a female customer compared to a fraudulent transaction by a male customer