# Title: Fraud Detection Model

In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn import preprocessing

# Load the dataset
data = pd.read_csv(r"C:\C-DRIVE Downloads\archive\onlinefraud.csv")

# Display the first and last few rows of the dataset
data.head()


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [19]:
pwd

'C:\\Users\\illas'

In [20]:
data.tail()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
6362615,743,CASH_OUT,339682.13,C786484425,339682.13,0.0,C776919290,0.0,339682.13,1,0
6362616,743,TRANSFER,6311409.28,C1529008245,6311409.28,0.0,C1881841831,0.0,0.0,1,0
6362617,743,CASH_OUT,6311409.28,C1162922333,6311409.28,0.0,C1365125890,68488.84,6379898.11,1,0
6362618,743,TRANSFER,850002.52,C1685995037,850002.52,0.0,C2080388513,0.0,0.0,1,0
6362619,743,CASH_OUT,850002.52,C1280323807,850002.52,0.0,C873221189,6510099.11,7360101.63,1,0


In [21]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


In [22]:
data.shape

(6362620, 11)

In [23]:
data.isnull().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [24]:
# Count values in 'isFraud' and 'isFlaggedFraud' columns
data['isFraud'].value_counts()
data['isFlaggedFraud'].value_counts()

isFlaggedFraud
0    6362604
1         16
Name: count, dtype: int64

In [25]:

# Drop 'nameOrig' and 'nameDest' columns
data = data.drop(['nameOrig', 'nameDest'], axis=1)

In [26]:
# Encode the 'type' column using label encoding
label_encoder = preprocessing.LabelEncoder()
data['type'] = label_encoder.fit_transform(data['type'])

In [27]:
data

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,3,9839.64,170136.00,160296.36,0.00,0.00,0,0
1,1,3,1864.28,21249.00,19384.72,0.00,0.00,0,0
2,1,4,181.00,181.00,0.00,0.00,0.00,1,0
3,1,1,181.00,181.00,0.00,21182.00,0.00,1,0
4,1,3,11668.14,41554.00,29885.86,0.00,0.00,0,0
...,...,...,...,...,...,...,...,...,...
6362615,743,1,339682.13,339682.13,0.00,0.00,339682.13,1,0
6362616,743,4,6311409.28,6311409.28,0.00,0.00,0.00,1,0
6362617,743,1,6311409.28,6311409.28,0.00,68488.84,6379898.11,1,0
6362618,743,4,850002.52,850002.52,0.00,0.00,0.00,1,0


In [28]:
# Split the data into features (X) and target (y)
X, y = data.loc[:, data.columns != 'isFraud'], data['isFraud']

In [29]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [30]:
# Standardize the features using StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [31]:
 # Create a Gaussian Naive Bayes model
gnb = GaussianNB()

# Train the model using the training sets
gnb.fit(X_train, y_train)

# Predict the response for the test dataset
y_pred = gnb.predict(X_test)

# Calculate and print the accuracy of the model
print("Gaussian Naive Bayes Accuracy:", metrics.accuracy_score(y_test, y_pred))

# Create a Logistic Regression model
model = LogisticRegression(random_state=0)

# Train the model using the training sets
model.fit(X_train, y_train)

# Predict the response for the test dataset
y_pred = model.predict(X_test)

# Calculate and print the accuracy of the model
print("Logistic Regression Accuracy:", metrics.accuracy_score(y_test, y_pred))


Gaussian Naive Bayes Accuracy: 0.9959110135971241
Logistic Regression Accuracy: 0.9992162557772323


In [33]:
model.score(X_test,y_test)

0.9992162557772323

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn import preprocessing

# Load the dataset
data = pd.read_csv(r"C:\C-DRIVE Downloads\archive\onlinefraud.csv")

# Display the first and last few rows of the dataset
data.head()

data.tail()

# Count values in 'isFraud' and 'isFlaggedFraud' columns
data['isFraud'].value_counts()
data['isFlaggedFraud'].value_counts()


# Drop 'nameOrig' and 'nameDest' columns
data = data.drop(['nameOrig', 'nameDest'], axis=1)

# Encode the 'type' column using label encoding
label_encoder = preprocessing.LabelEncoder()
data['type'] = label_encoder.fit_transform(data['type'])

# Split the data into features (X) and target (y)
X, y = data.loc[:, data.columns != 'isFraud'], data['isFraud']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40, random_state=42)

# Standardize the features using StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

 # Create a Gaussian Naive Bayes model
gnb = GaussianNB()

# Train the model using the training sets
gnb.fit(X_train, y_train)

# Predict the response for the test dataset
y_pred = gnb.predict(X_test)

# Calculate and print the accuracy of the model
print("Gaussian Naive Bayes Accuracy:", metrics.accuracy_score(y_test, y_pred))

# Create a Logistic Regression model
model = LogisticRegression(random_state=0)

# Train the model using the training sets
model.fit(X_train, y_train)

# Predict the response for the test dataset
y_pred = model.predict(X_test)

# Calculate and print the accuracy of the model
print("Logistic Regression Accuracy:", metrics.accuracy_score(y_test, y_pred))

model.score(X_test,y_test)


In [None]:
model.score(X_test,y_test)

In [None]:
'''Standard scaling, also known as standardization or z-score normalization, is a common preprocessing technique used in 
various data analysis and machine learning tasks. It involves transforming the features of a dataset to have a mean of 0
and a standard deviation of 1. Here are some reasons why standard scaling is used:

Normalization of Features: In many machine learning algorithms, the scale of different features can vary widely. 
Some features may have values in the tens or hundreds, while others may have values in the thousands or even millions.
This can lead to numerical instability in certain algorithms and can cause them to be sensitive to the scale of the features.
Standard scaling brings all features to a common scale, preventing issues related to scale disparities.

Facilitation of Gradient Descent: In optimization algorithms like gradient descent, which are commonly used in machine
learning for model training, having features on the same scale can help the algorithm converge faster and more reliably. 
It ensures that the steps taken during optimization are consistent across all features.

Improvement of Algorithm Performance: Many machine learning algorithms assume that features are roughly normally 
distributed and have zero mean and unit variance. Standard scaling helps fulfill these assumptions and can lead to
improved model performance.

Interpretability: Standard scaling does not change the relationship between features, only their scale. This means that 
after standard scaling, the interpretation of feature importance or coefficients in a model becomes more straightforward 
because the scale is consistent.

Regularization: Regularization techniques like L1 and L2 regularization (used in linear models such as logistic regression
and linear regression) penalize the magnitude of coefficients. Scaling features can ensure that all features are penalized
equally, preventing one feature with a large scale from dominating the regularization term.

Distance-Based Algorithms: Algorithms that rely on distance calculations, such as k-means clustering or support vector 
machines, can be sensitive to feature scales. Standard scaling ensures that all features contribute equally to distance 
calculations.

Visualization: When visualizing data or conducting exploratory data analysis (EDA), it can be helpful to have features 
on the same scale. This makes it easier to create meaningful visualizations and compare the distributions of different 
features.

Reducing Sensitivity to Outliers: Standard scaling can make certain machine learning algorithms less sensitive to outliers
because it scales the data based on the mean and standard deviation, which are less affected by extreme values compared 
to the raw data.

In summary, standard scaling is a valuable preprocessing step that helps improve the performance and stability of various
machine learning algorithms and makes data more amenable to analysis and interpretation. However, it's essential to consider
the nature of your data and the requirements of your specific problem before deciding whether to use standard scaling or 
other preprocessing techniques.



