Machine Learning Model Development
Task: Develop and deploy a machine learning model to solve a specific business problem.

Details:

Problem Definition: Identify a business problem that can be addressed with machine learning.
Data Collection: Gather and preprocess relevant data.
Model Selection: Choose and implement machine learning algorithms (e.g., classification, regression,
clustering).
Evaluation: Assess model performance using metrics like accuracy, precision, recall, and F1 score.
Deployment: Deploy the model into a production environment or integrate it with an application.
Where to Do It:
Jupyter Notebook: Develop and test machine learning models.
Google Colab: Use for developing models with cloud-based resources.
AWS SageMaker: Deploy and manage machine learning models on AWS.

# Detecting credit card fraud with a machine learning model

In [25]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, TruncatedSVD
import matplotlib.patches as mpatches
import time

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE

In [4]:
# Load the raw dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('creditcard.csv')

In [6]:
data

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0.0
2,1,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0.0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55546,47004,1.283418,0.354874,0.158519,0.607335,-0.186818,-0.865444,0.084609,-0.186195,0.042276,...,-0.310652,-0.887568,0.084508,-0.012270,0.267755,0.116476,-0.022729,0.027233,0.89,0.0
55547,47005,-0.969584,-0.514812,1.010076,-0.492493,-1.074442,0.812377,1.739011,0.069684,-0.741379,...,0.137958,-0.378729,0.989131,-0.311335,-0.302619,0.720578,-0.171671,0.034326,424.80,0.0
55548,47005,-0.448083,0.936043,1.308078,0.424255,0.141240,-0.224215,1.114183,-0.286478,-0.144553,...,0.084264,0.345878,-0.222427,0.052293,0.093368,-0.387899,-0.354380,-0.223041,55.75,0.0
55549,47005,0.616391,-0.770376,-0.306186,1.374006,-0.177289,0.186992,0.417697,0.073573,-0.009001,...,0.182098,-0.039148,-0.409170,-0.315762,0.626198,-0.254219,-0.042365,0.044234,293.00,0.0


In [9]:
print(data.shape)

(55551, 31)


**Helper function: quick overview of a DataFrame**

In [7]:
def quick_overview(data):
    """Print a quick exploratory summary of a DataFrame and return it.

    Object-dtype columns whose values are all numeric once thousands
    separators (",") are stripped are converted to a numeric dtype
    **in place** on ``data``. Columns that cannot be fully converted are
    left exactly as they were (commas included).

    The printed report covers: shape, ``info()``, missing values per
    column, unique values per column, categorical and numeric column
    names, basic statistics / median / quartiles of the numeric columns,
    the number of duplicated rows, ``describe()`` and the column index.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to summarise. It is modified in place by the numeric
        conversion described above.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in.
    """
    for col in data.columns:
        if data[col].dtype == 'object':
            # Strip separators only from real strings, so non-string entries
            # (numbers, None/NaN) of a mixed column are not turned into NaN.
            stripped = data[col].map(
                lambda value: value.replace(',', '') if isinstance(value, str) else value
            )
            try:
                data[col] = pd.to_numeric(stripped)
            except (ValueError, TypeError):
                # Not a numeric column: keep the original values untouched.
                # (``errors='ignore'`` is deprecated in pandas and used to
                # hand back the comma-stripped text instead.)
                continue

    print("===== SHAPE =====")
    print(data.shape)

    print("\n===== INFO =====")
    # info() prints by itself and returns None; wrapping it in print()
    # would add a stray "None" line to the report.
    data.info()

    print("\n===== MISSING VALUES =====")
    print(data.isnull().sum())

    print("\n===== UNIQUE VALUES =====")
    print(data.nunique())

    print("\n=====Categorical columns=====")
    categorical_cols = data.select_dtypes(include='object').columns
    print(categorical_cols)

    print("\n=====Numeric columns======")
    numeric_cols = data.select_dtypes(include='number').columns
    print(numeric_cols)

    if len(numeric_cols) > 0:
        print("\n===== BASIC STATISTICS =====")
        print(data[numeric_cols].describe().T)

        print("\n===== MEDIAN =====")
        print(data[numeric_cols].median())

        print("\n===== QUANTILES =====")
        print(data[numeric_cols].quantile([0.25, 0.5, 0.75]))

    print("\n===== DUPLICATES =====")
    print(data.duplicated().sum())

    print("\n===== Describe =====")
    print(data.describe())

    print("\n===== COLUMNS =====")
    print(data.columns)

    return data

In [8]:
quick_overview(data)

===== SHAPE =====
(55551, 31)

===== INFO =====
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55551 entries, 0 to 55550
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    55551 non-null  int64  
 1   V1      55551 non-null  float64
 2   V2      55551 non-null  float64
 3   V3      55551 non-null  float64
 4   V4      55551 non-null  float64
 5   V5      55551 non-null  float64
 6   V6      55551 non-null  float64
 7   V7      55551 non-null  float64
 8   V8      55551 non-null  float64
 9   V9      55551 non-null  float64
 10  V10     55551 non-null  float64
 11  V11     55551 non-null  float64
 12  V12     55551 non-null  float64
 13  V13     55551 non-null  float64
 14  V14     55551 non-null  float64
 15  V15     55550 non-null  float64
 16  V16     55550 non-null  float64
 17  V17     55550 non-null  float64
 18  V18     55550 non-null  float64
 19  V19     55550 non-null  float64
 20  V20     55550 non-null  

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0.0
2,1,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0.0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55546,47004,1.283418,0.354874,0.158519,0.607335,-0.186818,-0.865444,0.084609,-0.186195,0.042276,...,-0.310652,-0.887568,0.084508,-0.012270,0.267755,0.116476,-0.022729,0.027233,0.89,0.0
55547,47005,-0.969584,-0.514812,1.010076,-0.492493,-1.074442,0.812377,1.739011,0.069684,-0.741379,...,0.137958,-0.378729,0.989131,-0.311335,-0.302619,0.720578,-0.171671,0.034326,424.80,0.0
55548,47005,-0.448083,0.936043,1.308078,0.424255,0.141240,-0.224215,1.114183,-0.286478,-0.144553,...,0.084264,0.345878,-0.222427,0.052293,0.093368,-0.387899,-0.354380,-0.223041,55.75,0.0
55549,47005,0.616391,-0.770376,-0.306186,1.374006,-0.177289,0.186992,0.417697,0.073573,-0.009001,...,0.182098,-0.039148,-0.409170,-0.315762,0.626198,-0.254219,-0.042365,0.044234,293.00,0.0


In [11]:
data[["Class"]].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0.0,55394
1.0,156


In [16]:
data.duplicated().sum()

np.int64(0)

In [18]:
data.isnull().sum()

Unnamed: 0,0
Time,0
V1,0
V2,0
V3,0
V4,0
V5,0
V6,0
V7,0
V8,0
V9,0


In [19]:
# Drop every row that contains at least one missing value. Re-binding the
# name instead of using inplace=True yields the same frame and follows the
# pandas recommendation to avoid in-place mutation.
data = data.dropna()

In [21]:
data.shape

(55313, 31)

In [22]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 55313 entries, 0 to 55549
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    55313 non-null  int64  
 1   V1      55313 non-null  float64
 2   V2      55313 non-null  float64
 3   V3      55313 non-null  float64
 4   V4      55313 non-null  float64
 5   V5      55313 non-null  float64
 6   V6      55313 non-null  float64
 7   V7      55313 non-null  float64
 8   V8      55313 non-null  float64
 9   V9      55313 non-null  float64
 10  V10     55313 non-null  float64
 11  V11     55313 non-null  float64
 12  V12     55313 non-null  float64
 13  V13     55313 non-null  float64
 14  V14     55313 non-null  float64
 15  V15     55313 non-null  float64
 16  V16     55313 non-null  float64
 17  V17     55313 non-null  float64
 18  V18     55313 non-null  float64
 19  V19     55313 non-null  float64
 20  V20     55313 non-null  float64
 21  V21     55313 non-null  float64
 22  V22

In [26]:
# Target: the 'Class' label column. Features: every remaining column.
y = data['Class']
X = data.drop(columns='Class')

In [27]:
# Hold out 20% of the rows for testing. stratify=y keeps the class ratio the
# same in both splits; random_state fixes the shuffle so the split is
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


Standard Scaler

In [28]:
# Fit the scaler on the training split only, then reuse the fitted
# parameters to transform the test split, so no test-set statistics leak
# into preprocessing.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

SMOTE

In [29]:
# Oversample the scaled training split only; the test split is not passed
# to SMOTE and keeps its original class distribution.
smote = SMOTE(random_state=42)
X_train_resample, y_train_resample = smote.fit_resample(X_train_scaled, y_train)

In [30]:
print("Before SMOTE:", X_train_scaled.shape, y_train.shape)
print("After SMOTE:", X_train_resample.shape, y_train_resample.shape)

Before SMOTE: (44250, 30) (44250,)
After SMOTE: (88250, 30) (88250,)


In [31]:

# Class counts of the training labels before and after oversampling.
counts_before = y_train.value_counts().rename("Before SMOTE")
counts_after = y_train_resample.value_counts().rename("After SMOTE")
print(counts_before)
print(counts_after)

Class
0.0    44125
1.0      125
Name: Before SMOTE, dtype: int64
Class
0.0    44125
1.0    44125
Name: After SMOTE, dtype: int64


 Model — Random Forest

In [33]:
# Train a 200-tree random forest on the SMOTE-resampled training data.
model = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    class_weight=None  # training classes are already balanced by SMOTE, so class_weight='balanced' is not needed
)
model.fit(X_train_resample, y_train_resample)

In [34]:

# Hard class predictions for the scaled test split.
y_pred = model.predict(X_test_scaled)
# Column 1 of predict_proba is the probability of the second entry of
# model.classes_ (presumably the 1.0 class, given labels 0.0 / 1.0).
y_prob = model.predict_proba(X_test_scaled)[:, 1]

In [35]:
# Per-class precision / recall / F1 on the held-out test split. The test
# classes are heavily imbalanced, so read the row for class 1.0 rather than
# the overall accuracy.
print("\n📊 Classification Report:")
print(classification_report(y_test, y_pred, digits=4))


📊 Classification Report:
              precision    recall  f1-score   support

         0.0     0.9996    0.9996    0.9996     11032
         1.0     0.8710    0.8710    0.8710        31

    accuracy                         0.9993     11063
   macro avg     0.9353    0.9353    0.9353     11063
weighted avg     0.9993    0.9993    0.9993     11063

