Machine Learning Model Development
Task: Develop and deploy a machine learning model to solve a specific business problem.

Details:

Problem Definition: Identify a business problem that can be addressed with machine learning.
Data Collection: Gather and preprocess relevant data.
Model Selection: Choose and implement machine learning algorithms (e.g., classification, regression,
clustering).
Evaluation: Assess model performance using metrics like accuracy, precision, recall, and F1 score.
Deployment: Deploy the model into a production environment or integrate it with an application.
Where to Do It:
Jupyter Notebook: Develop and test machine learning models.
Google Colab: Use for developing models with cloud-based resources.
AWS SageMaker: Deploy and manage machine learning models on AWS.

# Detecting credit card fraud with a machine learning model

In [25]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, TruncatedSVD
import matplotlib.patches as mpatches
import time

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE

In [26]:
# Read the transactions file from the notebook's working directory into a DataFrame.
data = pd.read_csv("creditcard.csv")

In [3]:
# Keep only the transactions whose Class label equals 1 (fraud).
fraud_cases = data.loc[data["Class"].eq(1.0)]

# Preview the first 20 fraud rows.
print(fraud_cases.head(20))

# Report how many fraud rows exist in total.
print(f"Number of fraud cases: {fraud_cases.shape[0]}")

# From this point on, print every column instead of eliding the middle ones.
pd.set_option("display.max_columns", None)
print(data.head())



      Time        V1        V2        V3        V4        V5        V6  \
541    406 -2.312227  1.951992 -1.609851  3.997906 -0.522188 -1.426545   
623    472 -3.043541 -3.157307  1.088463  2.288644  1.359805 -1.064823   
4920  4462 -2.303350  1.759247 -0.359745  2.330243 -0.821628 -0.075788   
6108  6986 -4.397974  1.358367 -2.592844  2.679787 -1.128131 -1.706536   
6329  7519  1.234235  3.019740 -4.304597  4.732795  3.624201 -1.357746   
6331  7526  0.008430  4.137837 -6.240697  6.675732  0.768307 -3.353060   
6334  7535  0.026779  4.132464 -6.560600  6.348557  1.329666 -2.513479   
6336  7543  0.329594  3.712889 -5.775935  6.078266  1.667359 -2.420168   
6338  7551  0.316459  3.809076 -5.615159  6.047445  1.554026 -2.651353   
6427  7610  0.725646  2.300894 -5.329976  4.007683 -1.730411 -1.732193   
6446  7672  0.702710  2.426433 -5.234513  4.416661 -2.170806 -2.667554   
6472  7740  1.023874  2.001485 -4.769752  3.819195 -1.271754 -1.734662   
6529  7891 -1.585505  3.261585 -4.1374

In [28]:
# Print the (rows, columns) shape of the loaded frame.
print(data.shape)

(284807, 31)


**Function quick overview**

In [29]:
def quick_overview(data):
    """Print a quick exploratory summary of a DataFrame and return it.

    Object-dtype columns whose values parse as numbers once thousands
    separators (",") are removed are converted to numeric. Columns that do
    not parse are left exactly as they were, including any commas in their
    text.

    The printed report covers shape, ``info()``, missing values, unique
    counts, categorical and numeric column names, basic statistics, median,
    quartiles, duplicate row count, ``describe()`` and the column index.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to summarise. Converted columns are written back into this
        same object, so the caller's frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    for col in data.columns:
        if data[col].dtype == 'object':
            try:
                converted = pd.to_numeric(data[col].str.replace(',', '', regex=False))
            except (ValueError, TypeError, AttributeError):
                # Not a numeric column (or not string-valued at all): keep the
                # original values rather than a comma-stripped copy.
                continue
            data[col] = converted

    print("===== SHAPE =====")
    print(data.shape)

    print("\n===== INFO =====")
    # info() prints its report itself and returns None, so it is not wrapped in print().
    data.info()

    print("\n===== MISSING VALUES =====")
    print(data.isnull().sum())

    print("\n===== UNIQUE VALUES =====")
    print(data.nunique())

    print("\n=====Categorical columns=====")
    categorical_cols = data.select_dtypes(include='object').columns
    print(categorical_cols)

    print("\n=====Numeric columns======")
    numeric_cols = data.select_dtypes(include='number').columns
    print(numeric_cols)

    # The statistics below are only meaningful when at least one numeric column exists.
    if len(numeric_cols) > 0:
        print("\n===== BASIC STATISTICS =====")
        print(data[numeric_cols].describe().T)

        print("\n===== MEDIAN =====")
        print(data[numeric_cols].median())

        print("\n===== QUANTILES =====")
        print(data[numeric_cols].quantile([0.25, 0.5, 0.75]))

    print("\n===== DUPLICATES =====")
    print(data.duplicated().sum())

    print("\n===== Describe =====")
    print(data.describe())

    print("\n===== COLUMNS =====")
    print(data.columns)

    return data

In [30]:
# Print the overview report; the returned frame is shown as the cell's output.
quick_overview(data=data)

===== SHAPE =====
(284807, 31)

===== INFO =====
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20 

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.551600,-0.617801,-0.991390,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,0.489095,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.524980,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,0.507757,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,1.345852,-1.119670,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,4.356170,-1.593105,2.711941,-0.689256,4.626942,-0.924459,1.107641,1.991691,0.510632,-0.682920,1.475829,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,-0.975926,-0.150189,0.915802,1.214756,-0.675143,1.164931,-0.711757,-0.025693,-1.221179,-1.545556,0.059616,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,-0.484782,0.411614,0.063119,-0.183699,-0.510602,1.329284,0.140716,0.313502,0.395652,-0.577252,0.001396,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,-0.399126,-1.933849,-0.962886,-1.042082,0.449624,1.962563,-0.608577,0.509928,1.113981,2.897849,0.127434,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,0


In [32]:
# (number of rows, number of columns)
len(data.index), len(data.columns)

(284807, 31)

In [33]:
# Number of rows per Class label.
data.value_counts(subset=["Class"])

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0,284315
1,492


In [34]:
# Count rows that repeat an earlier row exactly (the first occurrence is not counted).
data.duplicated(keep="first").sum()

np.int64(1081)

In [35]:
# Missing values per column.
data.isna().sum()

Unnamed: 0,0
Time,0
V1,0
V2,0
V3,0
V4,0
V5,0
V6,0
V7,0
V8,0
V9,0


In [37]:
# Column names, non-null counts and dtypes of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [38]:
# Target: the Class label column.
y = data["Class"]
# Features: every remaining column.
X = data.drop(columns="Class")

Standard Scaler and SMOTE

In [50]:
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE


# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions equal in both splits; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Scaling is fitted only on the training data, then applied unchanged to the
# test data so no test-set statistics influence the transformation.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)   # fit + transform: train only
X_test_scaled = scaler.transform(X_test)         # transform only

# Resampling (SMOTE) is applied to the scaled training data only; the test
# set keeps its original class balance.
sm = SMOTE(random_state=42)
X_train_resample, y_train_resample = sm.fit_resample(X_train_scaled, y_train)

# Model training and prediction happen in the next cell, where `model` is defined.


In [51]:
# Per-class row counts of the training labels before and after oversampling.
for caption, labels in (("Before SMOTE:", y_train), ("After SMOTE:", y_train_resample)):
    print(caption, np.bincount(labels))

Before SMOTE: [227451    394]
After SMOTE: [227451 227451]


In [None]:
# Fit a 200-tree random forest on the SMOTE-balanced, scaled training data.
# fit() returns the estimator itself, so construction and training are chained.
model = RandomForestClassifier(
    n_estimators=200, max_depth=None, class_weight=None, random_state=42
).fit(X_train_resample, y_train_resample)

# Class predictions and positive-class probabilities on the untouched test set.
y_pred, y_prob = (
    model.predict(X_test_scaled),
    model.predict_proba(X_test_scaled)[:, 1],
)

print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

print("\nClassification Report:", classification_report(y_test, y_pred, digits=4), sep="\n")

# Area under the ROC curve, computed from the positive-class probabilities.
auc = roc_auc_score(y_test, y_prob)
print(f"\nROC-AUC Score: {auc:.4f}")

In [47]:
# Label frequencies of the training target before and after SMOTE, as named Series.
print(y_train.value_counts().rename("Before SMOTE"))
print(y_train_resample.value_counts().rename("After SMOTE"))

Class
0    227451
1       394
Name: Before SMOTE, dtype: int64
Class
0    227451
1    227451
Name: After SMOTE, dtype: int64


 Model — Random Forest

In [19]:
# The training set was already balanced with SMOTE, so class_weight stays at
# None instead of "balanced".
rf_settings = dict(n_estimators=200, random_state=42, class_weight=None)
model = RandomForestClassifier(**rf_settings)
model.fit(X_train_resample, y_train_resample)

In [48]:

# Class predictions and positive-class probabilities for the scaled test set.
y_pred, y_prob = (
    model.predict(X_test_scaled),
    model.predict_proba(X_test_scaled)[:, 1],
)

In [49]:
# Per-class precision / recall / F1 at the default decision threshold.
print("\n📊 Classification Report:", classification_report(y_test, y_pred, digits=4), sep="\n")


📊 Classification Report:
              precision    recall  f1-score   support

           0     0.9997    0.9997    0.9997     56864
           1     0.8438    0.8265    0.8351        98

    accuracy                         0.9994     56962
   macro avg     0.9217    0.9131    0.9174     56962
weighted avg     0.9994    0.9994    0.9994     56962



In [24]:

from sklearn.metrics import precision_recall_curve, classification_report

# Positive-class probability for every test row.
y_prob = model.predict_proba(X_test_scaled)[:, 1]

# Precision and recall at every candidate decision threshold.
precision, recall, thresholds = precision_recall_curve(y_test, y_prob)

# Choose the threshold that maximises precision + recall - 1, i.e. it trades
# the two off against each other rather than maximising recall alone.
# precision_recall_curve returns one more precision/recall value than
# thresholds (the final point has no threshold), so that last point is
# dropped to keep the argmax a valid index into `thresholds`.
threshold_scores = recall[:-1] - (1 - precision[:-1])
optimal_idx = np.argmax(threshold_scores)
optimal_threshold = thresholds[optimal_idx]
print("Optimal threshold:", optimal_threshold)

y_pred_opt = (y_prob >= optimal_threshold).astype(int)

# NOTE(review): the threshold is selected and evaluated on the same test set,
# so this report is optimistic; tune on a separate validation split for an
# unbiased estimate.
print("\n📊 Classification Report with tuned threshold:")
print(classification_report(y_test, y_pred_opt, digits=4))


Optimal threshold: 0.655

📊 Classification Report with tuned threshold:
              precision    recall  f1-score   support

         0.0     0.9996    1.0000    0.9998     21362
         1.0     0.9750    0.8298    0.8966        47

    accuracy                         0.9996     21409
   macro avg     0.9873    0.9149    0.9482     21409
weighted avg     0.9996    0.9996    0.9996     21409



In [None]:
import matplotlib.pyplot as plt

# Pair each importance score with its column name and rank from highest to lowest.
feat_imp = pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False)

# Horizontal bar chart of the ten highest-ranked features.
fig, ax = plt.subplots(figsize=(10, 6))
feat_imp.iloc[:10].plot(kind="barh", color="skyblue", ax=ax)
ax.invert_yaxis()  # put the most important feature at the top
ax.set_title("Top 10 Most Important Features (Random Forest)")
ax.set_xlabel("Importance")
ax.set_ylabel("Feature")
plt.show()


In [None]:
import seaborn as sns

# Ten features to compare between the two classes.
top_features = ['V14','V10','V12','V3','V4','V17','V11','V16','V2','V9']

# Split the frame by class once, instead of re-filtering inside the loop.
non_fraud_rows = data[data['Class']==0]
fraud_rows = data[data['Class']==1]

# 2 x 5 grid: one density plot per feature, fraud and non-fraud overlaid.
fig, axes = plt.subplots(2, 5, figsize=(15, 8))
for i, (ax, feature) in enumerate(zip(axes.flat, top_features)):
    sns.kdeplot(data=non_fraud_rows, x=feature, fill=True, label='Non-Fraud', alpha=0.5, ax=ax)
    sns.kdeplot(data=fraud_rows, x=feature, fill=True, label='Fraud', alpha=0.5, ax=ax)
    ax.set_title(feature)
    # One legend is enough; every panel uses the same colours.
    if i == 0:
        ax.legend()

plt.tight_layout()
plt.show()


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# Search space. The "rf__" prefix routes each setting to the forest step of
# the pipeline defined below.
param_grid = {
    'rf__n_estimators': [100, 200, 300, 500],
    'rf__max_depth': [None, 10, 20, 30],
    'rf__min_samples_split': [2, 5, 10],
    'rf__min_samples_leaf': [1, 2, 4],
    'rf__bootstrap': [True, False]
}

rf = RandomForestClassifier(random_state=42)

# SMOTE runs inside the pipeline so that, within cross-validation, it is
# fitted on each training fold only. Cross-validating on data that was
# oversampled beforehand puts synthetic neighbours of the same fraud rows in
# both the training and validation folds and inflates the F1 score.
rf_pipeline = ImbPipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('rf', rf),
])

# Randomized Search (5-fold CV) on the scaled, NOT yet resampled, training data.
search = RandomizedSearchCV(estimator=rf_pipeline, param_distributions=param_grid,
                            n_iter=20, cv=5, scoring='f1', n_jobs=-1, random_state=42)
search.fit(X_train_scaled, y_train)

# The refitted best pipeline was trained on the SMOTE-resampled training set;
# keep only its forest step so `best_rf` is a plain classifier.
best_rf = search.best_estimator_.named_steps['rf']
print("Best Parameters:",
      {name.split('__', 1)[1]: value for name, value in search.best_params_.items()})

y_pred_best = best_rf.predict(X_test_scaled)
print("\n📊 Classification Report (Tuned RF):")
print(classification_report(y_test, y_pred_best, digits=4))


In [None]:
import pickle

# Serialise the tuned forest to a file in the working directory.
with open("best_rf_model.pkl", mode="wb") as model_file:
    pickle.dump(best_rf, model_file)

print("model saved in pickle")


XGBOOST

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline


# Binary classifier. `use_label_encoder` is not passed: it is deprecated and
# ignored by current XGBoost releases.
xgb = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=42
)


# Search space. The "xgb__" prefix routes each setting to the classifier step
# of the pipeline defined below.
param_grid = {
    'xgb__n_estimators': [100, 200, 300, 500],
    'xgb__max_depth': [3, 5, 7, 10],
    'xgb__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'xgb__subsample': [0.6, 0.8, 1.0],
    'xgb__colsample_bytree': [0.6, 0.8, 1.0],
    'xgb__gamma': [0, 0.1, 0.3, 0.5],
    'xgb__min_child_weight': [1, 3, 5]
}

# SMOTE runs inside the pipeline so that, within cross-validation, it is
# fitted on each training fold only; validating on pre-oversampled data
# leaks synthetic samples across folds and inflates the score.
xgb_pipeline = ImbPipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('xgb', xgb),
])

search = RandomizedSearchCV(
    estimator=xgb_pipeline,
    param_distributions=param_grid,
    n_iter=20,
    scoring='f1',
    cv=5,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

# Fit on the scaled, NOT yet resampled, training data.
search.fit(X_train_scaled, y_train)

# Keep only the classifier step of the refitted best pipeline.
best_xgb = search.best_estimator_.named_steps['xgb']
print("Best Parameters:",
      {name.split('__', 1)[1]: value for name, value in search.best_params_.items()})

y_pred_best = best_xgb.predict(X_test_scaled)
print("\n📊 Classification Report (Tuned XGBoost):")
print(classification_report(y_test, y_pred_best, digits=4))


Pickle

In [None]:
import pickle

# Save the tuned XGBoost classifier. `best_xgb` is used rather than
# `search.best_estimator_` because the name `search` is also assigned by the
# Random Forest tuning cell, so it may not refer to the XGBoost search if
# cells are run in a different order.
with open("best_xgb.pkl", "wb") as f:
    pickle.dump(best_xgb, f)


Deployment: Gradio demo app (AWS SageMaker deployment is not implemented in this section)

In [None]:
pip install boto3

In [None]:
import gradio as gr
import pickle
import numpy as np

# The ten features the form asks for, in the order predict_fraud receives them.
FORM_FEATURES = ["V14", "V10", "V12", "V3", "V4", "V17", "V11", "V16", "V2", "V9"]

# NOTE(security): pickle.load can execute arbitrary code; only load files
# written by this notebook.
with open("best_rf_model.pkl", "rb") as f:
    model = pickle.load(f)

# The saved forest was trained on every scaled feature column, not just the
# ten in the form. Columns the form does not ask for are filled with their
# training-set median.
feature_defaults = X_train.median()


def predict_fraud(V14, V10, V12, V3, V4, V17, V11, V16, V2, V9):
    """Classify one transaction from the ten form features.

    The ten values are placed into a full feature row (other columns take
    their training-set median), the row is scaled with the fitted `scaler`,
    and the loaded model's positive-class probability is compared with 0.5.

    Returns
    -------
    str
        "Fraud" or "Not Fraud" followed by the probability as a percentage,
        or a prompt to complete the form when any field is empty.
    """
    values = [V14, V10, V12, V3, V4, V17, V11, V16, V2, V9]
    if any(value is None for value in values):
        return "Please fill in all ten fields."

    row = feature_defaults.copy()
    row.loc[FORM_FEATURES] = values

    # One-row frame with the training column order, scaled like the training data.
    scaled_row = scaler.transform(row.to_frame().T)
    proba = model.predict_proba(scaled_row)[:,1][0]
    label = "Fraud" if proba > 0.5 else "Not Fraud"
    return f"{label} ({proba:.2%})"

iface = gr.Interface(
    fn=predict_fraud,
    inputs=[gr.Number(label=f) for f in FORM_FEATURES],
    outputs="text"
)
iface.launch()

