In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.preprocessing import normalize
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
url = "https://github.com/AnjulaMehto/Sampling_Assignment/raw/main/Creditcard_data.csv"
data = pd.read_csv(url)
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,1
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [None]:
data.Class.value_counts() #This implies that our data is imbalanced

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0,763
1,9


To balance the dataset, use SMOTE (Synthetic Minority Oversampling Technique)

In [None]:
# Splitting data into features and target
X = data.drop('Class', axis=1)
y = data['Class']

# Balancing the dataset
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X, y)

# Confirm the balance
print(y_balanced.value_counts())


Class
0    763
1    763
Name: count, dtype: int64


In [None]:
resample = pd.concat([X_balanced, y_balanced], axis=1)
resample

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.620000,0
1,0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.690000,1
2,1,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.660000,0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.500000,0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.990000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1521,233,-0.143424,0.517932,0.731111,0.129628,0.854012,0.001878,0.422518,0.075371,-0.127180,...,-0.086502,-0.238275,0.062436,-0.985098,-1.003529,-0.039215,0.158937,0.153083,0.992166,1
1522,288,-0.314860,0.491771,0.951783,0.141217,0.878564,-0.195386,0.522255,0.024692,-0.130443,...,-0.053105,-0.124739,-0.007566,-0.685670,-0.698822,-0.129806,0.101392,0.092159,0.993877,1
1523,465,-2.161259,-0.202359,0.365042,2.613566,0.923353,-0.447669,-2.330450,1.099415,-0.963817,...,0.471321,0.560975,-0.095592,-0.250085,-0.083285,0.508771,0.074280,-0.156735,0.726598,1
1524,516,-2.181198,-1.036044,1.153616,0.342333,1.069585,-0.553986,0.288244,0.069938,-0.076396,...,0.105787,0.020804,0.293645,-0.256581,-0.126499,0.109506,-0.313804,-0.241336,180.086978,1


In [None]:
Amount = normalize([data['Amount']])[0]
data['Amount'] = Amount
data = data.iloc[:, 1:]
data.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0.025729,0
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,0.000463,1
2,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,0.065115,0
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0.021237,0
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,0.012036,0


**Simple Random Sampling**

In [None]:
n = int((1.96 * 1.96 * 0.5 * 0.5) / (0.05**2))

SimpleSampling = resample.sample(n=n, random_state=42)
print("Shape of Simple Random Sampling Data:", SimpleSampling.shape)

X = SimpleSampling.drop('Class', axis=1)
y = SimpleSampling['Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
rf_model = RandomForestClassifier(random_state=42)
lr_model = LogisticRegression()
svm_model = SVC(random_state=42)
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
knn_model = AdaBoostClassifier(random_state=42)

models = [rf_model, lr_model, svm_model, xgb_model, knn_model]
model_names = ['Random Forest', 'Logistic Regression', 'SVM', 'XGBoost', 'AdaBoost']

accuracies = []
for model, name in zip(models, model_names):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)
    print(f"{name} : {accuracy:.4f}")

best_model_idx = np.argmax(accuracies)
best_model_name = model_names[best_model_idx]
print(f"\nBest Model for Simple Random Sampling: {best_model_name} with Accuracy: {accuracies[best_model_idx]:.4f}")

Shape of Simple Random Sampling Data: (384, 31)
Random Forest : 0.9740
Logistic Regression : 0.8442
SVM : 0.6234
XGBoost : 0.9610


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Parameters: { "use_label_encoder" } are not used.



AdaBoost : 0.9351

Best Model for Simple Random Sampling: Random Forest with Accuracy: 0.9740


**Systematic Sampling**

In [None]:
def systematic_sampling(data, step):
    return data.iloc[::step]

step_size = len(resample) // n
SystematicSampling = systematic_sampling(resample, step=step_size)
print("Shape of Systematic Sampling Data:", SystematicSampling.shape)

X = SystematicSampling.drop('Class', axis=1)
y = SystematicSampling['Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

accuracies = []
for model, name in zip(models, model_names):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)
    print(f"{name} : {accuracy:.4f}")

best_model_idx = np.argmax(accuracies)
best_model_name = model_names[best_model_idx]
print(f"\nBest Model for Systematic Sampling: {best_model_name} with Accuracy: {accuracies[best_model_idx]:.4f}")

Shape of Systematic Sampling Data: (509, 31)
Random Forest : 1.0000
Logistic Regression : 0.8725
SVM : 0.7353


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Parameters: { "use_label_encoder" } are not used.



XGBoost : 0.9902
AdaBoost : 0.9804

Best Model for Systematic Sampling: Random Forest with Accuracy: 1.0000


**Stratified Sampling**

In [None]:
sss = StratifiedShuffleSplit(n_splits=1, test_size=n, random_state=42)
for train_index, sample_index in sss.split(resample, resample['Class']):
    StratifiedSampling = resample.iloc[sample_index]

print("Shape of Stratified Sampling Data:", StratifiedSampling.shape)

X = StratifiedSampling.drop('Class', axis=1)
y = StratifiedSampling['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

accuracies = []
for model, name in zip(models, model_names):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)
    print(f"{name} : {accuracy:.4f}")

best_model_idx = np.argmax(accuracies)
best_model_name = model_names[best_model_idx]
print(f"\nBest Model for Stratified Sampling: {best_model_name} with Accuracy: {accuracies[best_model_idx]:.4f}")


Shape of Stratified Sampling Data: (384, 31)
Random Forest : 0.9870
Logistic Regression : 0.9481
SVM : 0.6494


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Parameters: { "use_label_encoder" } are not used.



XGBoost : 0.9870
AdaBoost : 0.9351

Best Model for Stratified Sampling: Random Forest with Accuracy: 0.9870


**Cluster Sampling**

In [None]:
resample['Cluster'] = np.random.randint(0, 10, size=len(resample))
chosen_cluster = 3
ClusterSampling = resample[resample['Cluster'] == chosen_cluster]

print("Shape of Cluster Sampling Data:", ClusterSampling.shape)

X = ClusterSampling.drop(['Class', 'Cluster'], axis=1)
y = ClusterSampling['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

accuracies = []
for model, name in zip(models, model_names):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)
    print(f"{name} : {accuracy:.4f}")

best_model_idx = np.argmax(accuracies)
best_model_name = model_names[best_model_idx]
print(f"\nBest Model for Cluster Sampling: {best_model_name} with Accuracy: {accuracies[best_model_idx]:.4f}")

Shape of Cluster Sampling Data: (143, 32)
Random Forest : 0.9655
Logistic Regression : 0.7931
SVM : 0.6552
XGBoost : 0.9310
AdaBoost : 0.8621

Best Model for Cluster Sampling: Random Forest with Accuracy: 0.9655


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Parameters: { "use_label_encoder" } are not used.



**Bootstrap Sampling**

In [None]:
def bootstrap_sampling(data, n_samples):
    return data.sample(n=n_samples, replace=True, random_state=42)

n_samples = n
BootstrapSampling = bootstrap_sampling(resample, n_samples)
print("Shape of Bootstrap Sampling Data:", BootstrapSampling.shape)

X = BootstrapSampling.drop('Class', axis=1)
y = BootstrapSampling['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

accuracies = []
for model, name in zip(models, model_names):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)
    print(f"{name} : {accuracy:.4f}")

best_model_idx = np.argmax(accuracies)
best_model_name = model_names[best_model_idx]
print(f"\nBest Model for Bootstrap Sampling: {best_model_name} with Accuracy: {accuracies[best_model_idx]:.4f}")

Shape of Bootstrap Sampling Data: (384, 32)
Random Forest : 1.0000
Logistic Regression : 0.9481
SVM : 0.6494
XGBoost : 0.9870


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Parameters: { "use_label_encoder" } are not used.



AdaBoost : 0.9740

Best Model for Bootstrap Sampling: Random Forest with Accuracy: 1.0000
