First we need to balance our dataset

In [1]:
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
df = pd.read_csv('Creditcard_data.csv')

# Separate the target variable and features
X = df.drop('Class', axis=1)
y = df['Class']

# Instantiate a RandomOverSampler object and resample the data
ros = RandomOverSampler(random_state=42)
X_resampled_1, y_resampled_1 = ros.fit_resample(X, y)

# Instantiate a RandomUnderSampler object and resample the data
rus = RandomUnderSampler(random_state=42)
X_resampled_2, y_resampled_2 = rus.fit_resample(X_resampled_1, y_resampled_1)

# Create a new balanced dataframe
df_resampled = pd.concat([X_resampled_2, y_resampled_2], axis=1)

# Save the balanced dataframe to a new CSV file
df_resampled.to_csv('Balanced_data.csv', index=False)

In [2]:
class_counts = df_resampled['Class'].value_counts()

print("Class distribution:")
print(class_counts)

Class distribution:
0    763
1    763
Name: Class, dtype: int64


Our Data is now balanced

SIMPLE RANDOM SAMPLING

In [2]:
from sklearn.model_selection import train_test_split
import math
X1 = df_resampled.drop('Class', axis=1)
y1 = df_resampled['Class']
N = len(df_resampled)
p = 0.5
c = 0.95  # desired confidence level
e = 0.05  # desired margin of error

z = 1.96  # z-score for 95% confidence level
n = math.ceil((z**2 * p * (1-p)) / e**2)

# Set the sample size
sample_size = n  # Set the desired sample size
X_sampled = X1.sample(n=sample_size, random_state=0)
y_sampled = y1[X_sampled.index]  # Match the sampled output variable with the sampled input variables
    
# Combine the sampled input and output variables into a single DataFrame
sampled_df = pd.concat([X_sampled, y_sampled], axis=1)
    
# Save the sampled DataFrame to a CSV file
sampled_df.to_csv(f'simple_random_dataset.csv', index=False)

In [9]:
#Performing metrics analysis
from pycaret.datasets import get_data
from pycaret.classification import *
DataSet = get_data("simple_random_dataset")
setup(data=DataSet, target='Class', silent=True)
cm = compare_models(include=['rf', 'lr','nb','svm','dt','knn'])

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.9926,1.0,1.0,0.9857,0.9926,0.9852,0.9857,0.138
dt,Decision Tree Classifier,0.9704,0.9717,1.0,0.9442,0.9706,0.9409,0.9438,0.03
lr,Logistic Regression,0.9295,0.9527,1.0,0.8778,0.9334,0.8598,0.8709,0.096
knn,K Neighbors Classifier,0.9259,0.9717,1.0,0.8704,0.9294,0.8527,0.8642,0.017
nb,Naive Bayes,0.8256,0.8559,0.7596,0.8623,0.797,0.6481,0.6635,0.008
svm,SVM - Linear Kernel,0.5168,0.0,0.666,0.4053,0.4909,0.0438,0.0521,0.01


STRATIFIED SAMPLING

In [18]:
import pandas as pd
import numpy as np
import math

# Read the CSV file into a DataFrame
df = pd.read_csv('Balanced_data.csv')

# Separate the feature matrix X and the target variable y
X = df.drop(columns=['Class'])
y = df['Class']

# Determine the number of strata (in this case, we use a binary target variable, so there are two strata)
num_strata = 2

# Initialize an empty list to store the stratified samples
samples = []

# Loop over each stratum
for i in range(num_strata):
    # Subset the data to include only the observations in the current stratum
    stratum_data = df[df['Class'] == i]
    
    # Calculate the sample size for the current stratum
    stratum_size = len(stratum_data)
    population_size = len(df)
    desired_margin_of_error = 0.05
    confidence_level = 0.95
    z_score = 1.96  # for a 95% confidence level
    p = stratum_size / population_size
    q = 1 - p
    n = (z_score**2 * p * q * population_size) / ((z_score**2 * p * q) + (desired_margin_of_error**2 * (population_size-1)))
    n = math.ceil(n)
    
    # If the calculated sample size for the current stratum is greater than the number of observations in the stratum, set the sample size to the number of observations
    if n > stratum_size:
        n = stratum_size
    
    # Randomly select observations from the current stratum to include in the sample
    sample_indices = np.random.choice(stratum_data.index, size=n, replace=False)
    stratum_sample = stratum_data.loc[sample_indices]
    
    # Add the current stratum sample to the list of stratified samples
    samples.append(stratum_sample)

# Combine the stratified samples into a single DataFrame
stratified_sample = pd.concat(samples)

# Write the stratified sample to a new CSV file
stratified_sample.to_csv('stratified_dataset.csv', index=False)


In [19]:
from pycaret.datasets import get_data
from pycaret.classification import *
DataSet = get_data("stratified_dataset")
setup(data=DataSet, target='Class', silent=True)
cm = compare_models(include=['rf', 'lr','nb','svm','dt','knn'])

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.9977,1.0,1.0,0.9957,0.9978,0.9953,0.9954,0.144
dt,Decision Tree Classifier,0.9884,0.9881,1.0,0.9783,0.9889,0.9767,0.9772,0.009
knn,K Neighbors Classifier,0.9536,0.9833,1.0,0.9209,0.9579,0.9069,0.9128,0.018
lr,Logistic Regression,0.9117,0.9273,1.0,0.8566,0.9218,0.8225,0.8375,0.089
nb,Naive Bayes,0.7429,0.8281,0.6717,0.7945,0.7146,0.4893,0.5004,0.006
svm,SVM - Linear Kernel,0.5198,0.0,0.5591,0.5074,0.4717,0.039,0.0196,0.007


CLUSTER SAMPLING

In [12]:
import pandas as pd
import numpy as np
import math
from sklearn.cluster import KMeans
# Load the data
data = pd.read_csv('Balanced_data.csv')
# Define the cluster size
C = 600
# Calculate the number of clusters
n_clusters = math.ceil(len(data)/C)
# Create a KMeans object with the calculated number of clusters
kmeans = KMeans(n_clusters=n_clusters)
# Fit the KMeans object to the data
kmeans.fit(data.drop('Class', axis=1))
# Add the cluster labels to the data
data['Cluster'] = kmeans.labels_
# Calculate the sample size
z = 1.96
p = 0.5
e = 0.05
s = 1
N = len(data)
C = len(data['Cluster'].unique())
n = math.ceil((z**2 * p * (1-p) * (N/C)) / ((e**2) + ((z**2 * p * (1-p))/(C-1))))

# Create an empty dataframe to hold the sample
sample = pd.DataFrame(columns=data.columns)
# Loop through each cluster
for i in range(n_clusters):
    # Get the data for the current cluster
    cluster_data = data[data['Cluster'] == i]
    # Calculate the cluster sample size
    cluster_size = len(cluster_data)
    cluster_sample_size = math.ceil((cluster_size/N)*n)
    # If the cluster sample size is larger than the cluster size, set it to the cluster size
    if cluster_sample_size > cluster_size:
        cluster_sample_size = cluster_size
    # Sample from the current cluster
    cluster_sample = cluster_data.sample(n=cluster_sample_size, replace=False)    
    # Add the cluster sample to the overall sample dataframe
    sample = pd.concat([sample, cluster_sample])
# Remove the cluster column from the sample
sample = sample.drop('Cluster', axis=1)
# Save the sample to a CSV file
sample.to_csv('cluster_sample_dataset.csv', index=False)


In [13]:
from pycaret.datasets import get_data
from pycaret.classification import *
DataSet = get_data("cluster_sample_dataset")
setup(data=DataSet, target='Class', silent=True)
cm = compare_models(include=['rf', 'lr','nb','svm','dt','knn'])

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.9986,1.0,1.0,0.9972,0.9986,0.9972,0.9972,0.131
dt,Decision Tree Classifier,0.9915,0.9917,1.0,0.9835,0.9916,0.9831,0.9834,0.024
knn,K Neighbors Classifier,0.9648,0.9889,1.0,0.9351,0.966,0.9297,0.9328,0.013
lr,Logistic Regression,0.9127,0.9453,1.0,0.8522,0.9195,0.8258,0.8399,0.137
nb,Naive Bayes,0.7873,0.8698,0.7514,0.8053,0.772,0.574,0.5808,0.006
svm,SVM - Linear Kernel,0.5141,0.0,0.6571,0.532,0.5262,0.0327,0.0064,0.008


SYSTEMATIC SAMPLING

In [14]:
import pandas as pd
import math

# Load the dataset "Balanced_data" into a Pandas dataframe
df = pd.read_csv("Balanced_data.csv")

# Calculate the number of rows in the dataset
n = len(df)

# Set the sampling interval "k" as the square root of the number of rows in the dataset
k = int(math.sqrt(n))

# Select every "k" row starting from a random index in the dataset
sample = df.iloc[::k]

sample.to_csv('systematic_sample_dataset.csv', index=False)


In [27]:
from pycaret.datasets import get_data
from pycaret.classification import *
DataSet = get_data("systematic_sample_dataset")
setup(data=DataSet, target='Class', silent=True)
cm = compare_models(include=['rf', 'lr','nb','svm','dt','knn'])

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.95,1.0,1.0,0.95,0.9667,0.9,0.9,0.103
lr,Logistic Regression,0.9,1.0,1.0,0.8833,0.9267,0.74,0.75,0.021
dt,Decision Tree Classifier,0.85,0.8,1.0,0.85,0.9067,0.6,0.6,0.006
nb,Naive Bayes,0.7667,0.8,1.0,0.75,0.8367,0.44,0.45,0.006
knn,K Neighbors Classifier,0.7667,0.9,0.9,0.8,0.8167,0.53,0.55,0.016
svm,SVM - Linear Kernel,0.5833,0.0,0.7,0.65,0.5967,0.16,0.2,0.02


CONVENIENCE SAMPLING

In [16]:
import pandas as pd
import numpy as np

# Load the data
data = pd.read_csv('Balanced_data.csv')

# Calculate the sample size based on practical considerations
n = 200 # Set the desired sample size

# Select a convenience sample from the data
sample = data.sample(n=n, replace=False)

# Save the sample to a CSV file
sample.to_csv('convenience_sample_data.csv', index=False)


In [24]:
from pycaret.datasets import get_data
from pycaret.classification import *
DataSet = get_data("convenience_sample_data")
setup(data=DataSet, target='Class', silent=True)
cm = compare_models(include=['rf', 'lr','nb','svm','dt','knn'])

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.9923,1.0,1.0,0.9833,0.9909,0.9843,0.9854,0.112
dt,Decision Tree Classifier,0.9637,0.9694,1.0,0.9238,0.9587,0.9266,0.9313,0.006
lr,Logistic Regression,0.9121,0.9275,0.9433,0.8612,0.8982,0.8207,0.827,0.057
knn,K Neighbors Classifier,0.8412,0.9192,0.9433,0.7564,0.8313,0.6857,0.714,0.013
nb,Naive Bayes,0.6978,0.8494,0.3633,0.7883,0.4807,0.3128,0.365,0.009
svm,SVM - Linear Kernel,0.4956,0.0,0.47,0.2833,0.3199,-0.0023,0.0008,0.006


# Now we are done with the five diffrerent sampling techniques and have calculated various metrics using pycaret for different classification based ML models.