In [76]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from imblearn.over_sampling import SMOTE


In [41]:
def simple_random_sampling(X, y, sample_size):
    idx = np.random.choice(range(len(X)), size=sample_size, replace=False)
    return X.iloc[idx], y.iloc[idx]
    


In [42]:
def stratified_sampling(X, y, sample_size):
    from sklearn.model_selection import StratifiedShuffleSplit
    sss = StratifiedShuffleSplit(n_splits=1, test_size=sample_size/len(y), random_state=42)
    for train_index, test_index in sss.split(X, y):
        return X.iloc[test_index], y.iloc[test_index]


In [43]:
def cluster_sampling(X, y, sample_size):
    from sklearn.cluster import KMeans
    kmeans = KMeans(n_clusters=sample_size).fit(X)
    idx = kmeans.predict(X)
    return X.iloc[idx == 0], y.iloc[idx == 0]
    

In [44]:
def convenience_sampling(X, y, sample_size):
    return X.head(sample_size), y.head(sample_size)

In [45]:
def systematic_sampling(X, y, sample_size):
    step = len(X) // sample_size
    idx = np.arange(0, len(X), step)
    return X.iloc[idx], y.iloc[idx]


In [77]:

def calculate_sample_size(data, confidence_level=0.95, margin_of_error=0.05, stratified=False):
    if stratified:
        sample_sizes = []
        for group in data['Class'].unique():
            group_data = data[data['Class'] == group]
            n = len(group_data)
            p = 0.5  # assume proportion is 50% to get the most conservative estimate of the sample size
            z = np.abs(stats.norm.ppf(1 - (1 - confidence_level) / 2))
            sample_size = (z**2 * p * (1 - p)) / (margin_of_error**2 + (z**2 * p * (1 - p)) / n)
            sample_sizes.append(np.ceil(sample_size))
        return int(np.max(sample_sizes))
    else:
        n = len(data)
        p = 0.5
        z = np.abs(stats.norm.ppf(1 - (1 - confidence_level) / 2))
        sample_size = (z**2 * p * (1 - p)) / (margin_of_error**2 + (z**2 * p * (1 - p)) / n)
        return int(np.ceil(sample_size))


In [52]:
try:
    data = pd.read_csv("Creditcard_data.csv")
except FileNotFoundError:
    print("Dataset file not found.")
    exit()
except Exception as e:
    print(f"Error loading dataset: {e}")
    exit()

# Check if 'Class' column is present in the dataset
if 'Class' not in data.columns:
    print("Column 'Class' not found in the dataset.")
    exit()


In [54]:
samplings = {'simple_random_sampling': simple_random_sampling, 
             'stratified_sampling': stratified_sampling,
             'cluster_sampling': cluster_sampling, 
             'convenience_sampling': convenience_sampling, 
             'systematic_sampling': systematic_sampling}

In [55]:
models = {'Decision Tree': DecisionTreeClassifier(),
          'Random Forest': RandomForestClassifier(),
          'Logistic Regression': LogisticRegression(),
          'SVC': SVC(),
          'XGB Classifier': XGBClassifier(use_label_encoder=False, eval_metric='logloss')}


In [56]:
confidence_level = 0.95
confidence_interval = 0.05
sample_sizes = [calculate_sample_size(data, confidence_level, confidence_interval) for i in range(5)]

In [57]:
results = {}

In [83]:
for sampling_name, sampling in samplings.items():
    results[sampling_name] = {}
    for i in range(len(sample_sizes)):
        sample_size = calculate_sample_size(data, confidence_level=0.95, margin_of_error=0.05, stratified=True)
        results[sampling_name][sample_size] = {}
        X_sample, y_sample = sampling(data.drop('Class', axis=1), data['Class'], sample_size)
        X_sample, y_sample = SMOTE(k_neighbors=1).fit_resample(X_sample, y_sample)
        for model_name, model in models.items():
            scores = []
            model.fit(X_sample, y_sample)
            score = np.mean(cross_val_score(model, X_sample, y_sample, cv=5))
            results[sampling_name][sample_size][model_name] = score



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

ValueError: ignored