<a href="https://colab.research.google.com/github/Ridansh71/Sampling/blob/main/Sampling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.utils import resample


#reading the data
df=pd.read_csv("/content/Creditcard_data.csv")
df.describe

In [4]:
#balancing the unbalanced data
X = df.drop('Class', axis=1)
y = df['Class']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
#list of sampling techniques and models
sampling_techniques = [
    ("Simple Random Sampling", None),
    ("Systematic Sampling", None),
    ("Stratified Sampling", StratifiedKFold(n_splits=5, shuffle=True, random_state=42)),
    ("Cluster Sampling", None),
    ("Cross-Validation", StratifiedKFold(n_splits=5, shuffle=True, random_state=42)),
    ("Bootstrap Sampling", None),
]

models = [
    RandomForestClassifier(random_state=42),
    LogisticRegression(random_state=42),
    SVC(random_state=42),
    KNeighborsClassifier(),
    GradientBoostingClassifier(random_state=42),
]


In [17]:
#iterating each model through all the sampling techniques

for model in models:
    print(f"\nModel: {type(model).__name__}")

    results_dict = {}
    model_results = {}
    for technique_name, technique in sampling_techniques:
        print(f"  Sampling Technique: {technique_name}")

        if technique_name == "Simple Random Sampling":
            # Simple Random Sampling
            sampled_indices = pd.Series(X_train.index).sample(frac=1, random_state=42).index
        elif technique_name == "Systematic Sampling":
            # Systematic Sampling
            sampled_indices = list(range(0, len(X_train), 2))
        elif technique_name == "Cluster Sampling":
            if len(X_train) > 0:
                sampled_indices = resample(list(range(len(X_train))), replace=True, random_state=42)
            else:
                continue
        elif technique_name == "Bootstrap Sampling":
            # Bootstrap Sampling
            sampled_indices = resample(list(range(len(X_train))), replace=True, random_state=42)
        else:
            # Stratified Sampling and Cross-Validation
            sampled_indices = None

        if technique_name in ["Stratified Sampling", "Cross-Validation"]:
            # Use cross-validation
            scores = cross_val_score(model, X_train, y_train, cv=technique, scoring='accuracy')
            print(f"    - Average Cross-Validation Accuracy: {scores.mean():.2f}")
        else:
            # Use the sampled indices for other sampling techniques
            X_resampled, y_resampled = X_train.iloc[sampled_indices], y_train.iloc[sampled_indices]

        if len(X_resampled) > 0:

            # Train the model on the resampled data
            model.fit(X_resampled, y_resampled)

            # Make predictions on the training set
            y_train_pred = model.predict(X_train)
            # Make predictions on the test set
            y_test_pred = model.predict(X_test)

            # Evaluate the model on the training set
            train_accuracy = accuracy_score(y_train, y_train_pred)
            print(f"    - Training Accuracy: {train_accuracy:.2f}")

            # Evaluate the model on the test set
            test_accuracy = accuracy_score(y_test, y_test_pred)
            print(f"    - Test Accuracy: {test_accuracy:.2f}")

            # Store the results in the model's dictionary
            model_results[technique_name] = test_accuracy
        else:
            print(f"    - Warning: No samples after {technique_name}. Skipping...")

    # Store the model's dictionary in the overall results dictionary
    results_dict[type(model).__name__] = model_results






Model: RandomForestClassifier
  Sampling Technique: Simple Random Sampling
    - Training Accuracy: 1.00
    - Test Accuracy: 0.99
  Sampling Technique: Systematic Sampling
    - Training Accuracy: 0.99
    - Test Accuracy: 0.99
  Sampling Technique: Stratified Sampling
    - Average Cross-Validation Accuracy: 0.99
    - Training Accuracy: 0.99
    - Test Accuracy: 0.99
  Sampling Technique: Cluster Sampling
    - Training Accuracy: 0.99
    - Test Accuracy: 0.99
  Sampling Technique: Cross-Validation
    - Average Cross-Validation Accuracy: 0.99
    - Training Accuracy: 0.99
    - Test Accuracy: 0.99
  Sampling Technique: Bootstrap Sampling
    - Training Accuracy: 0.99
    - Test Accuracy: 0.99

Model: LogisticRegression
  Sampling Technique: Simple Random Sampling
    - Training Accuracy: 0.99
    - Test Accuracy: 0.99
  Sampling Technique: Systematic Sampling
    - Training Accuracy: 0.99
    - Test Accuracy: 0.99
  Sampling Technique: Stratified Sampling


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

    - Average Cross-Validation Accuracy: 0.99
    - Training Accuracy: 0.99
    - Test Accuracy: 0.99
  Sampling Technique: Cluster Sampling
    - Training Accuracy: 0.99
    - Test Accuracy: 0.99
  Sampling Technique: Cross-Validation


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

    - Average Cross-Validation Accuracy: 0.99
    - Training Accuracy: 0.99
    - Test Accuracy: 0.99
  Sampling Technique: Bootstrap Sampling
    - Training Accuracy: 0.99
    - Test Accuracy: 0.99

Model: SVC
  Sampling Technique: Simple Random Sampling


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


    - Training Accuracy: 0.99
    - Test Accuracy: 0.99
  Sampling Technique: Systematic Sampling
    - Training Accuracy: 0.99
    - Test Accuracy: 0.99
  Sampling Technique: Stratified Sampling
    - Average Cross-Validation Accuracy: 0.99
    - Training Accuracy: 0.99
    - Test Accuracy: 0.99
  Sampling Technique: Cluster Sampling
    - Training Accuracy: 0.99
    - Test Accuracy: 0.99
  Sampling Technique: Cross-Validation
    - Average Cross-Validation Accuracy: 0.99
    - Training Accuracy: 0.99
    - Test Accuracy: 0.99
  Sampling Technique: Bootstrap Sampling
    - Training Accuracy: 0.99
    - Test Accuracy: 0.99

Model: KNeighborsClassifier
  Sampling Technique: Simple Random Sampling
    - Training Accuracy: 0.99
    - Test Accuracy: 0.99
  Sampling Technique: Systematic Sampling
    - Training Accuracy: 0.99
    - Test Accuracy: 0.99
  Sampling Technique: Stratified Sampling
    - Average Cross-Validation Accuracy: 0.99
    - Training Accuracy: 0.99
    - Test Accuracy: 0.

In [18]:

# Display the results
print("\nResults:")
for model, results in results_dict.items():
    best_sampling_technique = max(results, key=results.get)
    best_accuracy = results[best_sampling_technique]
    print(f"Model: {model}, Best Sampling Technique: {best_sampling_technique}, Best Test Accuracy: {best_accuracy:.2f}")


Results:
Model: GradientBoostingClassifier, Best Sampling Technique: Cluster Sampling, Best Test Accuracy: 0.99
