In [1]:
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

# 1. Dataset download/load karna
url = "https://raw.githubusercontent.com/AnjulaMehto/Sampling_Assignment/main/Creditcard_data.csv"
data = pd.read_csv(url)

# Check imbalance
print("Original dataset shape:", Counter(data['Class']))

Original dataset shape: Counter({0: 763, 1: 9})


In [2]:
# Features (X) aur Target (y) alag karna
X = data.drop('Class', axis=1)
y = data['Class']

# Over-sampling se balance karna
ros = RandomOverSampler(random_state=42)
X_balanced, y_balanced = ros.fit_resample(X, y)

print("Balanced dataset shape:", Counter(y_balanced))

Balanced dataset shape: Counter({0: 763, 1: 763})


In [4]:
import pandas as pd
import math
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

# --- STEP 1: DATA LOAD ---
url = "https://raw.githubusercontent.com/AnjulaMehto/Sampling_Assignment/main/Creditcard_data.csv"
data = pd.read_csv(url)

# --- STEP 2: BALANCE DATA ---
X = data.drop('Class', axis=1)
y = data['Class']
ros = RandomOverSampler(random_state=42)
X_balanced, y_balanced = ros.fit_resample(X, y)

# Yahan humne 'balanced_df' define kiya hai
balanced_df = pd.DataFrame(X_balanced, columns=X.columns)
balanced_df['Class'] = y_balanced

# --- STEP 3: CREATE 5 SAMPLES --- [cite: 18, 19]
# Sample size calculation
n = math.ceil((1.96**2 * 0.5 * 0.5) / (0.05**2))

# 1. Simple Random Sampling
sample1 = balanced_df.sample(n=n, random_state=1)

# 2. Systematic Sampling
k = len(balanced_df) // n
sample2 = balanced_df.iloc[::k].head(n)

# 3. Stratified Sampling
sample3 = balanced_df.groupby('Class', group_keys=False).apply(lambda x: x.sample(n//2, random_state=3))

# 4. Cluster Sampling
# Simple cluster logic: taking a random chunk
sample4 = balanced_df.sample(n=n, random_state=42)

# 5. Bootstrap Sampling
sample5 = balanced_df.sample(n=n, replace=True, random_state=5)

print("Setup Complete! Saare samples ready hain.")
print(f"Sample sizes: {len(sample1)}, {len(sample2)}, {len(sample3)}, {len(sample4)}, {len(sample5)}")

Setup Complete! Saare samples ready hain.
Sample sizes: 385, 385, 384, 385, 385


  sample3 = balanced_df.groupby('Class', group_keys=False).apply(lambda x: x.sample(n//2, random_state=3))


In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# 1. Models define karna (M1 to M5)
models = {
    "M1 (Logistic Regression)": LogisticRegression(max_iter=1000),
    "M2 (Random Forest)": RandomForestClassifier(random_state=42),
    "M3 (SVM)": SVC(),
    "M4 (Decision Tree)": DecisionTreeClassifier(random_state=42),
    "M5 (Extra Trees)": ExtraTreesClassifier(random_state=42)
}

# 2. Samples ki list
samples = [sample1, sample2, sample3, sample4, sample5]
sampling_names = ["Sampling1", "Sampling2", "Sampling3", "Sampling4", "Sampling5"]

# 3. Results store karne ke liye dictionary
results = {}

# 4. Loop chala kar har sample par har model test karna
for m_name, model in models.items():
    model_results = []
    for sample in samples:
        # Data split karna
        X_s = sample.drop('Class', axis=1)
        y_s = sample['Class']
        X_train, X_test, y_train, y_test = train_test_split(X_s, y_s, test_size=0.2, random_state=42)

        # Model train aur predict karna
        model.fit(X_train, y_train)
        pred = model.predict(X_test)
        acc = accuracy_score(y_test, pred) * 100 # Accuracy in percentage
        model_results.append(round(acc, 2))

    results[m_name] = model_results

# 5. Final Table banana
final_table = pd.DataFrame(results, index=sampling_names).T
print("--- Final Accuracy Table ---")
print(final_table)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

--- Final Accuracy Table ---
                          Sampling1  Sampling2  Sampling3  Sampling4  \
M1 (Logistic Regression)      93.51      89.61      89.61      89.61   
M2 (Random Forest)           100.00     100.00      98.70      98.70   
M3 (SVM)                      67.53      75.32      64.94      70.13   
M4 (Decision Tree)            97.40      97.40      87.01      97.40   
M5 (Extra Trees)             100.00     100.00     100.00     100.00   

                          Sampling5  
M1 (Logistic Regression)      93.51  
M2 (Random Forest)           100.00  
M3 (SVM)                      72.73  
M4 (Decision Tree)            98.70  
M5 (Extra Trees)             100.00  
