<a href="https://colab.research.google.com/github/Vishwas02mehta/Sampling-102317022-Predictive/blob/main/samplingcode.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

# Load the credit-card dataset from the Colab working directory, then show
# the first rows followed by the number of rows per `Class` label.
df = pd.read_csv(filepath_or_buffer="/content/Creditcard_data.csv")
print(df.head(), df['Class'].value_counts(), sep="\n")


   Time        V1        V2        V3        V4        V5        V6        V7  \
0     0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1     0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2     1 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3     1 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4     2 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       V27       V28 

In [2]:
from imblearn.under_sampling import RandomUnderSampler

# Split the frame into the target column and the remaining feature columns.
y = df['Class']
X = df.drop(columns='Class')

# Randomly drop rows until every class has as many rows as the rarest one.
rus = RandomUnderSampler(random_state=42)
X_bal, y_bal = rus.fit_resample(X, y)

# Re-assemble features and target into one frame and confirm the class counts.
balanced_df = pd.concat(
    [
        pd.DataFrame(X_bal, columns=X.columns),
        pd.Series(y_bal, name="Class"),
    ],
    axis=1,
)
print(balanced_df['Class'].value_counts())


Class
0    9
1    9
Name: count, dtype: int64


In [3]:
from sklearn.model_selection import train_test_split

# Five independent 70/30 train/test splits of the balanced data, seeded
# 42..46.  Each entry is a (X_train, X_test, y_train, y_test) tuple.
samples = [
    tuple(train_test_split(X_bal, y_bal, test_size=0.3, random_state=42 + i))
    for i in range(5)
]


In [4]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler, TomekLinks
from imblearn.combine import SMOTETomek
from imblearn.ensemble import BalancedRandomForestClassifier

# Resampling strategies, keyed by the column label used in the results table.
#
# Every stochastic sampler is seeded so repeated runs resample identically.
# SMOTE uses k_neighbors=2 instead of the default 5: the training splits are
# tiny, and SMOTE needs more minority rows than neighbours to fit.
#
# NOTE: a later cell redefines `sampling_methods`; keep the two definitions
# in sync (or delete one of them).
sampling_methods = {
    "Sampling1": RandomUnderSampler(random_state=42),
    "Sampling2": SMOTE(k_neighbors=2, random_state=42),
    "Sampling3": TomekLinks(),  # has no random component, so no seed
    "Sampling4": SMOTETomek(
        smote=SMOTE(k_neighbors=2, random_state=42),
        random_state=42,
    ),
    "Sampling5": None   # No resampling: models train on the split as-is
}


In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

# Classifiers under comparison, keyed by the row label used in the results
# table.  The tree ensembles are randomised, so they are seeded with the
# same value used elsewhere in the notebook; without it their accuracies
# change from run to run.
models = {
    "M1": LogisticRegression(max_iter=1000),
    "M2": RandomForestClassifier(random_state=42),
    "M3": GradientBoostingClassifier(random_state=42),
    "M4": SVC(),
    "M5": GaussianNB()
}


In [7]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler, TomekLinks
from imblearn.combine import SMOTETomek

# Resampling strategies, keyed by the column label used in the results table.
# SMOTE is configured with k_neighbors=2 both on its own and inside
# SMOTETomek (presumably because the training splits are very small -- confirm).
# A value of None means "no resampler" for that column.
sampling_methods = dict(
    Sampling1=RandomUnderSampler(random_state=42),
    Sampling2=SMOTE(k_neighbors=2, random_state=42),
    Sampling3=TomekLinks(),
    Sampling4=SMOTETomek(
        smote=SMOTE(k_neighbors=2, random_state=42),
        random_state=42,
    ),
    Sampling5=None,
)


In [8]:
from sklearn.metrics import accuracy_score
import pandas as pd

# Evaluate every (model, sampling technique) pair on each train/test split in
# `samples`, print the per-split accuracy, and build `results` as the accuracy
# averaged over all splits (rows = models, columns = sampling techniques).
#
# Writing each split's accuracy straight into `results` would let every split
# overwrite the previous one, leaving only the last split in the table, so
# the per-split scores are collected first and averaged at the end.
per_split_scores = {
    (model_name, samp_name): []
    for model_name in models
    for samp_name in sampling_methods
}

for s_idx, (Xtr, Xte, ytr, yte) in enumerate(samples):
    print(f"\n===== SAMPLE {s_idx+1} =====")

    for samp_name, sampler in sampling_methods.items():
        print(f"\n--- {samp_name} ---")

        # A sampler of None means "train on the split exactly as it is".
        if sampler is not None:
            X_res, y_res = sampler.fit_resample(Xtr, ytr)
        else:
            X_res, y_res = Xtr, ytr

        for model_name, model in models.items():
            # fit() retrains the estimator, so reusing one instance is fine.
            model.fit(X_res, y_res)
            y_pred = model.predict(Xte)
            acc = accuracy_score(yte, y_pred)

            per_split_scores[(model_name, samp_name)].append(acc)
            print(f"{model_name} -> {acc:.4f}")

# Results table: float-typed so it can be rounded/plotted; a cell stays NaN
# if no split was evaluated for it.
results = pd.DataFrame(
    index=list(models), columns=list(sampling_methods), dtype=float
)
for (model_name, samp_name), scores in per_split_scores.items():
    if scores:
        results.loc[model_name, samp_name] = sum(scores) / len(scores)

print("\n================ FINAL RESULTS TABLE ================")
print(results)



===== SAMPLE 1 =====

--- Sampling1 ---
M1 -> 0.8333
M2 -> 0.8333
M3 -> 0.6667
M4 -> 0.8333
M5 -> 0.6667

--- Sampling2 ---
M1 -> 0.6667
M2 -> 0.5000
M3 -> 0.6667
M4 -> 0.1667
M5 -> 0.3333

--- Sampling3 ---
M1 -> 0.6667
M2 -> 0.3333
M3 -> 0.6667
M4 -> 0.1667
M5 -> 0.5000

--- Sampling4 ---
M1 -> 0.6667
M2 -> 0.6667
M3 -> 0.6667
M4 -> 0.1667
M5 -> 0.6667

--- Sampling5 ---
M1 -> 0.6667
M2 -> 0.3333
M3 -> 0.6667
M4 -> 0.1667
M5 -> 0.5000

===== SAMPLE 2 =====

--- Sampling1 ---
M1 -> 0.8333
M2 -> 0.6667
M3 -> 0.5000
M4 -> 0.6667
M5 -> 0.8333

--- Sampling2 ---
M1 -> 0.8333
M2 -> 0.3333
M3 -> 0.8333
M4 -> 0.6667
M5 -> 0.3333

--- Sampling3 ---
M1 -> 0.8333
M2 -> 0.3333
M3 -> 0.8333
M4 -> 0.1667
M5 -> 0.1667

--- Sampling4 ---
M1 -> 0.5000
M2 -> 0.3333
M3 -> 0.5000
M4 -> 0.6667
M5 -> 0.1667

--- Sampling5 ---
M1 -> 0.8333
M2 -> 0.5000
M3 -> 0.6667
M4 -> 0.1667
M5 -> 0.1667

===== SAMPLE 3 =====

--- Sampling1 ---
M1 -> 0.3333
M2 -> 0.8333
M3 -> 0.6667
M4 -> 0.6667
M5 -> 0.3333

--- Sampl

In [9]:
# Write the accuracy table to results.csv in the current working directory
# and confirm on stdout.
results.to_csv(path_or_buf="results.csv")
print("results.csv saved successfully ✅")


results.csv saved successfully ✅
