<a href="https://colab.research.google.com/github/Sameer0Rai/Sampling_Predictive_Analysis/blob/main/Sampling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

In [20]:
df = pd.read_csv('Creditcard_data.csv')
print(df.shape)
print(df['Class'].value_counts())

(772, 31)
Class
0    763
1      9
Name: count, dtype: int64


In [21]:
major = df[df['Class'] == 0]
minor = df[df['Class'] == 1]

print(len(major), len(minor))

763 9


In [22]:
X = df.drop('Class', axis=1)
y = df['Class']

In [23]:
ros = RandomOverSampler(random_state=42)
X_bal, y_bal = ros.fit_resample(X, y)

In [24]:
df_bal = pd.concat([X_bal, y_bal], axis=1)
print(df_bal['Class'].value_counts())

Class
0    763
1    763
Name: count, dtype: int64


In [25]:
def simple_random_sampling(df, frac=0.7):
    return df.sample(frac=frac, random_state=42)

In [26]:
def stratified_sampling(df, target="Class", frac=0.7):
    return df.groupby(target, group_keys=False).apply(
        lambda x: x.sample(frac=frac, random_state=42)
    )

In [27]:
def systematic_sampling(df, step=2):
    return df.iloc[::step]

In [28]:
def bootstrap_sampling(df):
    return df.sample(frac=1, replace=True, random_state=42)

In [29]:
def cluster_sampling(df, cluster_col="Time"):
    df["cluster"] = pd.qcut(df[cluster_col], q=5, labels=False)
    chosen_cluster = np.random.choice(df["cluster"].unique())
    return df[df["cluster"] == chosen_cluster].drop("cluster", axis=1)

In [30]:
sampling_methods = {
    "SimpleRandom": simple_random_sampling(df_bal),
    "Systematic": systematic_sampling(df_bal),
    "Stratified": stratified_sampling(df_bal),
    "Cluster": cluster_sampling(df_bal),
    "Bootstrap": bootstrap_sampling(df_bal)
}

  return df.groupby(target, group_keys=False).apply(


In [31]:
models = {
    "KNN": KNeighborsClassifier(),
    "SVC": SVC(),
    "LinearSVC": SVC(kernel="linear"),
    "RBFSVC": SVC(kernel="rbf"),
    "Gaussian": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42)
}

In [32]:
results = {}

for samp_name, samp_df in sampling_methods.items():
    X_s = samp_df.drop("Class", axis=1)
    y_s = samp_df["Class"]

    X_train, X_test, y_train, y_test = train_test_split(
        X_s, y_s, test_size=0.3, stratify=y_s, random_state=42
    )

    results[samp_name] = {}

    for model_name, model in models.items():
        pipe = Pipeline([
            ("scaler", StandardScaler()),
            ("model", model)
        ])

        pipe.fit(X_train, y_train)
        y_pred = pipe.predict(X_test)

        acc = accuracy_score(y_test, y_pred)
        results[samp_name][model_name] = round(acc * 100, 2)


results_df = pd.DataFrame(results).T
print("\nAccuracy Table (%):\n")
print(results_df)


Accuracy Table (%):

                KNN     SVC  LinearSVC  RBFSVC  Gaussian  Decision Tree  \
SimpleRandom  97.51   97.82      93.77   97.82     77.26          99.38   
Systematic    96.51   96.94      89.08   96.94     80.79          98.25   
Stratified    95.64   97.51      91.90   97.51     79.75          97.82   
Cluster       98.70  100.00      98.70  100.00    100.00          98.70   
Bootstrap     98.69   98.69      95.63   98.69     85.81          99.56   

              Random Forest  
SimpleRandom          100.0  
Systematic            100.0  
Stratified            100.0  
Cluster               100.0  
Bootstrap             100.0  


In [33]:
results_df.to_csv("result.csv")
print("Results saved to result.csv")

Results saved to result.csv
