### **SAMPLING**

In [1]:
import pandas as pd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
from sklearn.preprocessing import normalize

In [2]:
url = "https://raw.githubusercontent.com/AnjulaMehto/Sampling_Assignment/main/Creditcard_data.csv"
df = pd.read_csv(url)

In [3]:
df.Class.value_counts()

0    763
1      9
Name: Class, dtype: int64

In [4]:
Amount = normalize([df['Amount']])[0]
df['Amount'] = Amount
df = df.iloc[:, 1:]
df.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0.025729,0
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,0.000463,1
2,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,0.065115,0
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0.021237,0
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,0.012036,0


In [5]:
x = df.drop('Class', axis=1)
y = df['Class']

sampler = RandomOverSampler(sampling_strategy=0.95)
x_resample, y_resample = sampler.fit_resample(x, y)

print(y_resample.value_counts())

0    763
1    724
Name: Class, dtype: int64


In [6]:
resample = pd.concat([x_resample, y_resample], axis=1)
resample

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0.025729,0
1,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,0.000463,1
2,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,0.065115,0
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0.021237,0
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,0.012036,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1482,-0.928088,0.398194,1.741131,0.182673,0.966387,-0.901004,0.879016,-0.156590,-0.142117,-0.574775,...,0.066353,0.281378,-0.257966,0.385384,0.391117,-0.453853,-0.104448,-0.125765,0.000172,1
1483,-2.312227,1.951992,-1.609851,3.997906,-0.522188,-1.426545,-2.537387,1.391657,-2.770089,-2.772272,...,0.517232,-0.035049,-0.465211,0.320198,0.044519,0.177840,0.261145,-0.143276,0.000000,1
1484,1.257719,0.364739,0.306923,0.690638,-0.357792,-1.067481,0.094272,-0.210300,0.014455,-0.286012,...,-0.286856,-0.820658,0.127663,0.343128,0.221120,0.094391,-0.022189,0.030944,0.000222,1
1485,-2.312227,1.951992,-1.609851,3.997906,-0.522188,-1.426545,-2.537387,1.391657,-2.770089,-2.772272,...,0.517232,-0.035049,-0.465211,0.320198,0.044519,0.177840,0.261145,-0.143276,0.000000,1


## SIMPLE RANDOM SAMPLING

In [7]:
n = int((1.96*1.96 * 0.5*0.5)/(0.05**2))
SimpleSampling = resample.sample(n=n, random_state=42)
SimpleSampling.shape

(384, 30)

In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score


X = SimpleSampling.drop('Class', axis=1)
y = SimpleSampling['Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_model = RandomForestClassifier(random_state=42)
lr_model = LogisticRegression()
nb_model = GaussianNB()
dt_model = DecisionTreeClassifier(random_state=42)
knn_model = KNeighborsClassifier()

models = [rf_model, lr_model, nb_model, dt_model, knn_model]
model_names = ['Random Forest', 'Logistic Regression', 'Naive Bayes', 'Decision Trees', 'KNN']
results = []
accuracies = []

for model, name in zip(models, model_names):
    model.fit(X_train, y_train)


    X_test_np = np.array(X_test)

    y_pred = model.predict(X_test_np)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)
    results.append({'Model': name, 'Accuracy': accuracy})
    print(f"{name} : {accuracy:.4f}")


Random Forest : 1.0000
Logistic Regression : 0.8701
Naive Bayes : 0.7273
Decision Trees : 0.9610
KNN : 0.8701




## SYSTEMATIC SAMPLING

In [9]:
import random

SystematicSampling = resample.sample(frac=1, random_state=42).reset_index(drop=True)

sampling_interval = 2
SystematicSample = SystematicSampling.iloc[::sampling_interval]
SystematicSample.shape

(744, 30)

In [10]:
X = SystematicSample.drop('Class', axis=1)
y = SystematicSample['Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_model = RandomForestClassifier(random_state=42)
lr_model = LogisticRegression()
nb_model = GaussianNB()
dt_model = DecisionTreeClassifier(random_state=42)
knn_model = KNeighborsClassifier()

models = [rf_model, lr_model, nb_model, dt_model, knn_model]
model_names = ['Random Forest', 'Logistic Regression', 'Naive Bayes', 'Decision Trees', 'KNN']

accuracies = []

for model, name in zip(models, model_names):
    model.fit(X_train, y_train)

    if isinstance(model, KNeighborsClassifier):
        X_test_array = X_test.values  # Convert DataFrame to NumPy array
        y_pred = model.predict(X_test_array)
    else:
        y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)
    results.append({'Model': name, 'Accuracy': accuracy})
    print(f"{name} : {accuracy:.4f}")

Random Forest : 1.0000
Logistic Regression : 0.8993
Naive Bayes : 0.6779
Decision Trees : 1.0000
KNN : 0.9329




## CLUSTER SAMPLING

In [11]:
from sklearn.cluster import KMeans

num_clusters = 10

kmeans = KMeans(n_clusters=num_clusters, n_init='auto', random_state=42)

clusters = kmeans.fit_predict(resample)
clusters = pd.Series(clusters)

selected_clusters = random.sample(range(num_clusters), 3)
ClusterSample = resample.loc[clusters.isin(selected_clusters)]
print(ClusterSample.shape)

(377, 30)


In [12]:
X = ClusterSample.drop('Class', axis=1)
y = ClusterSample['Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_model = RandomForestClassifier(random_state=42)
lr_model = LogisticRegression()
nb_model = GaussianNB()
dt_model = DecisionTreeClassifier(random_state=42)
knn_model = KNeighborsClassifier()

models = [rf_model, lr_model, nb_model, dt_model, knn_model]
model_names = ['Random Forest', 'Logistic Regression', 'Naive Bayes', 'Decision Trees', 'KNN']

accuracies = []

for model, name in zip(models, model_names):
    model.fit(X_train, y_train)

    if isinstance(model, KNeighborsClassifier):
        X_test_array = X_test.values.astype(float)  # Convert DataFrame to NumPy array
        y_pred = model.predict(X_test_array)
    else:
        y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)
    results.append({'Model': name, 'Accuracy': accuracy})
    print(f"{name} : {accuracy:.4f}")

Random Forest : 1.0000
Logistic Regression : 0.9868
Naive Bayes : 1.0000
Decision Trees : 1.0000
KNN : 0.9868




## STRATIFIED SAMPLING

In [13]:
n = int((1.96*1.96 * 0.5*0.5)/((0.05)**2))
StratifiedSampling = resample.groupby('Class')
StratifiedSample=StratifiedSampling.sample(frac= 0.45)
StratifiedSample.shape

(669, 30)

In [14]:
X = StratifiedSample.drop('Class', axis=1)
y = StratifiedSample['Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_model = RandomForestClassifier(random_state=42)
lr_model = LogisticRegression()
nb_model = GaussianNB()
dt_model = DecisionTreeClassifier(random_state=42)
knn_model = KNeighborsClassifier()

models = [rf_model, lr_model, nb_model, dt_model, knn_model]
model_names = ['Random Forest', 'Logistic Regression', 'Naive Bayes', 'Decision Trees', 'KNN']

accuracies = []

for model, name in zip(models, model_names):
    model.fit(X_train, y_train)

    if isinstance(model, KNeighborsClassifier):
        X_test_array = X_test.values.astype(float)  # Convert DataFrame to NumPy array
        y_pred = model.predict(X_test_array)
    else:
        y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)
    results.append({'Model': name, 'Accuracy': accuracy})
    print(f"{name} : {accuracy:.4f}")

Random Forest : 1.0000
Logistic Regression : 0.9254
Naive Bayes : 0.7313
Decision Trees : 0.9851
KNN : 0.9776




## BOOTSTRAP SAMPLING

In [15]:
n_bootstrap = 100
desired_sample_size = 400
BootstrapSamples = pd.DataFrame()
for _ in range(n_bootstrap):
    resampled_data = resample.sample(n=len(df), replace=True, random_state=42)
    BootstrapSamples = pd.concat([BootstrapSamples, resampled_data])
    if BootstrapSamples.shape[0] >= desired_sample_size:
        break
BootstrapSamples = BootstrapSamples.iloc[:desired_sample_size, :]
print("Final Shape of Bootstrap Samples DataFrame:", BootstrapSamples.shape)

Final Shape of Bootstrap Samples DataFrame: (400, 30)


In [16]:
X = BootstrapSamples.drop('Class', axis=1)
y = BootstrapSamples['Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_model = RandomForestClassifier(random_state=42)
lr_model = LogisticRegression()
nb_model = GaussianNB()
dt_model = DecisionTreeClassifier(random_state=42)
knn_model = KNeighborsClassifier()

models = [rf_model, lr_model, nb_model, dt_model, knn_model]
model_names = ['Random Forest', 'Logistic Regression', 'Naive Bayes', 'Decision Trees', 'KNN']

accuracies = []

for model, name in zip(models, model_names):
    model.fit(X_train, y_train)

    if isinstance(model, KNeighborsClassifier):
        X_test_array = X_test.values.astype(float)  # Convert DataFrame to NumPy array
        y_pred = model.predict(X_test_array)
    else:
        y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)
    results.append({'Model': name, 'Accuracy': accuracy})
    print(f"{name} : {accuracy:.4f}")

Random Forest : 1.0000
Logistic Regression : 0.9250
Naive Bayes : 0.7375
Decision Trees : 0.9625
KNN : 0.9375




In [18]:
import pandas as pd
from tabulate import tabulate
from io import StringIO

# Provided data
data = {
    'Sample Technique': ['Simple Random Sampling', 'Systematic Sampling', 'Cluster Sampling', 'Stratified Sampling', 'Bootstrap Sampling'],
    'Random Forest': [0.9870, 1.0000, 1.0000, 1.0000, 1.0000],
    'Logistic Regression': [0.8831, 0.8926, 0.9670, 0.9030, 0.9250],
    'Naive Bayes': [0.7013, 0.7450, 1.0000, 0.7239, 0.7500],
    'Decision Trees': [0.9610, 1.0000, 1.0000, 0.9925, 0.9625],
    'KNN': [0.8701, 0.9329, 0.9890, 0.9552, 0.9375]
}

# Create a DataFrame
df = pd.DataFrame(data)

# Save DataFrame to CSV
df.to_csv('Sampling_output.csv', index=False)

# Display the CSV file content using tabulate
with open('Sampling_output.csv', 'r') as file:
    csv_content = file.read()
    table = tabulate(pd.read_csv(StringIO(csv_content)), headers='keys', tablefmt='pretty')
    print(table)

+---+------------------------+---------------+---------------------+-------------+----------------+--------+
|   |    Sample Technique    | Random Forest | Logistic Regression | Naive Bayes | Decision Trees |  KNN   |
+---+------------------------+---------------+---------------------+-------------+----------------+--------+
| 0 | Simple Random Sampling |     0.987     |       0.8831        |   0.7013    |     0.961      | 0.8701 |
| 1 |  Systematic Sampling   |      1.0      |       0.8926        |    0.745    |      1.0       | 0.9329 |
| 2 |    Cluster Sampling    |      1.0      |        0.967        |     1.0     |      1.0       | 0.989  |
| 3 |  Stratified Sampling   |      1.0      |        0.903        |   0.7239    |     0.9925     | 0.9552 |
| 4 |   Bootstrap Sampling   |      1.0      |        0.925        |    0.75     |     0.9625     | 0.9375 |
+---+------------------------+---------------+---------------------+-------------+----------------+--------+
