In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# import train_test_split
from sklearn.model_selection import train_test_split
# import accuracy_score
from sklearn.metrics import accuracy_score


In [17]:
# read the csv file
df = pd.read_csv('Creditcard_data.csv')
# standardize the amount and time columns
from sklearn.preprocessing import StandardScaler
df['Amount'] = StandardScaler().fit_transform(df['Amount'].values.reshape(-1, 1))
df['Time'] = StandardScaler().fit_transform(df['Time'].values.reshape(-1, 1))

X_train, X_test, y_train, y_test = train_test_split(df.drop('Class', axis=1), df['Class'], test_size=0.3, random_state=42)





In [18]:
# balance the training data using oversampling
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42)
X_train, y_train = sm.fit_resample(X_train, y_train)
# create a data frame having X_train and Y_train
df_train = pd.concat([X_train, y_train], axis=1)
# know about 31st column of the data frame
df_train.iloc[:, 30].value_counts()

1    534
0    534
Name: Class, dtype: int64

In [19]:

def random_sampling(df, sample_size):

    df_sample = df.sample(n=sample_size, random_state=1)
    return df_sample

# create a function for systematic sampling

def systematic_sampling(data, sample_size):
    # Calculate the sampling interval
    n = len(data)
    k = np.ceil(n / sample_size)
    
    # Create the indices for the sample
    indices = np.arange(0, n, k)
    
    # Sample the data
    sample = data.iloc[indices[:sample_size]]
    
    return sample

# create a function for stratified sampling


from sklearn.model_selection import StratifiedShuffleSplit

def stratified_sampling(df, sample_size):
  
    target_col = df.columns[-1]
    strata_cols = df.columns[:-1]

    # Create a StratifiedShuffleSplit object
    sss = StratifiedShuffleSplit(n_splits=1, test_size=sample_size, random_state=42)

    # Apply the StratifiedShuffleSplit to the entire DataFrame
    X = df.drop(target_col, axis=1)
    y = df[target_col]
    for train_index, test_index in sss.split(X, y):
        sampled_df = df.iloc[test_index]

    return sampled_df


# create a function for cluster sampling

def cluster_sampling(data, sample_size):
    # Calculate the number of clusters
    n = len(data)
    k = np.ceil(n / sample_size)
    
    # Calculate the cluster size
    cluster_size = np.ceil(n / k)
    
    # Create the clusters
    clusters = [data.iloc[int(i * cluster_size):int((i + 1) * cluster_size)] for i in range(int(k))]
    
    # Sample the clusters
    sample = pd.concat([cluster.sample(n=1, random_state=42) for cluster in clusters])
    
    return sample

# create a function for weighted sampling

def weighted_sampling(data, sample_size):
    # Create the weighted sample
    sample = data.groupby('Class', group_keys=False).apply(lambda x: x.sample(min(len(x), sample_size)))
    
    return sample




def random_sample_size():
    z=1.96
    p=0.5
    e=0.05
    n=(z**2*p*(1-p))/(e**2)
    return int(n)


def systematic_sample_size_z(population_size):
    # Calculate the population standard deviation (we assume we don't know it)
    p= 0.5  
    z=1.96
    e=0.05
  # Calculate the sample size
    n = (z**2 * population_size * p * (1 - p)) / ((z**2 * p * (1 - p)) + ((e / 2)**2 * (population_size - 1)))
    return int(n)

def stratified_sampling_size():
    z=1.96
    p=0.5
    e=0.15
    s=2
    q=e/s
    n=(z**2*p*(1-p))/(q**2)
    return int(n)

def cluster_sampling_size():
    z=1.96
    p=0.5
    e=0.15
    c=2
    q=e/c
    n=(z**2*p*(1-p))/(q**2)
    return int(n)

def weighted_sampling_size():
    z=1.96
    p=0.5
    e=0.15
    n=(z**2*p*(1-p))/(e**2)
    return int(n)


In [20]:
# apply random sampling
df_random = random_sampling(df_train, random_sample_size())
print(random_sample_size())
print(df_random.shape)
# apply systematic sampling
df_systematic = systematic_sampling(df_train, systematic_sample_size_z(len(df)))
print(df_systematic.shape)

# apply 5 ml models on df_random
# import the models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
# from sklearn.naive_bayes import GaussianNB
df_result=pd.DataFrame()
models=['LogisticRegression','DecisionTreeClassifier','RandomForestClassifier','SVM','KNeighborsClassifier']
df_result['models']=models
list_accuracy=[]

X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(df_random.drop('Class', axis=1), df_random['Class'], test_size=0.0000001, random_state=42)

# apply logistic regression 
lr = LogisticRegression()
lr.fit(X_train_r, y_train_r)
# accuracy score
#round to score 2 decimal places and append to list_accuracy
list_accuracy.append(lr.score(X_test, y_test))

# apply decision tree
dt = DecisionTreeClassifier()
dt.fit(X_train_r, y_train_r)
list_accuracy.append(dt.score(X_test, y_test))

# apply random forest
rf = RandomForestClassifier()
rf.fit(X_train_r, y_train_r)
list_accuracy.append(rf.score(X_test, y_test))

# apply svm
svm = SVC()
svm.fit(X_train_r, y_train_r)
list_accuracy.append(svm.score(X_test, y_test))

# apply knn
knn = KNeighborsClassifier()
knn.fit(X_train_r, y_train_r)
list_accuracy.append(knn.score(X_test, y_test))

df_result['Sampling1']=list_accuracy
list_accuracy=[]

# apply 5 ml models on df_systematic
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(df_systematic.drop('Class', axis=1), df_systematic['Class'], test_size=0.0000001, random_state=42)

# apply logistic regression
lr = LogisticRegression()
lr.fit(X_train_s, y_train_s)
list_accuracy.append(lr.score(X_test, y_test))

# apply decision tree
dt = DecisionTreeClassifier()
dt.fit(X_train_s, y_train_s)
list_accuracy.append(dt.score(X_test, y_test))

# apply random forest
rf = RandomForestClassifier()
rf.fit(X_train_s, y_train_s)
list_accuracy.append(rf.score(X_test, y_test))

# apply svm
svm = SVC()
svm.fit(X_train_s, y_train_s)
list_accuracy.append(svm.score(X_test, y_test))

# apply knn
knn = KNeighborsClassifier()
knn.fit(X_train_s, y_train_s)
list_accuracy.append(knn.score(X_test, y_test))

df_result['Sampling2']=list_accuracy
df_result




384
(384, 31)
(356, 31)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Unnamed: 0,models,Sampling1,Sampling2
0,LogisticRegression,0.875,0.887931
1,DecisionTreeClassifier,0.943966,0.948276
2,RandomForestClassifier,0.982759,0.982759
3,SVM,0.918103,0.909483
4,KNeighborsClassifier,0.818966,0.801724


In [21]:
# apply stratified sampling
df_stratified = stratified_sampling(df_train, stratified_sampling_size())
list_accuracy=[]

#apply 5 ml models on df_stratified
X_train_st, X_test_st, y_train_st, y_test_st = train_test_split(df_stratified.drop('Class', axis=1), df_stratified['Class'], test_size=0.0000001, random_state=42)

# apply logistic regression
lr = LogisticRegression()
lr.fit(X_train_st, y_train_st)
list_accuracy.append(lr.score(X_test, y_test))

# apply decision tree
dt = DecisionTreeClassifier()
dt.fit(X_train_st, y_train_st)
list_accuracy.append(dt.score(X_test, y_test))

# apply random forest
rf = RandomForestClassifier()
rf.fit(X_train_st, y_train_st)
list_accuracy.append(rf.score(X_test, y_test))

# apply svm
svm = SVC()
svm.fit(X_train_st, y_train_st)
list_accuracy.append(svm.score(X_test, y_test))

# apply knn
knn = KNeighborsClassifier()
knn.fit(X_train_st, y_train_st)
list_accuracy.append(knn.score(X_test, y_test))

df_result['Sampling3']=list_accuracy
df_result







  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Unnamed: 0,models,Sampling1,Sampling2,Sampling3
0,LogisticRegression,0.875,0.887931,0.866379
1,DecisionTreeClassifier,0.943966,0.948276,0.961207
2,RandomForestClassifier,0.982759,0.982759,0.987069
3,SVM,0.918103,0.909483,0.913793
4,KNeighborsClassifier,0.818966,0.801724,0.715517


In [22]:
# apply cluster sampling
df_cluster = cluster_sampling(df_train, cluster_sampling_size())
list_accuracy=[]
#apply 5 ml models on df_cluster
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(df_cluster.drop('Class', axis=1), df_cluster['Class'], test_size=0.0000001, random_state=42)

# apply logistic regression
lr = LogisticRegression()
lr.fit(X_train_c, y_train_c)
list_accuracy.append(lr.score(X_test, y_test))

# apply decision tree
dt = DecisionTreeClassifier()
dt.fit(X_train_c, y_train_c)
list_accuracy.append(dt.score(X_test, y_test))

# apply random forest
rf = RandomForestClassifier()
rf.fit(X_train_c, y_train_c)
list_accuracy.append(rf.score(X_test, y_test))

# apply svm
svm = SVC()
svm.fit(X_train_c, y_train_c)
list_accuracy.append(svm.score(X_test, y_test))

# apply knn
knn = KNeighborsClassifier()
knn.fit(X_train_c, y_train_c)
list_accuracy.append(knn.score(X_test, y_test))

df_result['Sampling4']=list_accuracy
df_result


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Unnamed: 0,models,Sampling1,Sampling2,Sampling3,Sampling4
0,LogisticRegression,0.875,0.887931,0.866379,0.077586
1,DecisionTreeClassifier,0.943966,0.948276,0.961207,0.547414
2,RandomForestClassifier,0.982759,0.982759,0.987069,0.034483
3,SVM,0.918103,0.909483,0.913793,0.012931
4,KNeighborsClassifier,0.818966,0.801724,0.715517,0.012931


In [23]:
# apply weighted sampling
df_weighted = weighted_sampling(df_train, weighted_sampling_size())
list_accuracy=[]

#apply 5 ml models on df_weighted
X_train_w, X_test_w, y_train_w, y_test_w = train_test_split(df_weighted.drop('Class', axis=1), df_weighted['Class'], test_size=0.0000001, random_state=42)

# apply logistic regression
lr = LogisticRegression()
lr.fit(X_train_w, y_train_w)
list_accuracy.append(lr.score(X_test, y_test))

# apply decision tree
dt = DecisionTreeClassifier()
dt.fit(X_train_w, y_train_w)
list_accuracy.append(dt.score(X_test, y_test))

# apply random forest
rf = RandomForestClassifier()
rf.fit(X_train_w, y_train_w)
list_accuracy.append(rf.score(X_test, y_test))

# apply svm
svm = SVC()
svm.fit(X_train_w, y_train_w)
list_accuracy.append(svm.score(X_test, y_test))

# apply knn
knn = KNeighborsClassifier()
knn.fit(X_train_w, y_train_w)
list_accuracy.append(knn.score(X_test, y_test))

df_result['Sampling5']=list_accuracy
df_result

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Unnamed: 0,models,Sampling1,Sampling2,Sampling3,Sampling4,Sampling5
0,LogisticRegression,0.875,0.887931,0.866379,0.077586,0.767241
1,DecisionTreeClassifier,0.943966,0.948276,0.961207,0.547414,0.836207
2,RandomForestClassifier,0.982759,0.982759,0.987069,0.034483,0.943966
3,SVM,0.918103,0.909483,0.913793,0.012931,0.827586
4,KNeighborsClassifier,0.818966,0.801724,0.715517,0.012931,0.469828
