<a href="https://colab.research.google.com/github/Noor291/Sampling/blob/main/Sampling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import random
import math
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Load the credit-card transactions; the 'Class' column is used as the binary target below.
ccdata = pd.read_csv("Creditcard_data.csv")
# Preview the first five rows.
print(ccdata.head(5))

   Time        V1        V2        V3        V4        V5        V6        V7  \
0     0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1     0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2     1 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3     1 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4     2 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       V27       V28 

In [None]:
# Goal: turn this data set into a class-balanced one.
# First inspect the imbalance: number of rows carrying each 'Class' label.
class_counts = ccdata['Class'].value_counts()
print(class_counts[0])
print(class_counts[1])
# Keep the rows of each label in their own frame.
data0 = ccdata[ccdata['Class'] == 0]
data1 = ccdata[ccdata['Class'] == 1]

763
9


In [None]:
# Oversample with SMOTE so that both labels end up with the same number of rows.
# NOTE(review): resampling happens before any train/test split, so synthetic rows
# derived from a point can land on both sides of a later split - confirm this is intended.
x = ccdata.iloc[:, :-1]  # features: every column except the last
y = ccdata.iloc[:, -1]   # target: the last column
sm = SMOTE(random_state=58)
xn, yn = sm.fit_resample(x, y)
# Report the per-label row counts after resampling.
for label in (0, 1):
    print((yn == label).sum())

763
763


In [None]:
# Balanced data set: the resampled features with the resampled labels attached as 'Class'.
df = pd.DataFrame(xn).assign(Class=yn)

In [None]:
# Build the five samples; each one is appended to `samples` in turn.
# 1. Simple random sampling
# Sample size n = z^2 * p * (1 - p) / E^2, rounded up.
z = 1.96   # z-score used in the formula
p = 0.5    # assumed proportion
E = 0.05   # margin of error
numerator = z*z*p*(1-p)
sample_size = math.ceil(numerator/(E*E))
# Draw the rows without replacement; the seed keeps the draw repeatable.
s1 = df.sample(n=sample_size, random_state=0)
samples = []
samples.append(s1)

In [None]:
#Cluster Sampling
# The rows are shuffled and cut into K equally sized groups ("clusters"); whole
# clusters are then drawn at random and their rows form the sample.
z=1.96
p=0.5
E=0.05
C=1.5
# Dividing the margin of error by C scales the required size by C**2.
sample_size = round(((z**2)*p*(1-p))/((E/C)**2))
num_select_clusters=2
cluster_seed=0  # fixed seed so both the partition and the draw are reproducible

N = len(df)
# Cap the cluster size at the data size so at least one cluster is always formed
# (otherwise K would be 0 and there would be nothing to select from).
sample_size = min(sample_size, N)
K = N // sample_size

# Shuffle once and label consecutive runs of `sample_size` rows as clusters 0..K-1.
# Rows beyond K full clusters are left out, exactly as a repeated
# sample-and-drop loop would leave them.
shuffled = df.sample(frac=1, random_state=cluster_seed)
data = shuffled.iloc[:K*sample_size].assign(cluster=np.repeat(np.arange(K), sample_size))
df_new = df.drop(index=data.index)  # rows that did not fit into a full cluster

# Draw distinct clusters (without replacement), never more than exist.
rng = np.random.default_rng(cluster_seed)
random_chosen_clusters = rng.choice(K, size=min(num_select_clusters, K), replace=False)
# Build a new frame instead of dropping in place on a filtered slice.
s2 = data[data.cluster.isin(random_chosen_clusters)].drop(columns=['cluster'])
samples.append(s2)

In [None]:
# Systematic sampling: take every k-th row starting from the first one,
# with the step k set to the integer part of sqrt(number of rows).
n = len(df)
k = int(math.sqrt(n))
row_positions = np.arange(0, n, k)
s3 = df.iloc[row_positions]
samples.append(s3)

In [None]:
#stratified sampling
# Draw 60% of the rows of each 'Class' label so the sample keeps the label proportions.
# GroupBy.sample keeps every column (including 'Class') and the original row index,
# and the fixed seed makes the draw reproducible.
s4 = df.groupby('Class').sample(frac=0.6, random_state=0)
samples.append(s4)

In [None]:
# Convenience sampling: simply take the first 400 rows as they appear in the frame.
s5 = df.iloc[:400]
samples.append(s5)

In [None]:
# Empty results grid: one column per sampling technique, one row per classifier.
# The cells are filled in with test accuracies (in percent) by the evaluation cells.
headings = [
    'Simple Random',
    'Cluster',
    'Systematic',
    'Stratified',
    'Convenience',
]
model_names = [
    'Decision Tree',
    'KNN',
    'Logistic Regression',
    'SVM',
    'Random Forest',
]
ans = pd.DataFrame(index=model_names, columns=headings)
ans

Unnamed: 0,Simple Random,Cluster,Systematic,Stratified,Convenience
Decision Tree,,,,,
KNN,,,,,
Logistic Regression,,,,,
SVM,,,,,
Random Forest,,,,,


In [None]:
#SAMPLE1
# Evaluate the five classifiers on samples[0] and store each test accuracy
# (in percent) in column 0 of `ans`.
sample_col = 0
x_s = samples[sample_col].drop('Class', axis=1)
y_s = samples[sample_col]['Class']
# Hold out 25% of the sample for testing; the fixed seed keeps the split repeatable.
xtrain, xtest, y_train, y_test = train_test_split(x_s, y_s, random_state=104, test_size=0.25, shuffle=True)

# NOTE(review): KNN and SVC are distance/kernel based and the features are used
# unscaled here - consider standardising them before fitting.
clf_entropy = DecisionTreeClassifier(criterion="entropy", random_state=100, max_depth=3, min_samples_leaf=5)
knn = KNeighborsClassifier(n_neighbors=7)
classifier = LogisticRegression(random_state=0, max_iter=2000)
svm_clf = SVC(kernel='rbf')
# Seeded so the forest's reported accuracy is the same on every run.
clf = RandomForestClassifier(n_estimators=100, random_state=0)

# The list order matches the row order of `ans`:
# Decision Tree, KNN, Logistic Regression, SVM, Random Forest.
for row, model in enumerate([clf_entropy, knn, classifier, svm_clf, clf]):
    model.fit(xtrain, y_train)
    y_pred = model.predict(xtest)
    acc = accuracy_score(y_test, y_pred)
    ans.iloc[row, sample_col] = acc*100

In [None]:
#SAMPLE2
# Evaluate the five classifiers on samples[1] and store each test accuracy
# (in percent) in column 1 of `ans`.
sample_col = 1
x_s = samples[sample_col].drop('Class', axis=1)
y_s = samples[sample_col]['Class']
# Hold out 25% of the sample for testing; the fixed seed keeps the split repeatable.
xtrain, xtest, y_train, y_test = train_test_split(x_s, y_s, random_state=104, test_size=0.25, shuffle=True)

# NOTE(review): KNN and SVC are distance/kernel based and the features are used
# unscaled here - consider standardising them before fitting.
clf_entropy = DecisionTreeClassifier(criterion="entropy", random_state=100, max_depth=3, min_samples_leaf=5)
knn = KNeighborsClassifier(n_neighbors=7)
classifier = LogisticRegression(random_state=0, max_iter=2000)
svm_clf = SVC(kernel='rbf')
# Seeded so the forest's reported accuracy is the same on every run.
clf = RandomForestClassifier(n_estimators=100, random_state=0)

# The list order matches the row order of `ans`:
# Decision Tree, KNN, Logistic Regression, SVM, Random Forest.
for row, model in enumerate([clf_entropy, knn, classifier, svm_clf, clf]):
    model.fit(xtrain, y_train)
    y_pred = model.predict(xtest)
    acc = accuracy_score(y_test, y_pred)
    ans.iloc[row, sample_col] = acc*100

In [None]:
#SAMPLE3
# Evaluate the five classifiers on samples[2] and store each test accuracy
# (in percent) in column 2 of `ans`.
sample_col = 2
x_s = samples[sample_col].drop('Class', axis=1)
y_s = samples[sample_col]['Class']
# Hold out 25% of the sample for testing; the fixed seed keeps the split repeatable.
xtrain, xtest, y_train, y_test = train_test_split(x_s, y_s, random_state=104, test_size=0.25, shuffle=True)

# NOTE(review): KNN and SVC are distance/kernel based and the features are used
# unscaled here - consider standardising them before fitting.
clf_entropy = DecisionTreeClassifier(criterion="entropy", random_state=100, max_depth=3, min_samples_leaf=5)
knn = KNeighborsClassifier(n_neighbors=7)
classifier = LogisticRegression(random_state=0, max_iter=2000)
svm_clf = SVC(kernel='rbf')
# Seeded so the forest's reported accuracy is the same on every run.
clf = RandomForestClassifier(n_estimators=100, random_state=0)

# The list order matches the row order of `ans`:
# Decision Tree, KNN, Logistic Regression, SVM, Random Forest.
for row, model in enumerate([clf_entropy, knn, classifier, svm_clf, clf]):
    model.fit(xtrain, y_train)
    y_pred = model.predict(xtest)
    acc = accuracy_score(y_test, y_pred)
    ans.iloc[row, sample_col] = acc*100

In [None]:
#SAMPLE4
# Evaluate the five classifiers on samples[3] and store each test accuracy
# (in percent) in column 3 of `ans`.
sample_col = 3
x_s = samples[sample_col].drop('Class', axis=1)
y_s = samples[sample_col]['Class']
# Hold out 25% of the sample for testing; the fixed seed keeps the split repeatable.
xtrain, xtest, y_train, y_test = train_test_split(x_s, y_s, random_state=104, test_size=0.25, shuffle=True)

# NOTE(review): KNN and SVC are distance/kernel based and the features are used
# unscaled here - consider standardising them before fitting.
clf_entropy = DecisionTreeClassifier(criterion="entropy", random_state=100, max_depth=3, min_samples_leaf=5)
knn = KNeighborsClassifier(n_neighbors=7)
classifier = LogisticRegression(random_state=0, max_iter=2000)
svm_clf = SVC(kernel='rbf')
# Seeded so the forest's reported accuracy is the same on every run.
clf = RandomForestClassifier(n_estimators=100, random_state=0)

# The list order matches the row order of `ans`:
# Decision Tree, KNN, Logistic Regression, SVM, Random Forest.
for row, model in enumerate([clf_entropy, knn, classifier, svm_clf, clf]):
    model.fit(xtrain, y_train)
    y_pred = model.predict(xtest)
    acc = accuracy_score(y_test, y_pred)
    ans.iloc[row, sample_col] = acc*100

In [None]:
#SAMPLE5
# Evaluate the five classifiers on samples[4] and store each test accuracy
# (in percent) in column 4 of `ans`.
sample_col = 4
x_s = samples[sample_col].drop('Class', axis=1)
y_s = samples[sample_col]['Class']
# Hold out 25% of the sample for testing; the fixed seed keeps the split repeatable.
xtrain, xtest, y_train, y_test = train_test_split(x_s, y_s, random_state=104, test_size=0.25, shuffle=True)

# NOTE(review): KNN and SVC are distance/kernel based and the features are used
# unscaled here - consider standardising them before fitting.
clf_entropy = DecisionTreeClassifier(criterion="entropy", random_state=100, max_depth=3, min_samples_leaf=5)
knn = KNeighborsClassifier(n_neighbors=7)
classifier = LogisticRegression(random_state=0, max_iter=2000)
svm_clf = SVC(kernel='rbf')
# Seeded so the forest's reported accuracy is the same on every run.
clf = RandomForestClassifier(n_estimators=100, random_state=0)

# The list order matches the row order of `ans`:
# Decision Tree, KNN, Logistic Regression, SVM, Random Forest.
for row, model in enumerate([clf_entropy, knn, classifier, svm_clf, clf]):
    model.fit(xtrain, y_train)
    y_pred = model.predict(xtest)
    acc = accuracy_score(y_test, y_pred)
    ans.iloc[row, sample_col] = acc*100

In [None]:
print(ans)

                    Simple Random    Cluster Systematic Stratified Convenience
Decision Tree            85.56701   95.37037       70.0  90.829694        99.0
KNN                     72.164948   79.62963       40.0  80.786026        99.0
Logistic Regression     86.597938  93.055556       80.0  92.139738        98.0
SVM                      76.28866  67.592593       60.0  70.742358        99.0
Random Forest           98.969072  99.537037       80.0      100.0        99.0
