<a href="https://colab.research.google.com/github/RishabJalota/Sampling_Assignment/blob/main/Sampling_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Fixing Imbalances**



In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('/content/Creditcard_data.csv')

class_counts = df['Class'].value_counts()

print(class_counts)

0    763
1      9
Name: Class, dtype: int64


In [4]:
from imblearn.over_sampling import RandomOverSampler
target_var = 'Class'
oversampling_rate = 1.0
ros = RandomOverSampler(sampling_strategy=oversampling_rate, random_state=42)
X = df.drop(target_var, axis=1)
y = df[target_var]
X_resampled, y_resampled = ros.fit_resample(X, y)
df_resampled = pd.concat([X_resampled, y_resampled], axis=1)


In [5]:
num_rows, num_cols = df.shape
print("Number of rows in original df: ", num_rows)
print("Number of columns in original df: ", num_cols)

Number of rows in original df:  772
Number of columns in original df:  31


In [6]:
num_rows, num_cols = df_resampled.shape
print("Number of rows in balanced df: ", num_rows)
print("Number of columns in balanced df: ", num_cols)

Number of rows in balanced df:  1526
Number of columns in balanced df:  31


In [7]:
class_counts = df_resampled['Class'].value_counts()

print(class_counts)

0    763
1    763
Name: Class, dtype: int64


In [8]:
shuffled_df = df_resampled.sample(frac=1, random_state=1).reset_index(drop=True)
shuffled_df.to_csv('balanced_df.csv')

Performing Simple Random Sampling

In [9]:
import random
n = int(1526/2)

df_srs = pd.read_csv('balanced_df.csv')
df_srs = df_srs.sample(n)
num_rows, num_cols = df_srs.shape
print("Number of rows in Simple Random Sampling df: ", num_rows)
print("Number of columns in Simple Random Sampling df: ", num_cols)
df_srs.to_csv('df_srs.csv')

Number of rows in Simple Random Sampling df:  763
Number of columns in Simple Random Sampling df:  32


Systematic Sampling

In [10]:
n = int(1526/2)
df = pd.read_csv('balanced_df.csv')
k = int(len(df) / n)
start_idx = k // 2  
idx = range(start_idx, len(df), k)
df_sys_s = df.iloc[idx]
df_sys_s.to_csv('df_sys_s.csv')

Clustering Sampling

In [11]:
import pandas as pd
import random

df = pd.read_csv('/content/balanced_df.csv')

sample_size = 2
selected_clusters = random.sample(list(df['Class'].unique()), sample_size)

proportion = 0.5

sampled_df = df[df['Class'].isin(selected_clusters)].groupby('Class').apply(lambda x: x.sample(frac=proportion))


sampled_df.reset_index(drop=True, inplace=True)


sampled_df.to_csv('df_cluster_s.csv')


In [12]:
convenience_sample = pd.concat([df_resampled.head(380), df_resampled.tail(380)])
convenience_sample.to_csv('df_convenience_s.csv')

Stratified Sampling

In [13]:
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
df = pd.read_csv('/content/balanced_df.csv')
strat_var = 'Class'
test_size = 0.5
splitter = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=42)
for train_idx, test_idx in splitter.split(df, df[strat_var]):
    train_set = df.loc[train_idx]
    test_set = df.loc[test_idx]
train_set.to_csv('df_stratified_s.csv')

Training 

In [16]:
!pip install pycaret

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pycaret
  Downloading pycaret-2.3.10-py3-none-any.whl (320 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.2/320.2 KB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting umap-learn
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.2/88.2 KB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pandas-profiling>=2.8.0
  Downloading pandas_profiling-3.6.6-py2.py3-none-any.whl (324 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m324.4/324.4 KB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m
Collecting kmodes>=0.10.1
  Downloading kmodes-0.12.2-py2.py3-none-any.whl (20 kB)
Collecting spacy<2.4.0
  Downloading spacy-2.3.9-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━

In [1]:

!pip uninstall scikit-learn -y

!pip install scikit-learn==0.23.2


Found existing installation: scikit-learn 0.23.2
Uninstalling scikit-learn-0.23.2:
  Successfully uninstalled scikit-learn-0.23.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-learn==0.23.2
  Using cached scikit_learn-0.23.2-cp38-cp38-manylinux1_x86_64.whl (6.8 MB)
Installing collected packages: scikit-learn
Successfully installed scikit-learn-0.23.2


Running all models and comparing the results

1.Random Sampling

In [14]:
# Import necessary libraries
from pycaret.classification import *
import pandas as pd

# Load the dataset
df=pd.read_csv('df_srs.csv')
df=df.iloc[:,2:]

# Initialize the PyCaret setup
clf = setup(df, target='Class', silent=True,preprocess=False)

# Compare the performance of several models
best_model = compare_models()

# Print the model's performance metrics
print(best_model)



Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.139
rf,Random Forest Classifier,0.9981,1.0,1.0,0.9963,0.9981,0.9963,0.9964,0.301
lightgbm,Light Gradient Boosting Machine,0.9944,1.0,1.0,0.989,0.9943,0.9888,0.9891,0.129
gbc,Gradient Boosting Classifier,0.9888,1.0,1.0,0.9779,0.9886,0.9776,0.9782,0.202
dt,Decision Tree Classifier,0.9869,0.9875,1.0,0.9741,0.9866,0.9738,0.9745,0.021
ada,Ada Boost Classifier,0.9794,1.0,1.0,0.9603,0.9793,0.9589,0.9605,0.197
knn,K Neighbors Classifier,0.9514,0.9875,1.0,0.9094,0.952,0.9031,0.9083,0.016
lr,Logistic Regression,0.9193,0.9438,1.0,0.8588,0.9229,0.8398,0.8525,0.413
lda,Linear Discriminant Analysis,0.8652,0.9187,0.9408,0.8124,0.8683,0.7317,0.748,0.013
ridge,Ridge Classifier,0.8503,0.0,0.9057,0.808,0.8509,0.7015,0.7128,0.017


INFO:logs:create_model_container: 14
INFO:logs:master_model_container: 14
INFO:logs:display_container: 2
INFO:logs:ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                     oob_score=False, random_state=4822, verbose=0,
                     warm_start=False)
INFO:logs:compare_models() succesfully completed......................................


ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                     oob_score=False, random_state=4822, verbose=0,
                     warm_start=False)


2.Systematic Sampling

In [15]:
# Import necessary libraries
from pycaret.classification import *
import pandas as pd

# Load the dataset
df=pd.read_csv('df_sys_s.csv')
df=df.iloc[:,2:]

# Initialize the PyCaret setup
clf = setup(df, target='Class', silent=True,preprocess=False)

# Compare the performance of several models
best_model = compare_models()

# Print the model's performance metrics
print(best_model)


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.9981,1.0,1.0,0.9962,0.998,0.9962,0.9963,0.226
rf,Random Forest Classifier,0.9963,1.0,1.0,0.9925,0.9962,0.9925,0.9927,0.179
ada,Ada Boost Classifier,0.9944,1.0,1.0,0.9889,0.9943,0.9888,0.9891,0.104
gbc,Gradient Boosting Classifier,0.9926,1.0,1.0,0.985,0.9924,0.9851,0.9854,0.193
lightgbm,Light Gradient Boosting Machine,0.9925,1.0,1.0,0.9856,0.9925,0.985,0.9856,0.146
dt,Decision Tree Classifier,0.9888,0.9893,1.0,0.9774,0.9885,0.9776,0.978,0.013
knn,K Neighbors Classifier,0.9626,0.9856,1.0,0.9304,0.9631,0.9254,0.9294,0.02
lr,Logistic Regression,0.9343,0.9445,1.0,0.8849,0.9375,0.8693,0.8792,0.28
lda,Linear Discriminant Analysis,0.8651,0.9334,0.8983,0.8357,0.8643,0.7306,0.7356,0.013
ridge,Ridge Classifier,0.865,0.0,0.8943,0.8401,0.8641,0.7303,0.7357,0.011


INFO:logs:create_model_container: 14
INFO:logs:master_model_container: 14
INFO:logs:display_container: 2
INFO:logs:ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                     oob_score=False, random_state=610, verbose=0,
                     warm_start=False)
INFO:logs:compare_models() succesfully completed......................................


ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                     oob_score=False, random_state=610, verbose=0,
                     warm_start=False)


3.Cluster Sampling

In [16]:
# Import necessary libraries
from pycaret.classification import *
import pandas as pd

# Load the dataset
df=pd.read_csv('df_cluster_s.csv')
df=df.iloc[:,2:]

# Initialize the PyCaret setup
clf = setup(df, target='Class', silent=True,preprocess=False)

# Compare the performance of several models
best_model = compare_models()

# Print the model's performance metrics
print(best_model)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.9981,1.0,1.0,0.9964,0.9982,0.9962,0.9963,0.329
et,Extra Trees Classifier,0.9981,1.0,1.0,0.9964,0.9982,0.9962,0.9963,0.148
gbc,Gradient Boosting Classifier,0.9944,1.0,1.0,0.9893,0.9945,0.9888,0.989,0.196
lightgbm,Light Gradient Boosting Machine,0.9925,1.0,1.0,0.9856,0.9927,0.985,0.9852,0.075
ada,Ada Boost Classifier,0.9887,1.0,1.0,0.9787,0.9891,0.9775,0.978,0.119
dt,Decision Tree Classifier,0.9832,0.9832,1.0,0.9685,0.9838,0.9664,0.9674,0.021
knn,K Neighbors Classifier,0.9626,0.9887,1.0,0.932,0.9645,0.9253,0.9285,0.035
lr,Logistic Regression,0.9308,0.9518,1.0,0.8812,0.9363,0.8614,0.8708,0.185
lda,Linear Discriminant Analysis,0.8542,0.9228,0.9219,0.8163,0.8638,0.7082,0.7198,0.013
ridge,Ridge Classifier,0.8392,0.0,0.8997,0.8091,0.8497,0.6781,0.6877,0.02


INFO:logs:create_model_container: 14
INFO:logs:master_model_container: 14
INFO:logs:display_container: 2
INFO:logs:RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=2792, verbose=0,
                       warm_start=False)
INFO:logs:compare_models() succesfully completed......................................


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=2792, verbose=0,
                       warm_start=False)


4.Stratified Sampling 

In [17]:
# Import necessary libraries
from pycaret.classification import *
import pandas as pd

# Load the dataset
df=pd.read_csv('df_stratified_s.csv')
df=df.iloc[:,2:]

# Initialize the PyCaret setup
clf = setup(df, target='Class', silent=True,preprocess=False)

# Compare the performance of several models
best_model = compare_models()

# Print the model's performance metrics
print(best_model)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.294
et,Extra Trees Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.156
ada,Ada Boost Classifier,0.9944,0.9982,1.0,0.9895,0.9947,0.9887,0.9889,0.256
gbc,Gradient Boosting Classifier,0.9926,1.0,1.0,0.9864,0.993,0.9851,0.9855,0.438
lightgbm,Light Gradient Boosting Machine,0.9852,1.0,1.0,0.9743,0.9864,0.9701,0.9717,0.075
dt,Decision Tree Classifier,0.9776,0.9768,1.0,0.9594,0.9791,0.9551,0.9565,0.013
knn,K Neighbors Classifier,0.9456,0.9864,1.0,0.9069,0.9508,0.8904,0.8967,0.023
lr,Logistic Regression,0.9384,0.9575,1.0,0.8969,0.9449,0.8758,0.8841,0.175
lda,Linear Discriminant Analysis,0.8596,0.9299,0.9205,0.8321,0.8724,0.7175,0.7256,0.015
ridge,Ridge Classifier,0.8485,0.0,0.9098,0.8219,0.8623,0.6951,0.7022,0.011


INFO:logs:create_model_container: 14
INFO:logs:master_model_container: 14
INFO:logs:display_container: 2
INFO:logs:RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=8512, verbose=0,
                       warm_start=False)
INFO:logs:compare_models() succesfully completed......................................


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=8512, verbose=0,
                       warm_start=False)


5.Convenience Sampling

In [18]:
# Import necessary libraries
from pycaret.classification import *
import pandas as pd

# Load the dataset
df=pd.read_csv('df_convenience_s.csv')
df=df.iloc[:,2:]

# Initialize the PyCaret setup
clf = setup(df, target='Class', silent=True,preprocess=False)

# Compare the performance of several models
best_model = compare_models()

# Print the model's performance metrics
print(best_model)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.152
rf,Random Forest Classifier,0.9962,1.0,1.0,0.9925,0.9962,0.9925,0.9926,0.342
lightgbm,Light Gradient Boosting Machine,0.9906,1.0,1.0,0.982,0.9906,0.9811,0.9819,0.076
ada,Ada Boost Classifier,0.9849,1.0,1.0,0.9709,0.9849,0.9698,0.9709,0.19
gbc,Gradient Boosting Classifier,0.9849,1.0,1.0,0.9705,0.9848,0.9698,0.9706,0.192
dt,Decision Tree Classifier,0.9811,0.9817,1.0,0.9638,0.9813,0.9623,0.9636,0.021
knn,K Neighbors Classifier,0.9453,0.9837,1.0,0.9017,0.9473,0.8911,0.8982,0.021
lr,Logistic Regression,0.9284,0.9601,1.0,0.8747,0.932,0.8575,0.8682,0.11
lda,Linear Discriminant Analysis,0.8606,0.9249,0.9097,0.8257,0.8639,0.7219,0.7287,0.014
ridge,Ridge Classifier,0.8605,0.0,0.9097,0.8254,0.8638,0.7217,0.7282,0.018


INFO:logs:create_model_container: 14
INFO:logs:master_model_container: 14
INFO:logs:display_container: 2
INFO:logs:ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                     oob_score=False, random_state=438, verbose=0,
                     warm_start=False)
INFO:logs:compare_models() succesfully completed......................................


ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                     oob_score=False, random_state=438, verbose=0,
                     warm_start=False)
