<a href="https://colab.research.google.com/github/Shivi1771/Sampling-Assignment/blob/main/Sampling_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Fixing Imbalances**



In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('/content/Creditcard_data.csv')

class_counts = df['Class'].value_counts()

print(class_counts)

0    763
1      9
Name: Class, dtype: int64


In [3]:
from imblearn.over_sampling import RandomOverSampler
target_var = 'Class'
oversampling_rate = 1.0
ros = RandomOverSampler(sampling_strategy=oversampling_rate, random_state=42)
X = df.drop(target_var, axis=1)
y = df[target_var]
X_resampled, y_resampled = ros.fit_resample(X, y)
df_resampled = pd.concat([X_resampled, y_resampled], axis=1)


In [4]:
num_rows, num_cols = df.shape
print("Number of rows in original df: ", num_rows)
print("Number of columns in original df: ", num_cols)

Number of rows in original df:  772
Number of columns in original df:  31


In [5]:
num_rows, num_cols = df_resampled.shape
print("Number of rows in balanced df: ", num_rows)
print("Number of columns in balanced df: ", num_cols)

Number of rows in balanced df:  1526
Number of columns in balanced df:  31


In [6]:
class_counts = df_resampled['Class'].value_counts()

print(class_counts)

0    763
1    763
Name: Class, dtype: int64


In [7]:
shuffled_df = df_resampled.sample(frac=1, random_state=1).reset_index(drop=True)
shuffled_df.to_csv('balanced_df.csv')

Performing Simple Random Sampling

In [8]:
import random
n = int(1526/2)

df_srs = pd.read_csv('balanced_df.csv')
df_srs = df_srs.sample(n)
num_rows, num_cols = df_srs.shape
print("Number of rows in Simple Random Sampling df: ", num_rows)
print("Number of columns in Simple Random Sampling df: ", num_cols)
df_srs.to_csv('df_srs.csv')

Number of rows in Simple Random Sampling df:  763
Number of columns in Simple Random Sampling df:  32


Systematic Sampling

In [9]:
n = int(1526/2)
df = pd.read_csv('balanced_df.csv')
k = int(len(df) / n)
start_idx = k // 2  
idx = range(start_idx, len(df), k)
df_sys_s = df.iloc[idx]
df_sys_s.to_csv('df_sys_s.csv')

Clustering Sampling

In [10]:
import pandas as pd
import random

df = pd.read_csv('/content/balanced_df.csv')

sample_size = 2
selected_clusters = random.sample(list(df['Class'].unique()), sample_size)

proportion = 0.5

sampled_df = df[df['Class'].isin(selected_clusters)].groupby('Class').apply(lambda x: x.sample(frac=proportion))


sampled_df.reset_index(drop=True, inplace=True)


sampled_df.to_csv('df_cluster_s.csv')


In [20]:
sampled_df.to_csv('df_convenience_s.csv')

Stratified Sampling

In [11]:
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
df = pd.read_csv('/content/balanced_df.csv')
strat_var = 'Class'
test_size = 0.5
splitter = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=42)
for train_idx, test_idx in splitter.split(df, df[strat_var]):
    train_set = df.loc[train_idx]
    test_set = df.loc[test_idx]
train_set.to_csv('df_stratified_s.csv')

Training 

In [12]:
!pip install pycaret

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:

!pip uninstall scikit-learn -y

!pip install scikit-learn==0.23.2


Running all models and comparing the results

1.On Random Sampled Data

In [14]:
# Import necessary libraries
from pycaret.classification import *
import pandas as pd

# Load the dataset
df=pd.read_csv('df_srs.csv')
df=df.iloc[:,2:]

# Initialize the PyCaret setup
clf = setup(df, target='Class', silent=True,preprocess=False)

# Compare the performance of several models
best_model = compare_models()

# Print the model's performance metrics
print(best_model)



Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.9981,1.0,1.0,0.9964,0.9982,0.9962,0.9963,0.39
et,Extra Trees Classifier,0.9981,1.0,1.0,0.9964,0.9982,0.9962,0.9963,0.177
ada,Ada Boost Classifier,0.9926,1.0,1.0,0.9857,0.9927,0.9851,0.9854,0.242
gbc,Gradient Boosting Classifier,0.9925,1.0,1.0,0.9856,0.9927,0.985,0.9853,0.334
lightgbm,Light Gradient Boosting Machine,0.9925,1.0,1.0,0.986,0.9928,0.985,0.9854,0.165
dt,Decision Tree Classifier,0.9869,0.9868,1.0,0.976,0.9875,0.9738,0.9747,0.031
knn,K Neighbors Classifier,0.957,0.9831,1.0,0.9217,0.9591,0.9139,0.9176,0.04
lr,Logistic Regression,0.927,0.9422,1.0,0.8745,0.9327,0.8539,0.8639,0.532
lda,Linear Discriminant Analysis,0.8728,0.9315,0.9142,0.849,0.8784,0.7455,0.7515,0.019
ridge,Ridge Classifier,0.856,0.0,0.9142,0.8229,0.8647,0.7117,0.7192,0.023


INFO:logs:create_model_container: 14
INFO:logs:master_model_container: 14
INFO:logs:display_container: 2
INFO:logs:RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=8628, verbose=0,
                       warm_start=False)
INFO:logs:compare_models() succesfully completed......................................


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=8628, verbose=0,
                       warm_start=False)


2.On Systematic Sampled Data

In [15]:
# Import necessary libraries
from pycaret.classification import *
import pandas as pd

# Load the dataset
df=pd.read_csv('df_sys_s.csv')
df=df.iloc[:,2:]

# Initialize the PyCaret setup
clf = setup(df, target='Class', silent=True,preprocess=False)

# Compare the performance of several models
best_model = compare_models()

# Print the model's performance metrics
print(best_model)


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.187
rf,Random Forest Classifier,0.9981,1.0,1.0,0.9963,0.9981,0.9963,0.9964,0.409
gbc,Gradient Boosting Classifier,0.9963,1.0,1.0,0.9926,0.9962,0.9926,0.9927,0.303
ada,Ada Boost Classifier,0.9944,1.0,1.0,0.9892,0.9944,0.9889,0.9892,0.251
lightgbm,Light Gradient Boosting Machine,0.9926,1.0,1.0,0.9854,0.9925,0.9852,0.9856,0.102
dt,Decision Tree Classifier,0.9851,0.9856,1.0,0.9713,0.9851,0.9702,0.9712,0.028
knn,K Neighbors Classifier,0.9549,0.9871,1.0,0.9175,0.9564,0.91,0.9148,0.043
lr,Logistic Regression,0.927,0.9473,1.0,0.8727,0.9311,0.8546,0.8654,0.237
lda,Linear Discriminant Analysis,0.8706,0.9367,0.9188,0.8332,0.8731,0.7416,0.7472,0.017
ridge,Ridge Classifier,0.8688,0.0,0.9188,0.8331,0.8722,0.7382,0.7452,0.027


INFO:logs:create_model_container: 14
INFO:logs:master_model_container: 14
INFO:logs:display_container: 2
INFO:logs:ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                     oob_score=False, random_state=6041, verbose=0,
                     warm_start=False)
INFO:logs:compare_models() succesfully completed......................................


ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                     oob_score=False, random_state=6041, verbose=0,
                     warm_start=False)


3.On Clustering Sampled Data

In [16]:
# Import necessary libraries
from pycaret.classification import *
import pandas as pd

# Load the dataset
df=pd.read_csv('df_cluster_s.csv')
df=df.iloc[:,2:]

# Initialize the PyCaret setup
clf = setup(df, target='Class', silent=True,preprocess=False)

# Compare the performance of several models
best_model = compare_models()

# Print the model's performance metrics
print(best_model)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.399
et,Extra Trees Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.177
lightgbm,Light Gradient Boosting Machine,0.9963,1.0,1.0,0.993,0.9964,0.9925,0.9926,0.096
gbc,Gradient Boosting Classifier,0.9944,1.0,1.0,0.9897,0.9947,0.9887,0.9891,0.342
ada,Ada Boost Classifier,0.9925,1.0,1.0,0.9861,0.9929,0.9849,0.9853,0.229
dt,Decision Tree Classifier,0.9869,0.9866,1.0,0.9768,0.9878,0.9738,0.975,0.032
knn,K Neighbors Classifier,0.9701,0.9925,1.0,0.9457,0.9717,0.9401,0.9425,0.043
lr,Logistic Regression,0.9137,0.9441,1.0,0.859,0.9231,0.8267,0.8413,0.268
lda,Linear Discriminant Analysis,0.8745,0.9323,0.9601,0.8266,0.8863,0.7484,0.7644,0.016
ridge,Ridge Classifier,0.8708,0.0,0.9601,0.8194,0.8827,0.7409,0.7566,0.022


INFO:logs:create_model_container: 14
INFO:logs:master_model_container: 14
INFO:logs:display_container: 2
INFO:logs:RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=7964, verbose=0,
                       warm_start=False)
INFO:logs:compare_models() succesfully completed......................................


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=7964, verbose=0,
                       warm_start=False)


4.On Stratified Sampled Data

In [17]:
# Import necessary libraries
from pycaret.classification import *
import pandas as pd

# Load the dataset
df=pd.read_csv('df_stratified_s.csv')
df=df.iloc[:,2:]

# Initialize the PyCaret setup
clf = setup(df, target='Class', silent=True,preprocess=False)

# Compare the performance of several models
best_model = compare_models()

# Print the model's performance metrics
print(best_model)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.408
et,Extra Trees Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.186
lightgbm,Light Gradient Boosting Machine,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.099
ada,Ada Boost Classifier,0.9944,0.9962,1.0,0.9892,0.9945,0.9887,0.9889,0.248
gbc,Gradient Boosting Classifier,0.9944,1.0,1.0,0.9893,0.9945,0.9888,0.9891,0.351
dt,Decision Tree Classifier,0.9813,0.9813,1.0,0.9652,0.982,0.9626,0.9639,0.028
knn,K Neighbors Classifier,0.9588,0.9887,1.0,0.9255,0.961,0.9177,0.9214,0.045
lr,Logistic Regression,0.9121,0.9386,1.0,0.8552,0.9209,0.8237,0.8386,0.251
lda,Linear Discriminant Analysis,0.8672,0.9229,0.9107,0.8416,0.8732,0.7341,0.7403,0.019
ridge,Ridge Classifier,0.8597,0.0,0.907,0.8342,0.8669,0.7192,0.7266,0.022


INFO:logs:create_model_container: 14
INFO:logs:master_model_container: 14
INFO:logs:display_container: 2
INFO:logs:RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=2256, verbose=0,
                       warm_start=False)
INFO:logs:compare_models() succesfully completed......................................


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=2256, verbose=0,
                       warm_start=False)


In [21]:
# Import necessary libraries
from pycaret.classification import *
import pandas as pd

# Load the dataset
df=pd.read_csv('df_convenience_s.csv')
df=df.iloc[:,2:]

# Initialize the PyCaret setup
clf = setup(df, target='Class', silent=True,preprocess=False)

# Compare the performance of several models
best_model = compare_models()

# Print the model's performance metrics
print(best_model)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.217
et,Extra Trees Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.296
lightgbm,Light Gradient Boosting Machine,0.9962,1.0,1.0,0.9929,0.9964,0.9924,0.9926,0.178
ada,Ada Boost Classifier,0.9944,1.0,1.0,0.9897,0.9947,0.9887,0.9891,0.159
gbc,Gradient Boosting Classifier,0.9943,1.0,1.0,0.9893,0.9945,0.9887,0.9889,0.452
dt,Decision Tree Classifier,0.9869,0.9867,1.0,0.9755,0.9874,0.9737,0.9744,0.016
knn,K Neighbors Classifier,0.9718,0.9924,1.0,0.9502,0.9737,0.9434,0.9464,0.02
lr,Logistic Regression,0.9175,0.9469,1.0,0.864,0.9263,0.8342,0.8473,0.653
lda,Linear Discriminant Analysis,0.8688,0.9328,0.9229,0.8414,0.8771,0.7368,0.7482,0.029
ridge,Ridge Classifier,0.8538,0.0,0.9229,0.8175,0.8647,0.7065,0.7186,0.012


INFO:logs:create_model_container: 14
INFO:logs:master_model_container: 14
INFO:logs:display_container: 2
INFO:logs:RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=4353, verbose=0,
                       warm_start=False)
INFO:logs:compare_models() succesfully completed......................................


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=4353, verbose=0,
                       warm_start=False)
