## Handling Imbalance in Datasets

### Initialisation

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb

### Import Dataset

In [2]:
df = pd.read_csv('emails.csv')
df.sample(5)

Unnamed: 0,text,spam
4311,Subject: internship opportunities please resp...,0
4058,Subject: re : marketpoint license agreement d...,0
2093,"Subject: re : sms conference yes , i shall b...",0
3446,Subject: re : visit to houston and vince kamin...,0
3131,"Subject: re : summer internship vince , i ap...",0


In [3]:
df1 = df[df['spam']==1]
df1.shape

(1368, 2)

In [4]:
df0 = df[df['spam']==0]
df0.shape

(4360, 2)

### Model Training Function

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import classification_report

def train_model_accuracy(X_train,Y_train,X_test,Y_test):
    vectorizer = CountVectorizer(stop_words=stopwords.words('english'))
    X_train = vectorizer.fit_transform(X_train)
    model = MultinomialNB()
    model.fit(X_train,Y_train)
    X_test = vectorizer.transform(X_test)
    Y_pred = model.predict(X_test)
    print(classification_report(Y_test,Y_pred))

#### Imbalanced Accuracy

In [6]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(df['text'],df['spam'],random_state=1234,test_size=0.2)
train_model_accuracy(X_train,Y_train,X_test,Y_test)

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       874
           1       0.99      0.98      0.98       272

    accuracy                           0.99      1146
   macro avg       0.99      0.99      0.99      1146
weighted avg       0.99      0.99      0.99      1146



### Under-Sampling

In [7]:
count0,count1 = df['spam'].value_counts()
print('Spam: ',count1)
print("Not Spam: ",count0)

Spam:  1368
Not Spam:  4360


In [8]:
df0_under = df0.sample(count1)
df_under = pd.concat([df0_under,df1],axis=0)
X_train,X_test,Y_train,Y_test = train_test_split(df_under['text'],df_under['spam'],random_state=1234,test_size=0.2)
train_model_accuracy(X_train,Y_train,X_test,Y_test)

              precision    recall  f1-score   support

           0       1.00      0.98      0.99       273
           1       0.98      1.00      0.99       275

    accuracy                           0.99       548
   macro avg       0.99      0.99      0.99       548
weighted avg       0.99      0.99      0.99       548



### Over-Sampling

In [9]:
df1_under = df1.sample(count0,replace=True)
df_under = pd.concat([df1_under,df0],axis=0)
X_train,X_test,Y_train,Y_test = train_test_split(df_under['text'],df_under['spam'],random_state=1234,test_size=0.2)
train_model_accuracy(X_train,Y_train,X_test,Y_test)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       884
           1       1.00      1.00      1.00       860

    accuracy                           1.00      1744
   macro avg       1.00      1.00      1.00      1744
weighted avg       1.00      1.00      1.00      1744



### SMOTE

In [20]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(sampling_strategy='minority')
X_train,X_test,Y_train,Y_test = train_test_split(df_under['text'],df_under['spam'],random_state=1234,test_size=0.2)
vectorizer = CountVectorizer(stop_words=stopwords.words('english'))
X_train = vectorizer.fit_transform(X_train)
X_sm,Y_sm = smote.fit_resample(X_train,Y_train)
model = MultinomialNB()
model.fit(X_train,Y_train)
X_test = vectorizer.transform(X_test)
Y_pred = model.predict(X_test)
print(classification_report(Y_test,Y_pred))

spam
0    3500
1    3500
Name: count, dtype: int64