##### Imbalanced Dataset
    *   Humidity and Temparature - as your feature - 
    *   total data of 200 days
    *   150 day data , is consisting of a rain output
    *   50 days data , is sunny in nature

In [109]:
import pandas as pd 
import numpy as np 

In [110]:
df = pd.read_csv(r'imbalanced.csv')

In [111]:
df.head()

Unnamed: 0.1,Unnamed: 0,Feature1,Feature2,Label
0,0,-0.122893,0.11047,0
1,1,0.336059,-1.332262,1
2,2,1.404399,1.416943,0
3,3,1.688211,1.124311,0
4,4,1.211533,1.00966,0


In [112]:
df['Label'].value_counts()

0    950
1     50
Name: Label, dtype: int64

In [113]:
df.drop('Unnamed: 0',axis=1,inplace=True)

In [114]:
df.head()

Unnamed: 0,Feature1,Feature2,Label
0,-0.122893,0.11047,0
1,0.336059,-1.332262,1
2,1.404399,1.416943,0
3,1.688211,1.124311,0
4,1.211533,1.00966,0


In [115]:
# random over sampler
from imblearn.over_sampling import RandomOverSampler
print("Class distribution before performing randomover sampling")
print(df['Label'].value_counts())

Class distribution before performing randomover sampling
0    950
1     50
Name: Label, dtype: int64


In [116]:
from sklearn.datasets import make_classification

In [117]:
X , y = make_classification(
    n_samples= 1000,
    n_features= 2,
    n_informative= 2,
    n_redundant= 0,
    n_clusters_per_class=1,
    weights=[0.95],
    random_state=1
)


In [118]:
df = pd.DataFrame(X , columns=['Feature1','Feature2'])
df['Label'] = y

In [119]:
df.head()

Unnamed: 0,Feature1,Feature2,Label
0,1.53683,-1.398694,1
1,1.551108,1.810329,0
2,1.293619,1.010946,0
3,1.119889,1.632518,0
4,1.042356,1.121529,0


In [120]:
df['Label'].value_counts()

0    943
1     57
Name: Label, dtype: int64

In [121]:
print("Class distribution before performing randomover sampling")
print(df['Label'].value_counts())

Class distribution before performing randomover sampling
0    943
1     57
Name: Label, dtype: int64


In [122]:
ros = RandomOverSampler(random_state=42)
X_resampled , y_resampled = ros.fit_resample(X,y)

In [123]:
df_resampled = pd.DataFrame(X_resampled,columns=['Feature1','Feature2'])
df_resampled['Label'] = y_resampled

In [124]:
print('The class distribution after performing Random Over sampling')
print(df_resampled['Label'].value_counts())

The class distribution after performing Random Over sampling
1    943
0    943
Name: Label, dtype: int64


In [125]:
print("Class distribution before performing randomover sampling")
print(df['Label'].value_counts())
print("**** Performing Random Over Sampling *****")
print('The class distribution after performing Random Over sampling')
print(df_resampled['Label'].value_counts())


Class distribution before performing randomover sampling
0    943
1     57
Name: Label, dtype: int64
**** Performing Random Over Sampling *****
The class distribution after performing Random Over sampling
1    943
0    943
Name: Label, dtype: int64


In [126]:
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
xtrain , xtest , ytrain , ytest = train_test_split(X_resampled,y_resampled,test_size=0.2,random_state=42)
lr_model = LogisticRegression(random_state=42)
lr_model.fit(xtrain,ytrain)
lr_predictions = lr_model.predict(xtest)
accuracy = accuracy_score(ytest,lr_predictions)
report = classification_report(ytest,lr_predictions)
matrix = confusion_matrix(ytest,lr_predictions)

print('Accuracy Score')
print(accuracy)
print('Classification report')
print(report)
print('Confusion Matrix')
print(matrix)

Accuracy Score
0.8915343915343915
Classification report
              precision    recall  f1-score   support

           0       0.86      0.94      0.90       190
           1       0.93      0.84      0.89       188

    accuracy                           0.89       378
   macro avg       0.90      0.89      0.89       378
weighted avg       0.90      0.89      0.89       378

Confusion Matrix
[[179  11]
 [ 30 158]]


In [127]:
from imblearn.under_sampling import RandomUnderSampler

In [128]:
rus = RandomUnderSampler(random_state=42)
X_resampled ,y_resampled = rus.fit_resample(X,y)


In [129]:
df_rus_resampled = pd.DataFrame(X_resampled,columns=['Feature1','Feature2'])
df_rus_resampled['Label'] = y_resampled

In [130]:
print("Class distribution before performing random under sampling")
print(df['Label'].value_counts())
print("**** Performing Random Over Sampling *****")
print('The class distribution after performing Random under sampling')
print(df_rus_resampled['Label'].value_counts())

Class distribution before performing random under sampling
0    943
1     57
Name: Label, dtype: int64
**** Performing Random Over Sampling *****
The class distribution after performing Random under sampling
0    57
1    57
Name: Label, dtype: int64


In [131]:
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
xtrain , xtest , ytrain , ytest = train_test_split(X_resampled,y_resampled,test_size=0.2,random_state=42)
lr_model = LogisticRegression(random_state=42)
lr_model.fit(xtrain,ytrain)
lr_predictions = lr_model.predict(xtest)
accuracy = accuracy_score(ytest,lr_predictions)
report = classification_report(ytest,lr_predictions)
matrix = confusion_matrix(ytest,lr_predictions)

print('Accuracy Score')
print(accuracy)
print('Classification report')
print(report)
print('Confusion Matrix')
print(matrix)

Accuracy Score
0.8695652173913043
Classification report
              precision    recall  f1-score   support

           0       0.91      0.83      0.87        12
           1       0.83      0.91      0.87        11

    accuracy                           0.87        23
   macro avg       0.87      0.87      0.87        23
weighted avg       0.87      0.87      0.87        23

Confusion Matrix
[[10  2]
 [ 1 10]]


In [132]:
# Neighbor based sampling - This combines both over sampling and under sampling 
from imblearn.combine import SMOTEENN

In [133]:
smote_enn = SMOTEENN(random_state=42)
X_resampled ,y_resampled = smote_enn.fit_resample(X,y)


In [134]:
df_smoteen_resampled = pd.DataFrame(X_resampled,columns=['Feature1','Feature2'])
df_smoteen_resampled['Label'] = y_resampled

In [135]:
print("Class distribution before performing smoteenn sampling")
print(df['Label'].value_counts())
print("**** Performing smoteen Sampling *****")
print('The class distribution after performing smoteen sampling')
print(df_smoteen_resampled['Label'].value_counts())

Class distribution before performing smoteenn sampling
0    943
1     57
Name: Label, dtype: int64
**** Performing smoteen Sampling *****
The class distribution after performing smoteen sampling
1    802
0    713
Name: Label, dtype: int64


In [136]:
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
xtrain , xtest , ytrain , ytest = train_test_split(X_resampled,y_resampled,test_size=0.2,random_state=42)
lr_model = LogisticRegression(random_state=42)
lr_model.fit(xtrain,ytrain)
lr_predictions = lr_model.predict(xtest)
accuracy = accuracy_score(ytest,lr_predictions)
report = classification_report(ytest,lr_predictions)
matrix = confusion_matrix(ytest,lr_predictions)

print('Accuracy Score')
print(accuracy)
print('Classification report')
print(report)
print('Confusion Matrix')
print(matrix)

Accuracy Score
0.976897689768977
Classification report
              precision    recall  f1-score   support

           0       0.99      0.96      0.97       137
           1       0.97      0.99      0.98       166

    accuracy                           0.98       303
   macro avg       0.98      0.98      0.98       303
weighted avg       0.98      0.98      0.98       303

Confusion Matrix
[[132   5]
 [  2 164]]


    Random Over sampling - f1 , f2 - outputs
                            f1 , f2 - output
    synthetic sampling    - f1 (weightage) , f2(weightage) - output 


In [137]:
from imblearn.over_sampling import SMOTE

In [138]:
smote = SMOTE(random_state=42)
X_resampled ,y_resampled = smote.fit_resample(X,y)

In [139]:
df_smote_resampled = pd.DataFrame(X_resampled,columns=['Feature1','Feature2'])
df_smote_resampled['Label'] = y_resampled

In [140]:
print("Class distribution before performing synthetic sampling")
print(df['Label'].value_counts())
print("**** Performing synthetic Sampling *****")
print('The class distribution after performing synthetic sampling')
print(df_smote_resampled['Label'].value_counts())

Class distribution before performing synthetic sampling
0    943
1     57
Name: Label, dtype: int64
**** Performing synthetic Sampling *****
The class distribution after performing synthetic sampling
1    943
0    943
Name: Label, dtype: int64


In [141]:
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
xtrain , xtest , ytrain , ytest = train_test_split(X_resampled,y_resampled,test_size=0.2,random_state=42)
lr_model = LogisticRegression(random_state=42)
lr_model.fit(xtrain,ytrain)
lr_predictions = lr_model.predict(xtest)
accuracy = accuracy_score(ytest,lr_predictions)
report = classification_report(ytest,lr_predictions)
matrix = confusion_matrix(ytest,lr_predictions)

print('Accuracy Score')
print(accuracy)
print('Classification report')
print(report)
print('Confusion Matrix')
print(matrix)

Accuracy Score
0.91005291005291
Classification report
              precision    recall  f1-score   support

           0       0.89      0.94      0.91       190
           1       0.93      0.88      0.91       188

    accuracy                           0.91       378
   macro avg       0.91      0.91      0.91       378
weighted avg       0.91      0.91      0.91       378

Confusion Matrix
[[178  12]
 [ 22 166]]
