In [1]:
import numpy as np
import pandas as pd 
from collections import Counter
from sklearn.preprocessing import LabelEncoder 
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report ,balanced_accuracy_score
from imblearn.under_sampling import RandomUnderSampler
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

In [2]:
df = pd.read_csv('..\datasets\clean-dataset.csv')

In [3]:
df.shape

(91121, 10)

**check for any nan vlaues and drop all of them**

In [4]:
df.isna().sum()

Unnamed: 0               0
lead_mobile_network      0
method_of_contact        0
ad_group                72
lead_source              0
campaign               780
low_qualified            0
month                    0
year                     0
day                      0
dtype: int64

In [5]:
df.dropna(inplace=True)

# Feature Engineering 
**use label encoding to encode all categorical data into numbers in many times using one hot encoding is better as when u use label encoding model may understand that 4 is big than 2 and we don't need that but one hot encoding will make our data has big number of dimensions so i will use label encoding to save dimensions number and get the advantage of tree based model which can work will with categorical data also don't need to normalize or scaling data**

In [6]:
cat_cols = ['lead_mobile_network', 'method_of_contact', 'ad_group','lead_source', 'campaign']
le = LabelEncoder()
df[cat_cols]=df[cat_cols].apply(le.fit_transform)

In [7]:
df.head()

Unnamed: 0.1,Unnamed: 0,lead_mobile_network,method_of_contact,ad_group,lead_source,campaign,low_qualified,month,year,day
0,0,3,3,3,4,0,0,3,2022,7
1,1,6,2,1,2,2,0,9,2019,16
2,2,6,3,1,4,2,0,5,2022,7
3,3,3,2,1,2,0,0,12,2019,19
4,4,6,3,3,4,1,0,1,2022,27


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 90290 entries, 0 to 91120
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   Unnamed: 0           90290 non-null  int64
 1   lead_mobile_network  90290 non-null  int32
 2   method_of_contact    90290 non-null  int32
 3   ad_group             90290 non-null  int32
 4   lead_source          90290 non-null  int32
 5   campaign             90290 non-null  int32
 6   low_qualified        90290 non-null  int64
 7   month                90290 non-null  int64
 8   year                 90290 non-null  int64
 9   day                  90290 non-null  int64
dtypes: int32(5), int64(5)
memory usage: 5.9 MB


In [9]:
data = df.drop(['low_qualified','Unnamed: 0'],axis=1)
target = df['low_qualified']

In [10]:
target.value_counts()

0    74454
1    15836
Name: low_qualified, dtype: int64

**split data into train an split**

In [11]:
x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=42)

# baseline model
**bulid simple model without any addition **

In [12]:
baseline_dt = DecisionTreeClassifier(random_state=42)
baseline_dt.fit(x_train , y_train)

DecisionTreeClassifier(random_state=42)

In [13]:
baseline_y_pred= baseline_dt.predict(x_test)
print(classification_report(baseline_y_pred, y_test))

              precision    recall  f1-score   support

           0       0.93      0.84      0.88     16348
           1       0.20      0.37      0.26      1710

    accuracy                           0.80     18058
   macro avg       0.56      0.61      0.57     18058
weighted avg       0.86      0.80      0.82     18058



In [14]:
basline_rf = RandomForestClassifier(random_state=42)
basline_rf.fit(x_train , y_train)

RandomForestClassifier(random_state=42)

In [15]:
y_pred= basline_rf.predict(x_test)
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.93      0.85      0.89     16368
           1       0.21      0.40      0.28      1690

    accuracy                           0.80     18058
   macro avg       0.57      0.62      0.58     18058
weighted avg       0.86      0.80      0.83     18058



# try sampling methids :


*   **under sampling**
*   **over sampling using smote**
*   **try to make mix between two methods (doing over by ratio then make under sample )**







In [16]:
rus = RandomUnderSampler(random_state=42)
x_resampled, y_resampled = rus.fit_resample(x_train, y_train)
print(sorted(Counter(y_resampled).items()))

[(0, 12640), (1, 12640)]


In [17]:
undersample_dt = DecisionTreeClassifier()
undersample_dt.fit(x_resampled , y_resampled)
y_pred= undersample_dt.predict(x_test)
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.63      0.87      0.73     10745
           1       0.57      0.25      0.35      7313

    accuracy                           0.62     18058
   macro avg       0.60      0.56      0.54     18058
weighted avg       0.61      0.62      0.58     18058



In [18]:
undersample_rf = RandomForestClassifier()
undersample_rf.fit(x_resampled , y_resampled)
y_pred= undersample_rf.predict(x_test)
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.61      0.88      0.73     10336
           1       0.62      0.26      0.37      7722

    accuracy                           0.62     18058
   macro avg       0.62      0.57      0.55     18058
weighted avg       0.62      0.62      0.57     18058



**SMOTE**

In [19]:
oversample = SMOTE()
x_s, y_s = oversample.fit_resample(x_train, y_train)

In [20]:
oversample_dt = DecisionTreeClassifier()
oversample_dt.fit(x_s , y_s)
y_pred= oversample_dt.predict(x_test)
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.72      0.87      0.79     12264
           1       0.49      0.27      0.35      5794

    accuracy                           0.68     18058
   macro avg       0.61      0.57      0.57     18058
weighted avg       0.65      0.68      0.65     18058



In [21]:
oversample_rf = RandomForestClassifier()
oversample_rf.fit(x_s , y_s)
y_pred= oversample_rf.predict(x_test)
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.71      0.87      0.78     12109
           1       0.51      0.27      0.35      5949

    accuracy                           0.67     18058
   macro avg       0.61      0.57      0.57     18058
weighted avg       0.64      0.67      0.64     18058



**try both over and under sampling**

In [23]:
over = SMOTE(sampling_strategy=0.5)
under = RandomUnderSampler()
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
x_pipe, y_pipe = pipeline.fit_resample(x_train, y_train)
Counter(y_pipe)

Counter({0: 29796, 1: 29796})

In [24]:
dt = DecisionTreeClassifier()
dt.fit(x_pipe , y_pipe)
y_pred= dt.predict(x_test)
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.68      0.87      0.76     11743
           1       0.51      0.26      0.34      6315

    accuracy                           0.65     18058
   macro avg       0.60      0.56      0.55     18058
weighted avg       0.62      0.65      0.62     18058



In [25]:
rf = RandomForestClassifier()
rf.fit(x_pipe , y_pipe)
y_pred= rf.predict(x_test)
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.68      0.87      0.76     11526
           1       0.53      0.26      0.35      6532

    accuracy                           0.65     18058
   macro avg       0.60      0.57      0.56     18058
weighted avg       0.62      0.65      0.61     18058



# class wieght

In [26]:
rf = RandomForestClassifier(random_state=42, class_weight='balanced')
rf.fit(x_train , y_train)
y_pred= rf.predict(x_test)
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.78      0.87      0.82     13348
           1       0.44      0.30      0.36      4710

    accuracy                           0.72     18058
   macro avg       0.61      0.58      0.59     18058
weighted avg       0.69      0.72      0.70     18058



# BalancedRandomForestClassifier


In [27]:
brf = BalancedRandomForestClassifier()
brf.fit(x_train, y_train) 
y_pred = brf.predict(x_test)
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.61      0.89      0.72     10116
           1       0.65      0.26      0.37      7942

    accuracy                           0.61     18058
   macro avg       0.63      0.58      0.55     18058
weighted avg       0.63      0.61      0.57     18058



# Threshold moving

In [28]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier()
rf_model.fit(x_train,y_train)   
rf_model.predict_proba(x_test)

array([[0.80833333, 0.19166667],
       [0.85932445, 0.14067555],
       [0.99571429, 0.00428571],
       ...,
       [1.        , 0.        ],
       [0.26858888, 0.73141112],
       [0.98833333, 0.01166667]])

In [30]:
from sklearn.metrics import roc_auc_score
threshold_value = 0.2 
roc_score=0
predicted_proba = rf_model.predict_proba(x_test) #probability of prediction
while threshold_value <=0.8: #continue to check best threshold upto probability 0.8
    temp_thresh = threshold_value
    predicted = (predicted_proba [:,1] >= temp_thresh).astype('int') #change the class boundary for prediction
    print('Threshold',temp_thresh,'--',roc_auc_score(y_test, predicted))
    if roc_score<roc_auc_score(y_test, predicted): #store the threshold for best classification
        roc_score = roc_auc_score(y_test, predicted)
        thrsh_score = threshold_value
    threshold_value = threshold_value + 0.05
print('---Optimum Threshold ---',thrsh_score,'--ROC--',roc_score)

Threshold 0.2 -- 0.6185160674702885
Threshold 0.25 -- 0.6080902584966507
Threshold 0.3 -- 0.6009819121903994
Threshold 0.35 -- 0.5895454072334059
Threshold 0.39999999999999997 -- 0.5832634160012625
Threshold 0.44999999999999996 -- 0.5758544525361318
Threshold 0.49999999999999994 -- 0.5692933393562031
Threshold 0.5499999999999999 -- 0.5645828354276111
Threshold 0.6 -- 0.5598842264983025
Threshold 0.65 -- 0.5534360631788254
Threshold 0.7000000000000001 -- 0.5461687028379069
Threshold 0.7500000000000001 -- 0.539421206598411
---Optimum Threshold --- 0.2 --ROC-- 0.6185160674702885
