# Logistic Regression Model

In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [3]:
sm = pd.read_csv(r"C:\Users\Suraj\OneDrive\Desktop\Python Dataset\modeldata.csv")

In [11]:
sm.columns

Index(['skill', 'job', 'purpose', 'target_group', 'business_group',
       'business_segment', 'signup_source', 'signup_category',
       'first_cc_subscription_dts', 'country_code', 'product_id',
       'download_attempts', 'product_platforms', 'machines', 'label'],
      dtype='object')

In [13]:
sm.shape

(308948, 15)

In [9]:
sm = sm.drop(['error_codes'],axis = 1)
sm = sm.drop(['id'],axis = 1)
sm = sm.drop(['extract_date'],axis = 1)

In [15]:
sm.business_segment.fillna('INDIVIDUAL' ,inplace = True)
sm.job.fillna('STUDENT' ,inplace = True)
sm.purpose.fillna('ME_PROFESSIONAL' ,inplace = True)
sm.product_id.fillna('APRO' ,inplace = True)
sm.download_attempts.fillna(1 ,inplace = True)
sm.product_platforms.fillna('WIN' ,inplace = True)
sm.machines.fillna(1,inplace = True)

In [17]:
sm.isnull().sum()[sm.isnull().sum()>0]

Series([], dtype: int64)

In [27]:
sm.select_dtypes(include = 'object').columns

Index([], dtype='object')

In [29]:
sm.skill.value_counts()

skill
0    131207
4    102651
3     32897
2     22926
1     19267
Name: count, dtype: int64

In [23]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [24]:
sm[sm.select_dtypes(include='object').columns]=sm[sm.select_dtypes(include='object').columns].apply(le.fit_transform)

In [31]:
from sklearn.model_selection import train_test_split

In [33]:
sm_train , sm_test = train_test_split(sm,test_size= .2,random_state = 450)

In [69]:
sm_train.label.value_counts()

label
0    233827
1    176648
Name: count, dtype: int64

In [67]:
df0 = sm_train[sm_train.label == 1]
sm_train = pd.concat([sm_train , df0,df0.iloc[:10000]])

In [71]:
sm_train_x = sm_train.iloc[ : ,0:-1 ]
sm_train_y = sm_train.iloc[ : ,-1]
sm_test_x = sm_test.iloc[ : ,0:-1]
sm_test_y = sm_test.iloc[ : ,-1]

In [73]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(sm_train_x , sm_train_y)

In [74]:
logreg.predict(sm_test_x)

array([0, 1, 0, ..., 0, 0, 0], dtype=int64)

In [77]:
pred = logreg.predict(sm_test_x)

In [79]:
from sklearn.metrics import confusion_matrix ,accuracy_score ,recall_score,precision_score,f1_score

In [81]:
tab1 = confusion_matrix(sm_test_y , pred)
tab1

array([[49262,  9280],
       [ 2445,   803]], dtype=int64)

In [83]:
accuracy_score(sm_test_y , pred)*100

81.02443761126395

In [85]:
recall_score(sm_test_y , pred)*100

24.722906403940886

In [87]:
precision_score(sm_test_y , pred)*100

7.96389963304572

In [89]:
f1_score(sm_test_y , pred)*100

12.04710824394269

In [91]:
9280/ (9280 + 49262)*100

15.851867035632539

# Decision Tree

In [93]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(criterion='entropy',min_samples_split =100,max_depth = 4)
dt.fit(sm_train_x , sm_train_y)

In [95]:
pred_dt = dt.predict(sm_test_x)
from sklearn.metrics import confusion_matrix
tab_dt = confusion_matrix(sm_test_y , pred_dt)
tab_dt

array([[48615,  9927],
       [ 1930,  1318]], dtype=int64)

In [97]:
tab_dt.diagonal().sum()/tab_dt.sum()*100

80.8108108108108

In [99]:
from sklearn.metrics import classification_report

In [101]:
print(classification_report(sm_test_y,pred_dt))

              precision    recall  f1-score   support

           0       0.96      0.83      0.89     58542
           1       0.12      0.41      0.18      3248

    accuracy                           0.81     61790
   macro avg       0.54      0.62      0.54     61790
weighted avg       0.92      0.81      0.85     61790



In [103]:
dt.feature_importances_

array([0.66528369, 0.10058355, 0.        , 0.04213684, 0.        ,
       0.0277691 , 0.        , 0.08030869, 0.01873071, 0.        ,
       0.        , 0.0205862 , 0.0060625 , 0.03853872])

In [105]:
feat_imp = pd.DataFrame()
feat_imp['Features'] = sm_train_x.columns
feat_imp['Imp'] = dt.feature_importances_

In [107]:
feat_imp.sort_values('Imp',ascending = False)

Unnamed: 0,Features,Imp
0,skill,0.665284
1,job,0.100584
7,signup_category,0.080309
3,target_group,0.042137
13,machines,0.038539
5,business_segment,0.027769
11,download_attempts,0.020586
8,first_cc_subscription_dts,0.018731
12,product_platforms,0.006062
2,purpose,0.0


In [109]:
search_dict = {"criterion" : ["gini","entropy"],
               "max_depth": range(3,10),
               "min_samples_split": (25,50,75,100,125)}

In [111]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(dt, param_grid = search_dict)

In [113]:
grid.fit(sm_train_x,sm_train_y)

In [115]:
grid.best_params_

{'criterion': 'gini', 'max_depth': 9, 'min_samples_split': 25}

In [117]:
pred_grid = grid.predict(sm_test_x)

In [119]:
from sklearn.metrics import confusion_matrix,accuracy_score

In [121]:
tab = confusion_matrix(sm_test_y,pred_grid)
tab

array([[45578, 12964],
       [ 1621,  1627]], dtype=int64)

In [123]:
accuracy_score(sm_test_y,pred_grid)*100

76.39585693477909

In [125]:
dt.feature_importances_

array([0.66528369, 0.10058355, 0.        , 0.04213684, 0.        ,
       0.0277691 , 0.        , 0.08030869, 0.01873071, 0.        ,
       0.        , 0.0205862 , 0.0060625 , 0.03853872])

In [127]:
feat_imp = pd.DataFrame()
feat_imp['Features'] = sm_train_x.columns
feat_imp['Imp'] = dt.feature_importances_

In [129]:
feat_imp = feat_imp.sort_values('Imp',ascending = False)

In [131]:
feat_imp

Unnamed: 0,Features,Imp
0,skill,0.665284
1,job,0.100584
7,signup_category,0.080309
3,target_group,0.042137
13,machines,0.038539
5,business_segment,0.027769
11,download_attempts,0.020586
8,first_cc_subscription_dts,0.018731
12,product_platforms,0.006062
2,purpose,0.0


In [133]:
l1 = list(feat_imp[feat_imp.Imp>.01].Features)
l1

['skill',
 'job',
 'signup_category',
 'target_group',
 'machines',
 'business_segment',
 'download_attempts',
 'first_cc_subscription_dts']

In [135]:
l1.insert(0,'label')

In [137]:
l1

['label',
 'skill',
 'job',
 'signup_category',
 'target_group',
 'machines',
 'business_segment',
 'download_attempts',
 'first_cc_subscription_dts']

In [139]:
sm = pd.read_csv(r"C:\Users\Suraj\OneDrive\Desktop\Python Dataset\modeldata.csv")
sm.shape

(308948, 18)

In [141]:
sm=sm.loc[:,l1]
sm.shape

(308948, 9)

In [147]:
sm.isnull().sum()

label                        0
skill                        0
job                          0
signup_category              0
target_group                 0
machines                     0
business_segment             0
download_attempts            0
first_cc_subscription_dts    0
dtype: int64

In [149]:
sm.label.value_counts()

label
0    292369
1     16579
Name: count, dtype: int64

In [145]:
sm.business_segment.fillna('INDIVIDUAL' ,inplace = True)
sm.job.fillna('STUDENT' ,inplace = True)
sm.download_attempts.fillna(1 ,inplace = True)
sm.machines.fillna(1,inplace = True)

In [151]:
sm[sm.select_dtypes(include='object').columns]=sm[sm.select_dtypes(include='object').columns].apply(le.fit_transform)

In [152]:
# sm = sm.loc[:,l2]

In [155]:
from sklearn.model_selection import train_test_split

In [157]:
sm_train , sm_test = train_test_split(sm,test_size= .2,random_state = 450)

In [167]:
sm_train.label.value_counts()

label
0    233827
1     83324
Name: count, dtype: int64

In [169]:
df0 = sm_train[sm_train.label == 1]
sm_train = pd.concat([sm_train , df0,df0.iloc[:10000]])

In [226]:
sm_train_x = sm_train.iloc[ : ,1:: ]
sm_train_y = sm_train.iloc[ : ,0]
sm_test_x = sm_test.iloc[ : ,1::]
sm_test_y = sm_test.iloc[ : ,0]

In [228]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(criterion='entropy',min_samples_split =25,max_depth = 9)
dt.fit(sm_train_x , sm_train_y)

In [230]:
pred_dt = dt.predict(sm_test_x)
from sklearn.metrics import confusion_matrix
tab_dt = confusion_matrix(sm_test_y , pred_dt)
tab_dt

array([[58498,    44],
       [ 3242,     6]], dtype=int64)

In [232]:
tab_dt.diagonal().sum()/tab_dt.sum()

0.9468198737659815

In [234]:
from sklearn.metrics import classification_report

In [236]:
print(classification_report(sm_test_y,pred_dt))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97     58542
           1       0.12      0.00      0.00      3248

    accuracy                           0.95     61790
   macro avg       0.53      0.50      0.49     61790
weighted avg       0.90      0.95      0.92     61790



In [None]:
# Recall:
# For class 0: 0.71
# For class 1: 0.57
# Macro average recall: 0.64
# Weighted average recall: 0.71
# Accuracy: 0.71