In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
from sklearn.linear_model import LogisticRegression

In [3]:
data = pd.read_excel("a1_Dataset_10Percent.xlsx")

In [4]:
data

Unnamed: 0,ID,DemAffl,DemAge,DemClusterGroup,DemGender,DemReg,DemTVReg,LoyalClass,LoyalSpend,LoyalTime,TargetBuy
0,17147654,5.0,,,,,,Tin,0.01,5.0,0
1,8415498,15.0,,,M,,,Gold,8000.00,5.0,1
2,12107603,,,,M,Midlands,East,Tin,0.01,,1
3,14400995,8.0,28.0,,F,,,Tin,0.01,,1
4,28724674,14.0,67.0,,,,,Tin,0.01,7.0,0
...,...,...,...,...,...,...,...,...,...,...,...
22218,52830893,13.0,49.0,B,M,South East,London,Silver,500.00,9.0,0
22219,52834058,13.0,65.0,B,F,South East,London,Silver,1500.00,5.0,0
22220,52834376,15.0,73.0,D,U,South East,S & S East,Gold,6053.06,12.0,0
22221,52837057,9.0,70.0,B,F,North,Yorkshire,Gold,6000.00,5.0,0


## Data Preparation

In [5]:
data = data.drop(['ID'],axis = 1)

In [6]:
data.isna().sum()

DemAffl            1085
DemAge             1508
DemClusterGroup     674
DemGender          2512
DemReg              465
DemTVReg            465
LoyalClass            0
LoyalSpend            0
LoyalTime           281
TargetBuy             0
dtype: int64

### Imputing NA values

In [7]:
data['DemAffl'] = data['DemAffl'].fillna(data['DemAffl'].mode()[0])
data['DemAge'] = data['DemAge'].fillna(data['DemAge'].mode()[0])
data['DemClusterGroup'] = data['DemClusterGroup'].fillna(data['DemClusterGroup'].mode()[0])
data['DemGender'] = data['DemGender'].fillna(data['DemGender'].mode()[0])
data['DemReg'] = data['DemReg'].fillna(data['DemReg'].mode()[0])
data['DemTVReg'] = data['DemTVReg'].fillna(data['DemTVReg'].mode()[0])
data['LoyalTime'] = data['LoyalTime'].fillna(data['LoyalTime'].mean())


In [8]:
data

Unnamed: 0,DemAffl,DemAge,DemClusterGroup,DemGender,DemReg,DemTVReg,LoyalClass,LoyalSpend,LoyalTime,TargetBuy
0,5.0,51.0,C,F,South East,London,Tin,0.01,5.00000,0
1,15.0,51.0,C,M,South East,London,Gold,8000.00,5.00000,1
2,8.0,51.0,C,M,Midlands,East,Tin,0.01,6.56467,1
3,8.0,28.0,C,F,South East,London,Tin,0.01,6.56467,1
4,14.0,67.0,C,F,South East,London,Tin,0.01,7.00000,0
...,...,...,...,...,...,...,...,...,...,...
22218,13.0,49.0,B,M,South East,London,Silver,500.00,9.00000,0
22219,13.0,65.0,B,F,South East,London,Silver,1500.00,5.00000,0
22220,15.0,73.0,D,U,South East,S & S East,Gold,6053.06,12.00000,0
22221,9.0,70.0,B,F,North,Yorkshire,Gold,6000.00,5.00000,0


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22223 entries, 0 to 22222
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   DemAffl          22223 non-null  float64
 1   DemAge           22223 non-null  float64
 2   DemClusterGroup  22223 non-null  object 
 3   DemGender        22223 non-null  object 
 4   DemReg           22223 non-null  object 
 5   DemTVReg         22223 non-null  object 
 6   LoyalClass       22223 non-null  object 
 7   LoyalSpend       22223 non-null  float64
 8   LoyalTime        22223 non-null  float64
 9   TargetBuy        22223 non-null  int64  
dtypes: float64(4), int64(1), object(5)
memory usage: 1.7+ MB


## Label Encoding the Categorical Variables

In [10]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [11]:
cat_var = ['DemClusterGroup','DemGender','DemReg','DemTVReg','LoyalClass']
for i in cat_var:
    print(i)
    data[i]=le.fit_transform(data[i].astype('str'))

DemClusterGroup
DemGender
DemReg
DemTVReg
LoyalClass


In [12]:
data

Unnamed: 0,DemAffl,DemAge,DemClusterGroup,DemGender,DemReg,DemTVReg,LoyalClass,LoyalSpend,LoyalTime,TargetBuy
0,5.0,51.0,2,0,3,3,3,0.01,5.00000,0
1,15.0,51.0,2,1,3,3,0,8000.00,5.00000,1
2,8.0,51.0,2,1,0,2,3,0.01,6.56467,1
3,8.0,28.0,2,0,3,3,3,0.01,6.56467,1
4,14.0,67.0,2,0,3,3,3,0.01,7.00000,0
...,...,...,...,...,...,...,...,...,...,...
22218,13.0,49.0,1,1,3,3,2,500.00,9.00000,0
22219,13.0,65.0,1,0,3,3,2,1500.00,5.00000,0
22220,15.0,73.0,3,2,3,8,0,6053.06,12.00000,0
22221,9.0,70.0,1,0,1,12,0,6000.00,5.00000,0


## Checking for MultiCollinearity

In [13]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
def vif_cal(z):
    vif = pd.DataFrame()
    vif['variables'] = z.columns
    vif['VIF'] = [variance_inflation_factor(z.values,i) for i in range(z.shape[1])]
    
    return (vif)

In [14]:
z = data.iloc[:,0:9]
z

Unnamed: 0,DemAffl,DemAge,DemClusterGroup,DemGender,DemReg,DemTVReg,LoyalClass,LoyalSpend,LoyalTime
0,5.0,51.0,2,0,3,3,3,0.01,5.00000
1,15.0,51.0,2,1,3,3,0,8000.00,5.00000
2,8.0,51.0,2,1,0,2,3,0.01,6.56467
3,8.0,28.0,2,0,3,3,3,0.01,6.56467
4,14.0,67.0,2,0,3,3,3,0.01,7.00000
...,...,...,...,...,...,...,...,...,...
22218,13.0,49.0,1,1,3,3,2,500.00,9.00000
22219,13.0,65.0,1,0,3,3,2,1500.00,5.00000
22220,15.0,73.0,3,2,3,8,0,6053.06,12.00000
22221,9.0,70.0,1,0,1,12,0,6000.00,5.00000


In [15]:
vif_cal(z)

Unnamed: 0,variables,VIF
0,DemAffl,6.27863
1,DemAge,10.734656
2,DemClusterGroup,3.659632
3,DemGender,1.435472
4,DemReg,2.474645
5,DemTVReg,3.752279
6,LoyalClass,3.851766
7,LoyalSpend,1.863196
8,LoyalTime,3.153032


## Variable

In [16]:
x = data.iloc[:,0:9].values
y = data.iloc[:,9].values

In [17]:
X_train,X_test,y_train,y_test = train_test_split(x,y,test_size = 0.2,random_state = 0)

In [18]:
classifier = LogisticRegression(max_iter = 200)
classifier.fit(X_train,y_train)
y_pred = classifier.predict(X_test)

## Saving model

In [19]:
import joblib
joblib.dump(classifier,'./c2_Classifier_LoyalCustomers')

['./c2_Classifier_LoyalCustomers']

In [20]:
confusion_matrix(y_test,y_pred)

array([[3191,  176],
       [ 688,  390]])

In [21]:
accuracy_score(y_test,y_pred)

0.8056242969628796

In [22]:
X_test
predictions = classifier.predict_proba(X_test)

In [23]:
predictions


array([[0.86209536, 0.13790464],
       [0.64489359, 0.35510641],
       [0.48639035, 0.51360965],
       ...,
       [0.88574602, 0.11425398],
       [0.90523574, 0.09476426],
       [0.873809  , 0.126191  ]])

# Prediction on New Data

In [24]:
data = pd.read_excel("a2_Dataset_90Percent.xlsx")

In [25]:
data['DemAffl'] = data['DemAffl'].fillna(data['DemAffl'].mode()[0])
data['DemAge'] = data['DemAge'].fillna(data['DemAge'].mode()[0])
data['DemClusterGroup'] = data['DemClusterGroup'].fillna(data['DemClusterGroup'].mode()[0])
data['DemGender'] = data['DemGender'].fillna(data['DemGender'].mode()[0])
data['DemReg'] = data['DemReg'].fillna(data['DemReg'].mode()[0])
data['DemTVReg'] = data['DemTVReg'].fillna(data['DemTVReg'].mode()[0])
data['LoyalTime'] = data['LoyalTime'].fillna(data['LoyalTime'].mean())


In [26]:
cat_var = ['DemClusterGroup','DemGender','DemReg','DemTVReg','LoyalClass']
for i in cat_var:
    print(i)
    data[i]=le.fit_transform(data[i].astype('str'))

DemClusterGroup
DemGender
DemReg
DemTVReg
LoyalClass


In [27]:
data.head()

Unnamed: 0,ID,DemAffl,DemAge,DemClusterGroup,DemGender,DemReg,DemTVReg,LoyalClass,LoyalSpend,LoyalTime
0,140,10.0,76.0,2,2,0,11,0,16000.0,4.0
1,620,4.0,49.0,3,2,0,11,0,6000.0,5.0
2,868,5.0,70.0,3,0,0,11,2,0.02,8.0
3,1120,10.0,65.0,5,1,0,4,3,0.01,7.0
4,2313,11.0,68.0,0,0,0,4,3,0.01,8.0


In [32]:
X_new = data.iloc[:,1:10]

In [33]:
X_new


Unnamed: 0,DemAffl,DemAge,DemClusterGroup,DemGender,DemReg,DemTVReg,LoyalClass,LoyalSpend,LoyalTime
0,10.0,76.0,2,2,0,11,0,16000.00,4.0
1,4.0,49.0,3,2,0,11,0,6000.00,5.0
2,5.0,70.0,3,0,0,11,2,0.02,8.0
3,10.0,65.0,5,1,0,4,3,0.01,7.0
4,11.0,68.0,0,0,0,4,3,0.01,8.0
...,...,...,...,...,...,...,...,...,...
994,6.0,51.0,4,0,0,2,3,0.01,8.0
995,9.0,57.0,4,1,3,3,2,3000.00,1.0
996,4.0,77.0,4,0,3,3,0,6035.46,9.0
997,8.0,53.0,3,0,0,4,3,0.01,5.0


# Importing Model

In [34]:
import joblib
classifier = joblib.load('c2_Classifier_LoyalCustomers')

In [35]:
y_pred = classifier.predict(X_new)



In [36]:
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0,

In [37]:
pred_prob = classifier.predict_proba(X_new)



In [38]:
pred_prob

array([[0.97526212, 0.02473788],
       [0.96861531, 0.03138469],
       [0.94438621, 0.05561379],
       ...,
       [0.96587463, 0.03412537],
       [0.76745656, 0.23254344],
       [0.49863356, 0.50136644]])

In [44]:
df_pred = pd.DataFrame(pred_prob, columns=['prob_0','prob_1'])
data_with_pred = pd.concat([data,df_pred],axis=1)
data_with_pred.to_excel("Buy_Prob_90Percent.xlsx")

In [45]:
df_pred

Unnamed: 0,prob_0,prob_1
0,0.975262,0.024738
1,0.968615,0.031385
2,0.944386,0.055614
3,0.889889,0.110111
4,0.804733,0.195267
...,...,...
994,0.814631,0.185369
995,0.878997,0.121003
996,0.965875,0.034125
997,0.767457,0.232543
