In [1]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
import pickle
from sklearn import set_config
set_config(display = 'diagram')

In [2]:
df = pd.read_csv("D:\ExcelR_Project\Model_Deployment\Pipeline_df.csv")
df.rename(columns={'Cust_Since(yrs)': 'Cust_Since_yrs'}, inplace=True)
df

Unnamed: 0,Income,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,...,NumStorePurchases,NumWebVisitsMonth,Complain,Response,Age,Cust_Since_yrs,Num_Childrns,AcceptedCmp,Total_amt_spend,KM_Clusters
0,58138,58,635,88,546,172,88,88,3,8,...,4,7,0,1,67,12,0,4,1617,1
1,46344,38,11,1,6,2,1,6,2,1,...,2,5,0,0,70,10,2,0,27,0
2,71613,26,426,49,127,111,21,42,1,8,...,10,4,0,0,59,11,0,0,776,1
3,26646,26,11,4,20,10,3,5,2,2,...,4,6,0,0,40,10,1,2,53,0
4,58293,94,173,43,118,46,27,15,5,5,...,6,5,0,0,43,10,1,2,422,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2029,666666,23,9,14,18,8,1,12,4,3,...,3,6,0,0,47,11,1,2,62,0
2030,61223,46,709,43,182,42,118,247,2,9,...,4,5,0,0,57,11,1,2,1341,2
2031,56981,91,908,48,217,32,12,24,1,2,...,13,6,0,0,43,10,0,0,1241,1
2032,69245,8,428,30,214,80,30,61,2,6,...,10,3,0,0,68,10,1,2,843,1


In [12]:
for column in df.columns:
    print(f'{column} =,{df[column].min(), df[column].max()}')

Income =,(1730, 666666)
Recency =,(0, 99)
MntWines =,(0, 1493)
MntFruits =,(0, 199)
MntMeatProducts =,(0, 1725)
MntFishProducts =,(0, 259)
MntSweetProducts =,(0, 262)
MntGoldProds =,(0, 321)
NumDealsPurchases =,(0, 15)
NumWebPurchases =,(0, 27)
NumCatalogPurchases =,(0, 28)
NumStorePurchases =,(0, 13)
NumWebVisitsMonth =,(0, 20)
Complain =,(0, 1)
Response =,(0, 1)
Age =,(28, 131)
Cust_Since_yrs =,(10, 12)
Num_Childrns =,(0, 3)
AcceptedCmp =,(0, 4)
Total_amt_spend =,(5, 2525)
KM_Clusters =,(0, 2)


In [3]:
df.columns

Index(['Income', 'Recency', 'MntWines', 'MntFruits', 'MntMeatProducts',
       'MntFishProducts', 'MntSweetProducts', 'MntGoldProds',
       'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases',
       'NumStorePurchases', 'NumWebVisitsMonth', 'Complain', 'Response', 'Age',
       'Cust_Since_yrs', 'Num_Childrns', 'AcceptedCmp', 'Total_amt_spend',
       'KM_Clusters'],
      dtype='object')

In [4]:
df.shape

(2034, 21)

In [5]:
x = df.iloc[:, :-1]
y = df['KM_Clusters']

In [6]:
sc = StandardScaler()
x = sc.fit_transform(x)
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=42)
lr = LogisticRegression()
lr.fit(xtrain, ytrain)
ypred = lr.predict(xtest)
print(classification_report(ytest, ypred))
print(confusion_matrix(ytest, ypred))
print(f'Training Accuracy = {lr.score(xtrain, ytrain)}')
print(f'Testing Accuracy = {lr.score(xtest, ytest)}')


              precision    recall  f1-score   support

           0       0.98      0.99      0.99       174
           1       0.96      0.96      0.96       112
           2       0.96      0.94      0.95       121

    accuracy                           0.97       407
   macro avg       0.97      0.97      0.97       407
weighted avg       0.97      0.97      0.97       407

[[173   0   1]
 [  0 108   4]
 [  3   4 114]]
Training Accuracy = 0.9827904118008605
Testing Accuracy = 0.9705159705159705


In [7]:
transformer = ColumnTransformer(transformers = [
    ('tnf1', StandardScaler(), ['Income', 'Recency', 'MntWines', 'MntFruits', 'MntMeatProducts',
       'MntFishProducts', 'MntSweetProducts', 'MntGoldProds',
       'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases',
       'NumStorePurchases', 'NumWebVisitsMonth', 'Complain', 'Response', 'Age',
       'Cust_Since_yrs', 'Num_Childrns', 'AcceptedCmp', 'Total_amt_spend'])
], remainder='passthrough')

In [8]:
xtrain, xtest, ytrain, ytest = train_test_split(df.drop(columns='KM_Clusters'), df['KM_Clusters'], test_size=0.2, random_state=42)


In [9]:
model = Pipeline(steps=[('transofrmer', transformer), ('model', LogisticRegression())])
model.fit(xtrain, ytrain)
# ypred = model.predict(xtest)

In [10]:
pickle.dump(model, open('Lr_Cluster_model.pkl', 'wb'))