In [46]:
import pandas as pd
import numpy as np

In [47]:
# Load the labelled clustering output and discard the customer identifier
# column in the same expression, so no in-place mutation is needed.
data = pd.read_csv('Clustering_Output_Python.csv').drop(columns='CUST_ID')

In [48]:
# Preview the first five rows to check which columns were loaded.
data.head(n=5)

Unnamed: 0,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE,Group
0,40.900749,0.818182,95.4,0.0,95.4,0.0,0.166667,0.0,0.083333,0.0,0.0,2.0,1000.0,201.802084,139.509787,0.0,12.0,Installment_Purchasers
1,3202.467416,0.909091,0.0,0.0,0.0,6442.945483,0.0,0.0,0.0,0.25,4.0,0.0,7000.0,4103.032597,1072.340217,0.222222,12.0,Withdrawers
2,2495.148862,1.0,773.17,773.17,0.0,0.0,1.0,1.0,0.0,0.0,0.0,12.0,7500.0,622.066742,627.284787,0.0,12.0,One_Off_Purchasers
3,1666.670542,0.636364,1499.0,1499.0,0.0,205.788017,0.083333,0.083333,0.0,0.083333,1.0,1.0,7500.0,0.0,441.852935,0.0,12.0,One_Off_Purchasers
4,817.714335,1.0,16.0,16.0,0.0,0.0,0.083333,0.083333,0.0,0.0,0.0,1.0,1200.0,678.334763,244.791237,0.0,12.0,One_Off_Purchasers


In [49]:
# Split the frame positionally: every column except the last is a feature,
# and the last column is the target label. Both are stored as NumPy arrays.
feature_frame = data.iloc[:, :-1]
target_series = data.iloc[:, -1]
X, y = feature_frame.values, target_series.values

In [50]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

**SCALING**
StandardScaler is used instead of MinMaxScaler: I trained and tested with both, and standard scaling yielded
much better prediction accuracy. There is no universally agreed-upon rule for choosing a scaler, so pick whichever works best for your dataset.

In [51]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply that
# same fitted transform to both splits.
# NOTE: X_train / X_test are overwritten with their scaled versions, so the
# split cell must be re-run before this cell is run again.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

***MODEL***

In [52]:
from sklearn.linear_model import LogisticRegression

# The `multi_class` argument is intentionally not passed: it is deprecated in
# recent scikit-learn releases, and with the default solver a target with
# three or more classes is already fitted as a multinomial model.
# `max_iter` is raised to 2000 to give the solver more iterations to converge.
classifier = LogisticRegression(random_state=42, max_iter=2000)
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=2000, multi_class='multinomial', random_state=42)

No encoding of the target variable is required: scikit-learn classifiers handle multiclass classification natively and encode the labels internally, even when string labels are passed.

In [53]:
y_pred = classifier.predict(X_test)

# Show the predicted labels (left column) next to the true labels (right
# column) as a two-column array.
print(np.column_stack((y_pred, y_test)))

[['One_Off_Purchasers' 'One_Off_Purchasers']
 ['Big_Spenders' 'Big_Spenders']
 ['Installment_Purchasers' 'Installment_Purchasers']
 ...
 ['Big_Spenders' 'Big_Spenders']
 ['Installment_Purchasers' 'Installment_Purchasers']
 ['Big_Spenders' 'Big_Spenders']]


***MODEL EVALUATION USING ACCURACY AND CONFUSION MATRIX***

In [54]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Build and print the confusion matrix for the held-out test split.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Accuracy on the same split; left as the last expression so the notebook
# displays it as the cell output.
accuracy_score(y_true=y_test, y_pred=y_pred)

[[813   0   9   0]
 [  1 673   0   2]
 [  3   0 587   0]
 [  0   1   0 596]]


0.9940409683426443

Since a basic Logistic Regression model already yields an accuracy of about 99.4 percent on the test set, we can accept this model.

Now that the model parameters are fixed, we can retrain on the whole dataset and save the model as a pickle file.

In [55]:
# Fit the deployment scaler on the raw feature columns taken straight from
# `data` (every column except the last, which is the label). Fitting on `X`
# itself would make this cell non-idempotent: on a re-run `X` already holds
# standardised values, so the scaler would learn statistics from scaled data
# and the pickled scaler would no longer match raw inputs.
# `X` is still rebound to the scaled matrix because the following cells
# train and predict on it.
scaler_final = StandardScaler()
X = scaler_final.fit_transform(data.iloc[:, :-1].values)

In [56]:
# Retrain on the full (scaled) dataset with the same settings as the
# evaluated model. `multi_class` is intentionally not passed: it is deprecated
# in recent scikit-learn releases, and with the default solver a target with
# three or more classes is already fitted as a multinomial model.
classifier_final = LogisticRegression(random_state=42, max_iter=2000)
classifier_final.fit(X, y)

LogisticRegression(max_iter=2000, multi_class='multinomial', random_state=42)

**Saving our final scaler and model objects**

In [57]:
import joblib

# Persist the fitted scaler and the fitted classifier as separate files; both
# are needed later to score raw data.
joblib.dump(value=scaler_final, filename='customer_segment_pred_scaler.pkl')
joblib.dump(value=classifier_final, filename='customer_segment_pred_model.pkl')

['customer_segment_pred_model.pkl']

**Reloading model and making single prediction**

In [58]:
# Read both artefacts back from disk (scaler first, then model) to confirm
# that they round-trip through joblib.
loaded_scaler, loaded_model = (
    joblib.load(artifact_path)
    for artifact_path in (
        "customer_segment_pred_scaler.pkl",
        "customer_segment_pred_model.pkl",
    )
)

In [59]:
# `X` was already standardised when the final scaler was fitted, so the row
# is passed to the model directly without applying `loaded_scaler` again.
# Slicing with 1:2 keeps the 2-D (1, n_features) shape the model expects
# without hard-coding the number of feature columns.
loaded_model.predict(X[1:2])

array(['Withdrawers'], dtype=object)

**Final Deployment Code for our predictive model**

Time did not permit preparing a Flask application and deploying it to Heroku.
Instead, I am preparing a .py file that takes in raw unlabelled data, predicts a label for each row using the trained logistic regression
model, and returns the labelled data as output.

In [None]:
from fancyimpute import KNN
import joblib  # needed for joblib.load below when this runs as a standalone script
import pandas as pd
import numpy as np

# Read the raw, unlabelled customer data. An untouched copy is kept so the
# predicted label can be attached to the original columns (CUST_ID included).
data = pd.read_csv('credit-card-data.csv')
data_original = data.copy()
data = data.drop(columns='CUST_ID')

# Apply the KNN imputation algorithm (k = 3) and rebuild the frame with the
# original column names.
data = pd.DataFrame(KNN(k = 3).fit_transform(data), columns = data.columns)

# All remaining columns are used as features.
X = data.values

# Load the scaler and model that were saved after training on the full dataset.
loaded_scaler = joblib.load("customer_segment_pred_scaler.pkl")
loaded_model = joblib.load("customer_segment_pred_model.pkl")

# Apply the training-time scaling before predicting.
X = loaded_scaler.transform(X)

y = loaded_model.predict(X)

# Concatenating labels: the Series is given the original frame's index so each
# prediction lines up with the row it was computed from.
data_label = pd.concat(
    [data_original, pd.Series(y, name = 'GROUP', index = data_original.index)],
    axis=1,
)

data_label.to_csv('data_predicted.csv', index = False)