# Preferred Payment Method Prediction


### Reg No: IT21134180
### Name: Vihansa S.A.S

<hr/>

<ul>
    <li><b>Target Variable:</b> Payment (Cash, Credit card, Ewallet) </li>
    <li><b>Predictors:</b> Branch, Customer type, Gender, Product line, Quantity, Date, Time, COGS, and Gross income</li>
    <li><b>Objective:</b> Predict the preferred payment method for customers based on their purchase behavior, allowing the business to tailor promotions or offers to encourage the use of specific payment methods.</li>
</ul>




In [104]:
#Importing required libraries
import seaborn as sns
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.svm import NuSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.preprocessing import OneHotEncoder

from datetime import datetime

In [105]:
#Import the dataset 
dataset = pd.read_csv('../dataset/supermarket_sales.csv')

In [106]:
# Visualise the dataset 
dataset

Unnamed: 0,Invoice ID,Branch,City,Customer type,Gender,Product line,Unit price,Quantity,Tax 5%,Total,Date,Time,Payment,cogs,gross margin percentage,gross income,Rating
0,750-67-8428,A,Yangon,Member,Female,Health and beauty,74.69,7,26.1415,548.9715,1/5/2019,13:08,Ewallet,522.83,4.761905,26.1415,9.1
1,226-31-3081,C,Naypyitaw,Normal,Female,Electronic accessories,15.28,5,3.8200,80.2200,3/8/2019,10:29,Cash,76.40,4.761905,3.8200,9.6
2,631-41-3108,A,Yangon,Normal,Male,Home and lifestyle,46.33,7,16.2155,340.5255,3/3/2019,13:23,Credit card,324.31,4.761905,16.2155,7.4
3,123-19-1176,A,Yangon,Member,Male,Health and beauty,58.22,8,23.2880,489.0480,1/27/2019,20:33,Ewallet,465.76,4.761905,23.2880,8.4
4,373-73-7910,A,Yangon,Normal,Male,Sports and travel,86.31,7,30.2085,634.3785,2/8/2019,10:37,Ewallet,604.17,4.761905,30.2085,5.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,233-67-5758,C,Naypyitaw,Normal,Male,Health and beauty,40.35,1,2.0175,42.3675,1/29/2019,13:46,Ewallet,40.35,4.761905,2.0175,6.2
996,303-96-2227,B,Mandalay,Normal,Female,Home and lifestyle,97.38,10,48.6900,1022.4900,3/2/2019,17:16,Ewallet,973.80,4.761905,48.6900,4.4
997,727-02-1313,A,Yangon,Member,Male,Food and beverages,31.84,1,1.5920,33.4320,2/9/2019,13:22,Cash,31.84,4.761905,1.5920,7.7
998,347-56-2442,A,Yangon,Normal,Male,Home and lifestyle,65.82,1,3.2910,69.1110,2/22/2019,15:33,Cash,65.82,4.761905,3.2910,4.1


In [107]:
#Select columns in the dataset 
dataset = dataset[["Branch","Customer type","Gender","Product line","Date","Time","Quantity","cogs","gross income","Payment"]]



In [108]:
dataset 

Unnamed: 0,Branch,Customer type,Gender,Product line,Date,Time,Quantity,cogs,gross income,Payment
0,A,Member,Female,Health and beauty,1/5/2019,13:08,7,522.83,26.1415,Ewallet
1,C,Normal,Female,Electronic accessories,3/8/2019,10:29,5,76.40,3.8200,Cash
2,A,Normal,Male,Home and lifestyle,3/3/2019,13:23,7,324.31,16.2155,Credit card
3,A,Member,Male,Health and beauty,1/27/2019,20:33,8,465.76,23.2880,Ewallet
4,A,Normal,Male,Sports and travel,2/8/2019,10:37,7,604.17,30.2085,Ewallet
...,...,...,...,...,...,...,...,...,...,...
995,C,Normal,Male,Health and beauty,1/29/2019,13:46,1,40.35,2.0175,Ewallet
996,B,Normal,Female,Home and lifestyle,3/2/2019,17:16,10,973.80,48.6900,Ewallet
997,A,Member,Male,Food and beverages,2/9/2019,13:22,1,31.84,1.5920,Cash
998,A,Normal,Male,Home and lifestyle,2/22/2019,15:33,1,65.82,3.2910,Cash


In [109]:
#Checking for Missing values 
dataset.isnull() #It is difficult to identify missing values using this method since the dataset is large.
dataset.isnull().values.any() #Using this command we can scan the entire dataset and get a verification whether there are missing values or not 

False

In [110]:
#Encoding the data into numerical values [Label Encoding]

#Getting the count of labels available in the relevant columns 
dataset["Branch"].value_counts()

Branch
A    340
B    332
C    328
Name: count, dtype: int64

In [111]:
#Convert Date time which had the data type object as date time
dataset.loc[:, 'Date'] = pd.to_datetime(dataset['Date'], format='%m/%d/%Y')
dataset.loc[:, 'Time'] = pd.to_datetime(dataset['Time'], format='%H:%M')
dataset

Unnamed: 0,Branch,Customer type,Gender,Product line,Date,Time,Quantity,cogs,gross income,Payment
0,A,Member,Female,Health and beauty,2019-01-05 00:00:00,1900-01-01 13:08:00,7,522.83,26.1415,Ewallet
1,C,Normal,Female,Electronic accessories,2019-03-08 00:00:00,1900-01-01 10:29:00,5,76.40,3.8200,Cash
2,A,Normal,Male,Home and lifestyle,2019-03-03 00:00:00,1900-01-01 13:23:00,7,324.31,16.2155,Credit card
3,A,Member,Male,Health and beauty,2019-01-27 00:00:00,1900-01-01 20:33:00,8,465.76,23.2880,Ewallet
4,A,Normal,Male,Sports and travel,2019-02-08 00:00:00,1900-01-01 10:37:00,7,604.17,30.2085,Ewallet
...,...,...,...,...,...,...,...,...,...,...
995,C,Normal,Male,Health and beauty,2019-01-29 00:00:00,1900-01-01 13:46:00,1,40.35,2.0175,Ewallet
996,B,Normal,Female,Home and lifestyle,2019-03-02 00:00:00,1900-01-01 17:16:00,10,973.80,48.6900,Ewallet
997,A,Member,Male,Food and beverages,2019-02-09 00:00:00,1900-01-01 13:22:00,1,31.84,1.5920,Cash
998,A,Normal,Male,Home and lifestyle,2019-02-22 00:00:00,1900-01-01 15:33:00,1,65.82,3.2910,Cash


In [112]:
#Print the data types of the dataset
print(dataset.dtypes)

Branch            object
Customer type     object
Gender            object
Product line      object
Date              object
Time              object
Quantity           int64
cogs             float64
gross income     float64
Payment           object
dtype: object


In [113]:
# Extract year from 'Date' or 'Time' column (replace with your column name)
dataset['Year'] = pd.to_datetime(dataset['Date']).dt.year
dataset['Year'] = pd.to_datetime(dataset['Date']).dt.year  # Assuming 'Date' holds datetime data

# One-hot encode the 'Year' feature
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
year_encoded = encoder.fit_transform(dataset[['Year']])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['Year'] = pd.to_datetime(dataset['Date']).dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['Year'] = pd.to_datetime(dataset['Date']).dt.year  # Assuming 'Date' holds datetime data


In [114]:
# Create a pandas DataFrame from the data
df = pd.DataFrame(dataset)

# Get the number of labels (unique categories) for each column
num_labels = df.nunique()
print(num_labels)

Branch             3
Customer type      2
Gender             2
Product line       6
Date              89
Time             506
Quantity          10
cogs             990
gross income     990
Payment            3
Year               1
dtype: int64


In [115]:
# Initialise label encoder
label_encoder = LabelEncoder() 
  
# Encode labels in the data frame. 
df['Branch']= label_encoder.fit_transform(df['Branch']) 
df['Customer type']= label_encoder.fit_transform(df['Customer type']) 
df['Gender']= label_encoder.fit_transform(df['Gender']) 
df['Product line']= label_encoder.fit_transform(df['Product line']) 
df['Payment']= label_encoder.fit_transform(df['Payment'])   

#Print the data frame
df

Unnamed: 0,Branch,Customer type,Gender,Product line,Date,Time,Quantity,cogs,gross income,Payment,Year
0,0,0,0,3,2019-01-05 00:00:00,1900-01-01 13:08:00,7,522.83,26.1415,2,2019
1,2,1,0,0,2019-03-08 00:00:00,1900-01-01 10:29:00,5,76.40,3.8200,0,2019
2,0,1,1,4,2019-03-03 00:00:00,1900-01-01 13:23:00,7,324.31,16.2155,1,2019
3,0,0,1,3,2019-01-27 00:00:00,1900-01-01 20:33:00,8,465.76,23.2880,2,2019
4,0,1,1,5,2019-02-08 00:00:00,1900-01-01 10:37:00,7,604.17,30.2085,2,2019
...,...,...,...,...,...,...,...,...,...,...,...
995,2,1,1,3,2019-01-29 00:00:00,1900-01-01 13:46:00,1,40.35,2.0175,2,2019
996,1,1,0,4,2019-03-02 00:00:00,1900-01-01 17:16:00,10,973.80,48.6900,2,2019
997,0,0,1,2,2019-02-09 00:00:00,1900-01-01 13:22:00,1,31.84,1.5920,0,2019
998,0,1,1,4,2019-02-22 00:00:00,1900-01-01 15:33:00,1,65.82,3.2910,0,2019


In [116]:
# Assuming df['cogs'] contains your data
cogs_array = np.array(df['cogs'])  # Convert to numpy array if not already
gross_array = np.array(df['gross income'])

# Reshape the array to 2D array
cogs_array_reshaped = cogs_array.reshape(-1, 1)
grossincome_array_reshaped = gross_array.reshape(-1, 1)

# Initialize the MinMaxScaler
MinMax = MinMaxScaler()

# Fit and transform the reshaped array
df['cogs'] = MinMax.fit_transform(cogs_array_reshaped)
df['gross income'] = MinMax.fit_transform(grossincome_array_reshaped)

df


Unnamed: 0,Branch,Customer type,Gender,Product line,Date,Time,Quantity,cogs,gross income,Payment,Year
0,0,0,0,3,2019-01-05 00:00:00,1900-01-01 13:08:00,7,0.521616,0.521616,2,2019
1,2,1,0,0,2019-03-08 00:00:00,1900-01-01 10:29:00,5,0.067387,0.067387,0,2019
2,0,1,1,4,2019-03-03 00:00:00,1900-01-01 13:23:00,7,0.319628,0.319628,1,2019
3,0,0,1,3,2019-01-27 00:00:00,1900-01-01 20:33:00,8,0.463549,0.463549,2,2019
4,0,1,1,5,2019-02-08 00:00:00,1900-01-01 10:37:00,7,0.604377,0.604377,2,2019
...,...,...,...,...,...,...,...,...,...,...,...
995,2,1,1,3,2019-01-29 00:00:00,1900-01-01 13:46:00,1,0.030707,0.030707,2,2019
996,1,1,0,4,2019-03-02 00:00:00,1900-01-01 17:16:00,10,0.980465,0.980465,2,2019
997,0,0,1,2,2019-02-09 00:00:00,1900-01-01 13:22:00,1,0.022049,0.022049,0,2019
998,0,1,1,4,2019-02-22 00:00:00,1900-01-01 15:33:00,1,0.056622,0.056622,0,2019


In [117]:
#Data Visualisation in a plot 
# payment_mapping = {0: 1, 1: 2, 2: 3}  # Update with your actual payment methods
# df['Payment_Code'] = df['Payment'].map(payment_mapping)

# # Plotting with matplotlib
# plt.figure(figsize=(8, 6))
# plt.scatter(df.index, df['Payment_Code'], marker='o')
# plt.title('Payment Distribution')
# plt.xlabel('Index')
# plt.ylabel('Payment Method')
# plt.yticks(list(payment_mapping.values()), list(payment_mapping.keys()))# Setting y-ticks labels
# plt.show()

# sns.PairGrid(df)
# g = sns.PairGrid(df)
# g.map(plt.scatter)
# sns.pairplot(df,hue='Payment',palette='rainbow')

In [118]:
#Split the dataset as X and Y 
x = df.drop(['Payment'],axis=1)
y = df['Payment'] #value that is expected to predict 

#Print X and Y 
print(x)
print (y)



     Branch  Customer type  Gender  Product line                 Date  \
0         0              0       0             3  2019-01-05 00:00:00   
1         2              1       0             0  2019-03-08 00:00:00   
2         0              1       1             4  2019-03-03 00:00:00   
3         0              0       1             3  2019-01-27 00:00:00   
4         0              1       1             5  2019-02-08 00:00:00   
..      ...            ...     ...           ...                  ...   
995       2              1       1             3  2019-01-29 00:00:00   
996       1              1       0             4  2019-03-02 00:00:00   
997       0              0       1             2  2019-02-09 00:00:00   
998       0              1       1             4  2019-02-22 00:00:00   
999       0              0       0             1  2019-02-18 00:00:00   

                    Time  Quantity      cogs  gross income  Year  
0    1900-01-01 13:08:00         7  0.521616      0.5216

In [119]:
#Split train and test dataset 
x_train, x_test, y_train, y_test = train_test_split(x, y,test_size=0.2,train_size=0.8,random_state=1,shuffle=False)
print('x train')
print(x_train)

print('X test')
print (x_test)

print('y train')
print(y_train)

print('y test')
print (y_test)


x train
     Branch  Customer type  Gender  Product line                 Date  \
0         0              0       0             3  2019-01-05 00:00:00   
1         2              1       0             0  2019-03-08 00:00:00   
2         0              1       1             4  2019-03-03 00:00:00   
3         0              0       1             3  2019-01-27 00:00:00   
4         0              1       1             5  2019-02-08 00:00:00   
..      ...            ...     ...           ...                  ...   
795       1              1       1             1  2019-03-15 00:00:00   
796       2              0       0             4  2019-01-24 00:00:00   
797       0              0       0             5  2019-02-22 00:00:00   
798       1              1       1             3  2019-03-15 00:00:00   
799       2              0       1             4  2019-02-11 00:00:00   

                    Time  Quantity      cogs  gross income  Year  
0    1900-01-01 13:08:00         7  0.521616    

In [120]:
# define models and parameters

grid['max_depth'] = range(2, 10)
model = DecisionTreeClassifier()
criterion = ['gini', 'entropy', 'log_loss']
splitter = ['best','random']
#max_depth = [0, 1]
min_samples_split = [2,4,6,8,50]

grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy', error_score='raise')
grid_result = grid_search.fit(x_train, np.ravel(y_train))  # Ensure y_train is flat (1D)

# define grid search
grid = dict(criterion=criterion,splitter=splitter,max_depth=max_depth,min_samples_split=min_samples_split)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
#grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
#grid_result = grid_search.fit(x_train, y_train)

print(grid_result.best_params_)

TypeError: float() argument must be a string or a real number, not 'Timestamp'

In [None]:
#Train the model 
clf = DecisionTreeClassifier(criterion='entropy',splitter='best',max_depth=1,min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, class_weight=None, ccp_alpha=0.0)

clf.fit(x_train,y_train)

In [None]:
#Trained model prediction and accuracy 
prediction = clf.predict(x_test)
accuracy = accuracy_score(y_test,prediction)
accuracy 

In [None]:
#Train dataset based on Gaussian naive bayes algorithm 
gnb = GaussianNB()
gnb.fit(x_train, y_train)
# making predictions on the testing set
y_pred = gnb.predict(x_test)
#comparing actual responses with predicted response value 
print("Gaussian Naive Bayes model accuracy(in %):", metrics.accuracy_score(y_test, y_pred)*100)

In [None]:
#Support Vector Machines : SVC algorithm 
clf = make_pipeline(StandardScaler(), SVC(C=1.0,gamma='auto',decision_function_shape='ovo'))
clf.fit(x_train, y_train)
# pipeline(steps=[('standardscaler', StandardScaler()),
#                 ('svc', SVC(gamma='auto'))])
prediction = clf.predict(x_test)
accuracy = accuracy_score(y_test,prediction)
accuracy

In [None]:
#Support Vector Machines : NuSVC algorithm 
clf = make_pipeline(StandardScaler(), NuSVC(nu=0.5, kernel='rbf', degree=3, gamma='auto', coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape='ovr', break_ties=False, random_state=None))
clf.fit(x_train, y_train)
# pipeline(steps=[('standardscaler', StandardScaler()),
#                 ('svc', SVC(gamma='auto'))])
prediction = clf.predict(x_test)
accuracy = accuracy_score(y_test,prediction)
accuracy

In [None]:
#Random Forest Algorithm 
clf = RandomForestClassifier(n_estimators=10,max_depth=2, random_state=0).fit(x_train, y_train)
clf.score(x_train, y_train)

In [None]:
#RidgeClassifier 
clf = RidgeClassifier(alpha=1,copy_X=False,solver='sag').fit(x_train, y_train)
clf.score(x_train, y_train)

In [None]:
#BaggingClassifier
# x_train, y_train = make_classification(n_samples=1000, n_features=7,n_informative=2, n_redundant=0,random_state=0, shuffle=False)
clf = BaggingClassifier(estimator=SVC(),n_estimators=10, random_state=0).fit(x_train, y_train)
clf.score(x_train, y_train)