# Preferred Payment Method Prediction


### Reg No: IT21134180
### Name: Vihansa S.A.S

<hr/>

<ul>
    <li><b>Target Variable:</b> Payment (Cash, Credit card, Ewallet) </li>
    <li><b>Predictors:</b> Branch, Customer type, Gender, Product line, Quantity, Date, Time, COGS, and Gross income</li>
    <li><b>Objective:</b> Predict the preferred payment method for customers based on their purchase behavior, allowing the business to tailor promotions or offers to encourage the use of specific payment methods.</li>
</ul>




In [None]:
#Importing required libraries
import seaborn as sns
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.svm import NuSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.preprocessing import OneHotEncoder

from datetime import datetime

In [None]:
# Load the supermarket sales records from the CSV file into a DataFrame.
dataset = pd.read_csv(filepath_or_buffer='../dataset/supermarket_sales.csv')

In [None]:
# Display the loaded dataset as the output of this cell.
dataset

In [None]:
# Keep only the columns used for modelling: the predictors plus the target
# column "Payment".
# The explicit .copy() makes the selection an independent DataFrame, so the
# column assignments performed in later cells do not trigger pandas'
# SettingWithCopyWarning (chained assignment on a selection of another frame).
selected_columns = [
    "Branch",
    "Customer type",
    "Gender",
    "Product line",
    "Time",
    "Quantity",
    "cogs",
    "gross income",
    "Payment",
]
dataset = dataset[selected_columns].copy()



In [None]:
# Display the current contents of `dataset` as the output of this cell.
dataset 

In [None]:
# Check for missing values.
# A bare `dataset.isnull()` that is not the last expression of the cell has
# its result discarded, and the full boolean mask is hard to read for a
# dataset of this size anyway. Print the number of missing entries per column
# instead, so any affected column is immediately visible.
print(dataset.isnull().sum())

# Single True/False answer for the whole dataset (shown as the cell output):
# True if at least one value anywhere in the frame is missing.
dataset.isnull().values.any()

In [None]:
# Section: encoding the data into numerical values [Label Encoding].
# No encoding happens in this cell; it only inspects the "Branch" column.

# Count how many rows carry each distinct label of the "Branch" column.
dataset["Branch"].value_counts()

In [None]:
# Parse the "HH:MM" purchase time exactly once.
# The column holds a time of day only, so every parsed timestamp falls on the
# parser's default date (1900-01-01) with zero seconds. Year, month, day and
# second would therefore be identical for every row; such constant columns
# carry no information for the classifiers and are deliberately not derived.
purchase_time = pd.to_datetime(dataset['Time'], format='%H:%M')
purchase_hour = purchase_time.dt.hour

# assign() returns a new frame instead of writing into `dataset` column by
# column, which avoids chained-assignment warnings when `dataset` is itself a
# selection of another DataFrame.
dataset = dataset.assign(
    hour=purchase_hour,
    minute=purchase_time.dt.minute,
    # Cyclical encoding: maps the hour onto a circle so that 23:00 and 00:00
    # end up close to each other instead of 23 units apart.
    hour_sin=np.sin(2 * np.pi * purchase_hour / 24),
    hour_cos=np.cos(2 * np.pi * purchase_hour / 24),
).drop(columns='Time')  # the raw time string is no longer needed

dataset

In [None]:
# Work on an explicit deep copy of `dataset`.
# pd.DataFrame(dataset) does not copy the underlying data, so whether the
# encoding/scaling applied to `df` in later cells also alters `dataset` would
# depend on the installed pandas version. copy() makes the two frames
# independent in every version.
df = dataset.copy()

# Number of distinct values in each column (i.e. how many labels each
# categorical column contains).
num_labels = df.nunique()
print(num_labels)

In [None]:
# Label-encode every categorical column, keeping one fitted encoder per column.
# Re-fitting a single LabelEncoder for each column in turn would overwrite its
# learned classes every time, leaving no way to map the encoded integers of
# the earlier columns back to their original labels.
categorical_columns = ['Branch', 'Customer type', 'Gender', 'Product line', 'Payment']
label_encoders = {}
for column in categorical_columns:
    column_encoder = LabelEncoder()
    df[column] = column_encoder.fit_transform(df[column])
    label_encoders[column] = column_encoder

# Backward compatibility: `label_encoder` is still the encoder that was fitted
# on the target column, e.g. for label_encoder.inverse_transform(predictions).
label_encoder = label_encoders['Payment']

# Show the encoded data frame
df

In [None]:
# Rescale the monetary columns to the [0, 1] range.
# MinMaxScaler scales each feature independently, so fitting once on both
# columns yields the same values as fitting one column at a time, while
# keeping the fitted parameters of *both* columns on the scaler (re-fitting
# the same scaler per column would discard the parameters of the first one).
# NOTE(review): the scaler is fitted on the full data set before the
# train/test split, so the test rows influence the scaling parameters —
# consider fitting on the training split only.
scaled_columns = ['cogs', 'gross income']

MinMax = MinMaxScaler()
df[scaled_columns] = MinMax.fit_transform(df[scaled_columns])

df


In [None]:
# Print the dtype of every column of `dataset`.
# NOTE(review): the encoding and scaling cells operate on `df`; confirm
# whether `dataset` or `df` is the frame that was meant to be inspected here.
print(dataset.dtypes)

In [None]:
# Separate the target from the predictors.
y = df['Payment']                  # value the models are expected to predict
x = df.drop(columns=['Payment'])   # every remaining column is a predictor

# Show both parts
print(x)
print(y)



In [None]:
# Split into training (80 %) and test (20 %) sets.
# With shuffle=False the random_state argument has no effect and the test set
# is simply the last 20 % of the rows. Shuffling with a fixed seed keeps the
# split reproducible, and stratify=y preserves the proportions of the three
# payment classes in both parts.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
    shuffle=True,
    stratify=y,
)

# Print each part under the same headings as before.
for heading, part in (
    ('x train', x_train),
    ('X test', x_test),
    ('y train', y_train),
    ('y test', y_test),
):
    print(heading)
    print(part)


In [None]:
# Hyper-parameter search space for the decision tree.
# NOTE: this is an exhaustive grid (138,240 combinations); together with the
# 10x3 repeated CV below that is over four million fits, so expect a long run.
grid = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'splitter': ['best', 'random'],
    'max_depth': range(2, 10),
    'min_samples_split': [2, 4, 6, 8, 50],
    'min_samples_leaf': [1, 2, 4, 8],
    'max_features': ['sqrt', 'log2'],
    'max_leaf_nodes': [None, 5, 10, 20],
    'min_impurity_decrease': [0.0, 0.1, 0.2],
    'ccp_alpha': [0.0, 0.1, 0.2],
    'class_weight': [None, 'balanced'],
}

# A fixed random_state is required for a reproducible search: the grid covers
# splitter='random' and max_features subsampling, both of which draw random
# numbers, so an unseeded tree would give different scores (and possibly
# different best parameters) on every run.
model = DecisionTreeClassifier(random_state=1)

# 10-fold stratified cross-validation, repeated 3 times with a fixed seed.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# error_score='raise' surfaces invalid parameter combinations immediately
# instead of silently scoring them as NaN.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    n_jobs=-1,
    cv=cv,
    scoring='accuracy',
    error_score='raise',
)
grid_result = grid_search.fit(x_train, np.ravel(y_train))  # Ensure y_train is flat (1D)

print(grid_result.best_params_)
# Mean cross-validated accuracy achieved by the best parameter combination.
print(grid_result.best_score_)

In [None]:
# Train the decision tree.
# splitter='random' draws random split thresholds, so without a fixed
# random_state the fitted tree (and its accuracy) would change on every run.
# All arguments that merely restated scikit-learn's defaults were dropped.
# NOTE(review): these hyper-parameters are hard-coded rather than read from
# grid_result.best_params_, and the tree uses the default max_features=None,
# which is not among the values searched by the grid — confirm this is intended.
clf = DecisionTreeClassifier(
    criterion='log_loss',
    splitter='random',
    max_depth=7,
    min_samples_split=4,
    random_state=1,
)

clf.fit(x_train, y_train)

In [None]:
# Score the fitted classifier on the held-out test split.
prediction = clf.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=prediction)
accuracy

In [None]:
# Gaussian Naive Bayes: fit on the training split (fit() returns the estimator).
gnb = GaussianNB().fit(x_train, y_train)

# Predict the test split and report the share of correct predictions in percent.
y_pred = gnb.predict(x_test)
gnb_accuracy_percent = 100 * metrics.accuracy_score(y_test, y_pred)
print("Gaussian Naive Bayes model accuracy(in %):", gnb_accuracy_percent)

In [None]:
# Support Vector Machines: SVC on standardised features.
# The pipeline standardises the predictors before they reach the classifier.
svc_model = SVC(C=1.0, gamma='auto', decision_function_shape='ovo')
clf = make_pipeline(StandardScaler(), svc_model)
clf.fit(x_train, y_train)

# Accuracy on the held-out test split.
prediction = clf.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=prediction)
accuracy

In [None]:
# Support Vector Machines: NuSVC on standardised features.
# Every setting is spelled out explicitly, one per line.
nusvc_settings = dict(
    nu=0.5,
    kernel='rbf',
    degree=3,
    gamma='auto',
    coef0=0.0,
    shrinking=True,
    probability=False,
    tol=0.001,
    cache_size=200,
    class_weight=None,
    verbose=False,
    max_iter=-1,
    decision_function_shape='ovr',
    break_ties=False,
    random_state=None,
)
clf = make_pipeline(StandardScaler(), NuSVC(**nusvc_settings))
clf.fit(x_train, y_train)

# Accuracy on the held-out test split.
prediction = clf.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=prediction)
accuracy

In [None]:
# Random Forest: 10 shallow trees (depth 2) with a fixed seed.
clf = RandomForestClassifier(n_estimators=10, max_depth=2, random_state=0).fit(x_train, y_train)

# Accuracy on the training data only shows how well the model fits the rows it
# has already seen; print it for reference, but report the held-out test
# accuracy as the cell result so it is comparable with the other models.
print('Random forest training accuracy:', clf.score(x_train, y_train))
accuracy = clf.score(x_test, y_test)
accuracy

In [None]:
# Ridge classifier.
# - The 'sag' solver is only guaranteed to converge quickly on features with
#   roughly the same scale, so the predictors are standardised first (as in
#   the SVC cells).
# - 'sag' shuffles the data, so random_state is fixed for reproducibility.
# - copy_X is left at its default (True): copy_X=False allows the estimator to
#   overwrite its input, which is unsafe for data reused by other cells.
clf = make_pipeline(
    StandardScaler(),
    RidgeClassifier(alpha=1, solver='sag', random_state=0),
).fit(x_train, y_train)

# Print the training accuracy for reference, but report the held-out test
# accuracy as the cell result so it is comparable with the other models.
print('Ridge classifier training accuracy:', clf.score(x_train, y_train))
accuracy = clf.score(x_test, y_test)
accuracy

In [None]:
# Bagging classifier: an ensemble of 10 SVCs, each trained on a bootstrap
# sample of the training split.
# Each base SVC is wrapped in a standardising pipeline, consistent with the
# stand-alone SVC cells, because SVMs are sensitive to feature scale.
clf = BaggingClassifier(
    estimator=make_pipeline(StandardScaler(), SVC()),
    n_estimators=10,
    random_state=0,
).fit(x_train, y_train)

# Print the training accuracy for reference, but report the held-out test
# accuracy as the cell result so it is comparable with the other models.
print('Bagging classifier training accuracy:', clf.score(x_train, y_train))
accuracy = clf.score(x_test, y_test)
accuracy