# Dataset Preprocessing

In [None]:
# Mount Google Drive at /content/drive; the CSV files loaded below are read
# from paths under that mount point (Colab-only dependency).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install category_encoders

In [None]:
# Import the packages

import pandas as pd
import numpy as np
import itertools

import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import matplotlib.gridspec as gridspec
import seaborn as sns
sns.set(style='whitegrid', palette='muted', font_scale=1.5)
from pylab import rcParams
rcParams['figure.figsize'] = 12, 8
import os

# Any results you write to the current directory are saved as output.

In [None]:
# Read the training and test CSV exports from the mounted Drive folder.
df = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/Datasets/Project/train - train.csv.csv'
)
df_test = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/Datasets/Project/test - test.csv.csv'
)

# Report the size of the training frame.
n_rows, n_cols = df.shape
print("The dataset has %d rows and %d columns." % (n_rows, n_cols))

In [None]:
# A cell only renders its last expression, so the training summary used to be
# silently discarded. display() shows both summaries.
display(df.describe())
display(df_test.describe())

In [None]:
# Show the first ten rows of both frames; without display() only the last
# expression of the cell (the test frame) would be rendered.
display(df.head(10))
display(df_test.head(10))

**Finding Missing Values**

In [None]:
# Rows of the training frame that contain at least one missing value.
# display() is required here: a bare expression in the middle of a cell is
# evaluated but never rendered.
missing_values = df[df.isnull().any(axis=1)]
display(missing_values)

# Same check for the test frame (rebinds missing_values, as before).
missing_values = df_test[df_test.isnull().any(axis=1)]
display(missing_values)

### Removing unnecessary columns

In [None]:
# Drop the same set of columns from the training and the test frame.
columns_to_drop = ["ID", "Customer_ID", "Month", "Name", "SSN"]
df = df.drop(columns=columns_to_drop)
df_test = df_test.drop(columns=columns_to_drop)


In [None]:
import pandas as pd

# Map every column name of df to the array of distinct values it contains.
unique_values_per_column = {column: df[column].unique() for column in df.columns}

# Print one "Column / Unique Values" section per column, separated by a blank line.
for column in unique_values_per_column:
    values = unique_values_per_column[column]
    print(f"Column: {column}")
    print(f"Unique Values: {values}")
    print()

In [None]:
# Keep only the training rows whose value in each listed column differs from
# the value paired with it; the filters are applied one after another.
excluded_value_by_column = {
    'Occupation': '_______',
    'Credit_Mix': '_',
    'Payment_of_Min_Amount': 'NM',
    'Payment_Behaviour': '!@9#%8',
}
for column_name, excluded_value in excluded_value_by_column.items():
    df = df[df[column_name] != excluded_value]


### Category Encoding

In [None]:
# Encode the target as zero-based class indices:
#   0 = "Poor", 1 = "Standard", 2 = every other value (i.e. "Good").
# The previous comment advertised 1/2/3, which did not match the code; the
# indices have to start at 0 because train_network uses the label as a list
# index into its one-hot `expected` vector.
CREDIT_SCORE_CODES = {"Poor": 0, "Standard": 1}
df["Credit_Score"] = df["Credit_Score"].apply(lambda x: CREDIT_SCORE_CODES.get(x, 2))

In [None]:
import category_encoders as ce

In [None]:
# Columns of the training frame that are replaced by integer codes.
ordinal_columns = [
    "Occupation",
    "Num_Bank_Accounts",
    "Num_Credit_Card",
    "Num_of_Loan",
    "Type_of_Loan",
    "Num_of_Delayed_Payment",
    "Num_Credit_Inquiries",
    "Credit_Mix",
    "Credit_History_Age",
    "Payment_of_Min_Amount",
    "Payment_Behaviour",
]
encoder = ce.OrdinalEncoder(cols=ordinal_columns)

# Fit the encoder on the training frame and encode it in one step.
df = encoder.fit_transform(df)

In [None]:
# df.head(10) is not the last expression of the cell, so it has to be passed
# to display() to be rendered; df.info() prints its report itself.
display(df.head(10))
df.info()

In [None]:
# Columns of the test frame that are replaced by integer codes.
test_ordinal_columns = [
    "Occupation",
    "Num_Bank_Accounts",
    "Num_Credit_Card",
    "Num_of_Loan",
    "Type_of_Loan",
    "Num_of_Delayed_Payment",
    "Num_Credit_Inquiries",
    "Credit_Mix",
    "Credit_History_Age",
    "Payment_of_Min_Amount",
    "Payment_Behaviour",
]
# NOTE(review): this rebinds `encoder` to a second encoder that is fitted on
# the test frame itself, so the integer given to a category here need not match
# the integer the same category received in df. Confirm whether the encoder
# fitted on the training frame should be reused (transform only) instead.
encoder = ce.OrdinalEncoder(cols=test_ordinal_columns)

df_test = encoder.fit_transform(df_test)

In [None]:
# Print the column summary of the encoded test frame.
df_test.info()

In [None]:
# Count missing values per column in both frames. The training counts were
# previously computed but never shown, because only the last expression of a
# cell is rendered.
display(df.isnull().sum())
display(df_test.isnull().sum())

In [None]:
# Treat a bare '_' as missing and coerce every column to a numeric dtype;
# anything that cannot be parsed as a number becomes NaN. replace() is used
# without inplace=True because df may be a filtered slice of the original
# frame, where in-place mutation is unreliable.
df = df.replace('_', np.nan).apply(pd.to_numeric, errors='coerce')
df_test = df_test.replace('_', np.nan).apply(pd.to_numeric, errors='coerce')

# Impute both frames with the column means of the training frame so that the
# test data is filled with the same statistics the model was prepared with.
train_means = df.mean()
df = df.fillna(train_means)
# fillna aligns the Series on column labels; a test column that has no
# training mean falls back to its own mean.
df_test = df_test.fillna(train_means).fillna(df_test.mean())

### Removing Outliers

In [None]:
import pandas as pd

# Replace IQR outliers in the feature columns with the column median.
# Multiplier applied to the inter-quartile range when building the fences.
iqr_factor = 1.5
# The class label must never be altered by outlier handling.
target_column = "Credit_Score"

# First and third quartile of every column.
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)

# Inter-quartile range of every column.
IQR = Q3 - Q1

# True wherever a value lies outside [Q1 - k*IQR, Q3 + k*IQR] for its column.
outliers_mask = (df < (Q1 - iqr_factor * IQR)) | (df > (Q3 + iqr_factor * IQR))

# Exclude the target: previously a rare class could be flagged as an "outlier"
# and overwritten with the median label.
if target_column in outliers_mask.columns:
    outliers_mask[target_column] = False

# Keep non-outliers, substitute the column median everywhere else.
df = df.where(~outliers_mask, df.median(axis=0), axis=1)

# Show the resulting frame.
print(df)

# Neural Network

In [None]:
from random import seed
from random import randrange
from random import random
from csv import reader
from math import exp

In [None]:
# Initialize a network
def initialize_network(n_inputs, n_hidden, n_outputs):
    """Create a network with one hidden layer and one output layer.

    Each neuron is a dict holding a 'weights' list with one random weight per
    incoming value plus one trailing bias weight.

    Args:
        n_inputs: number of input values fed to every hidden neuron.
        n_hidden: number of neurons in the hidden layer.
        n_outputs: number of neurons in the output layer.

    Returns:
        A list ``[hidden_layer, output_layer]`` of lists of neuron dicts.
    """
    def build_layer(n_neurons, n_incoming):
        layer = []
        for _ in range(n_neurons):
            # n_incoming weights followed by the bias weight.
            layer.append({'weights': [random() for _ in range(n_incoming + 1)]})
        return layer

    # The hidden layer is built first so the random draws happen in the same order.
    return [build_layer(n_hidden, n_inputs), build_layer(n_outputs, n_hidden)]

In [None]:
# Update network weights with error
def update_weights(network, row, l_rate):
    """Apply one gradient step to every weight of the network, in place.

    Each neuron must already carry a 'delta' (and, for all but the last layer,
    an 'output') entry.

    Args:
        network: list of layers, each a list of neuron dicts.
        row: training row; every entry except the last is used as input to
            the first layer.
        l_rate: learning rate multiplied into every update.
    """
    for layer_index, layer in enumerate(network):
        if layer_index == 0:
            # The first layer is fed by the row without its last entry.
            inputs = row[:-1]
        else:
            # Later layers are fed by the outputs of the preceding layer.
            inputs = [neuron['output'] for neuron in network[layer_index - 1]]
        for neuron in layer:
            step = l_rate * neuron['delta']
            weights = neuron['weights']
            for j, value in enumerate(inputs):
                weights[j] -= step * value
            # The trailing weight is the bias; its input is implicitly 1.
            weights[-1] -= step

**Sigmoid Activation Function**

In [None]:
# Calculate neuron activation for an input
def activate(weights, inputs):
    """Return the weighted sum of the inputs plus the bias.

    Args:
        weights: one weight per input, followed by the bias as last element.
        inputs: input values; entries beyond ``len(weights) - 1`` are ignored.

    Returns:
        ``bias + sum(weights[i] * inputs[i])`` accumulated left to right.
    """
    total = weights[-1]  # start from the bias
    for position, weight in enumerate(weights[:-1]):
        total += weight * inputs[position]
    return total

# Transfer neuron activation
def transfer(activation):
    """Logistic sigmoid of ``activation``, computed without overflow.

    ``1 / (1 + exp(-a))`` raises OverflowError once ``-a`` is large enough that
    ``exp`` exceeds the float range, so the negative branch uses the equivalent
    form ``exp(a) / (1 + exp(a))`` whose exponent is never positive.

    Args:
        activation: the neuron's pre-activation value.

    Returns:
        A float between 0.0 and 1.0 inclusive.
    """
    if activation >= 0:
        return 1.0 / (1.0 + exp(-activation))
    scaled = exp(activation)
    return scaled / (1.0 + scaled)

**Back propagation**

In [None]:
# Train a network for a fixed number of epochs
def train_network(network, train, l_rate, n_epoch, n_outputs):
    """Train ``network`` in place with per-row updates for ``n_epoch`` epochs.

    NOTE: a later cell defines ``train_network`` again; when the notebook is run
    top to bottom that later definition replaces this one.

    Args:
        network: list of layers as built by ``initialize_network``.
        train: training rows; the last entry of each row is the class index.
        l_rate: learning rate passed on to ``update_weights``.
        n_epoch: number of passes over ``train``.
        n_outputs: length of the one-hot target vector.
    """
    for epoch in range(n_epoch):
        sum_error = 0
        for row in train:
            outputs = forward_propagate(network, row)
            # One-hot target: 1 at the row's class index, 0 elsewhere.
            expected = [0] * n_outputs
            expected[int(row[-1])] = 1
            squared_errors = [
                (target - outputs[k]) ** 2 for k, target in enumerate(expected)
            ]
            sum_error += sum(squared_errors)
            backward_propagate_error(network, expected)
            update_weights(network, row, l_rate)
        print('>epoch=%d, lrate=%.3f, error=%.3f' % (epoch, l_rate, sum_error))

**Forward Propagation and Training (Gradient Descent Method)**

In [None]:
# Forward propagate input to a network output
def forward_propagate(network, row):
    """Feed ``row`` through every layer and return the last layer's outputs.

    As a side effect each neuron's 'output' entry is set to its transferred
    activation.

    Args:
        network: list of layers, each a list of neuron dicts with 'weights'.
        row: values fed to the first layer.

    Returns:
        The list of outputs of the final layer (``row`` itself if the network
        has no layers).
    """
    signal = row
    for layer in network:
        layer_outputs = []
        for neuron in layer:
            neuron['output'] = transfer(activate(neuron['weights'], signal))
            layer_outputs.append(neuron['output'])
        # The outputs of this layer are the inputs of the next one.
        signal = layer_outputs
    return signal

In [None]:
# Train a network for a fixed number of epochs
def train_network(network, train, l_rate, n_epoch, n_outputs):
    """Train ``network`` in place with per-row updates for ``n_epoch`` epochs.

    This definition replaces the identically named function from the earlier
    cell. The class label is cast with ``int()`` before it is used as a list
    index, matching that earlier definition; without the cast a float label
    (e.g. ``1.0`` coming out of a DataFrame) raises ``TypeError``.

    Args:
        network: list of layers as built by ``initialize_network``.
        train: training rows; the last entry of each row is the class index.
        l_rate: learning rate passed on to ``update_weights``.
        n_epoch: number of passes over ``train``.
        n_outputs: length of the one-hot target vector.

    Raises:
        IndexError: if a class index is not smaller than ``n_outputs``.
    """
    for epoch in range(n_epoch):
        sum_error = 0
        for row in train:
            outputs = forward_propagate(network, row)
            # One-hot target: 1 at the row's class index, 0 elsewhere.
            expected = [0 for i in range(n_outputs)]
            expected[int(row[-1])] = 1
            sum_error += sum([(expected[i]-outputs[i])**2 for i in range(len(expected))])
            backward_propagate_error(network, expected)
            update_weights(network, row, l_rate)
        print('>epoch=%d, lrate=%.3f, error=%.3f' % (epoch, l_rate, sum_error))

**Back Propagation Rules**

In [None]:
# Backpropagate error and store in neurons
def backward_propagate_error(network, expected):
    """Compute and store the 'delta' of every neuron, last layer first.

    Every neuron must already carry an 'output' entry.

    Args:
        network: list of layers, each a list of neuron dicts.
        expected: target value for each neuron of the last layer.
    """
    last_index = len(network) - 1
    for layer_index in range(last_index, -1, -1):
        layer = network[layer_index]
        if layer_index == last_index:
            # Last layer: error is the difference between output and target.
            errors = [
                neuron['output'] - expected[j] for j, neuron in enumerate(layer)
            ]
        else:
            # Earlier layers: error is the delta of each neuron in the next
            # layer, weighted by the connection leading to it.
            errors = []
            for j in range(len(layer)):
                error = 0.0
                for downstream in network[layer_index + 1]:
                    error += downstream['weights'][j] * downstream['delta']
                errors.append(error)
        for neuron, error in zip(layer, errors):
            neuron['delta'] = error * transfer_derivative(neuron['output'])


In [None]:
# Backpropagation Algorithm With Stochastic Gradient Descent
def back_propagation(train, test, l_rate, n_epoch, n_hidden):
    """Train a fresh network on ``train`` and predict a class for each test row.

    The output layer is sized from the largest class index found in ``train``
    rather than from the number of distinct labels. When every class
    ``0..k-1`` is present both give ``k``; when a fold happens to miss a lower
    class the old count was too small and ``train_network`` failed with an
    IndexError while building the one-hot target.

    Args:
        train: training rows; the last entry of each row is the class index
            (a non-negative integer).
        test: rows to classify.
        l_rate: learning rate.
        n_epoch: number of training epochs.
        n_hidden: number of neurons in the hidden layer.

    Returns:
        A list with one predicted class index per row of ``test``.
    """
    n_inputs = len(train[0]) - 1
    n_outputs = max(int(row[-1]) for row in train) + 1
    network = initialize_network(n_inputs, n_hidden, n_outputs)
    train_network(network, train, l_rate, n_epoch, n_outputs)
    return [predict(network, row) for row in test]

In [None]:
# Calculate the derivative of a neuron output
def transfer_derivative(output):
    """Return ``output * (1 - output)``, the slope of the sigmoid at ``output``."""
    complement = 1.0 - output
    return output * complement

**Cross-Validation and Evaluation (Termination Criteria)**

In [None]:
# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
    """Randomly partition ``dataset`` into ``n_folds`` folds of equal size.

    Rows are drawn without replacement from a shallow copy, so ``dataset``
    itself is not modified. Rows left over after the last full fold are not
    placed in any fold.

    Args:
        dataset: sequence of rows.
        n_folds: number of folds to create.

    Returns:
        A list of ``n_folds`` lists, each holding
        ``int(len(dataset) / n_folds)`` rows.
    """
    remaining = list(dataset)
    fold_size = int(len(dataset) / n_folds)
    folds = []
    for _ in range(n_folds):
        fold = [remaining.pop(randrange(len(remaining))) for _ in range(fold_size)]
        folds.append(fold)
    return folds

# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
    """Return the percentage of positions where ``predicted`` equals ``actual``.

    Args:
        actual: sequence of true labels.
        predicted: sequence of predicted labels, at least as long as ``actual``.

    Returns:
        A float in ``[0.0, 100.0]``; ``0.0`` when ``actual`` is empty (an empty
        fold used to raise ZeroDivisionError).

    Raises:
        IndexError: if ``predicted`` is shorter than ``actual``.
    """
    total = len(actual)
    if total == 0:
        return 0.0
    correct = sum(1 for i in range(total) if actual[i] == predicted[i])
    return correct / float(total) * 100.0

# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    """Score ``algorithm`` once per fold of a random cross-validation split.

    For every fold the remaining folds are concatenated into the training set
    and the fold itself, with the last entry of each row set to ``None``,
    becomes the test set.

    Args:
        dataset: sequence of rows; the last entry of each row is the label.
        algorithm: callable ``algorithm(train_set, test_set, *args)`` returning
            one prediction per test row.
        n_folds: number of folds.
        *args: extra positional arguments forwarded to ``algorithm``.

    Returns:
        A list with one accuracy percentage per fold.
    """
    folds = cross_validation_split(dataset, n_folds)
    scores = []
    for fold in folds:
        other_folds = list(folds)
        other_folds.remove(fold)
        train_set = [row for other in other_folds for row in other]
        test_set = []
        for row in fold:
            # Copy the row so the label can be hidden without touching the fold.
            masked = list(row)
            masked[-1] = None
            test_set.append(masked)
        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in fold]
        scores.append(accuracy_metric(actual, predicted))
    return scores

In [None]:
# Make a prediction with a network
def predict(network, row):
    """Return the index of the largest output produced for ``row``.

    When several outputs share the maximum, the lowest index is returned.
    """
    outputs = forward_propagate(network, row)
    strongest = max(outputs)
    return outputs.index(strongest)

**Dataset Preparation and Learning Rate**

In [None]:
# Convert the DataFrame into a list of rows (plain Python lists).
dataset = df.to_numpy().tolist()

In [None]:
# Cast the last entry of every row (the target) from float to int; all other
# entries are kept as they are.
dataset = [
    [value if position != len(row) - 1 else int(value) for position, value in enumerate(row)]
    for row in dataset
]

In [None]:
# Preview only the first rows; rendering the whole list floods the cell output.
dataset[:5]

In [None]:
# Find the min and max values for each column
def dataset_minmax(dataset):
    """Return ``[min, max]`` for every column of ``dataset``.

    Args:
        dataset: sequence of equally long rows.

    Returns:
        A list with one ``[min, max]`` pair per column, in column order; an
        empty list for an empty dataset.
    """
    return [[min(column), max(column)] for column in zip(*dataset)]

# Rescale dataset columns to the range 0-1
def normalize_dataset(dataset, minmax):
    """Min-max scale every entry except the last of each row, in place.

    A column whose minimum equals its maximum carries no information and would
    cause a division by zero; its entries are set to ``0.0`` instead.

    Args:
        dataset: list of mutable rows; the last entry of each row is left
            untouched.
        minmax: ``[min, max]`` pair per column, as returned by
            ``dataset_minmax``.
    """
    for row in dataset:
        for i in range(len(row) - 1):
            low, high = minmax[i][0], minmax[i][1]
            span = high - low
            row[i] = (row[i] - low) / span if span != 0 else 0.0

# Rescale every column of an unlabeled dataset to the range 0-1
def normalize_test_dataset(dataset, minmax):
    """Min-max scale every entry of each row, in place.

    Unlike ``normalize_dataset`` the last entry of a row is scaled as well.
    A column whose minimum equals its maximum would cause a division by zero;
    its entries are set to ``0.0`` instead.

    Args:
        dataset: list of mutable rows.
        minmax: ``[min, max]`` pair per column; must provide at least as many
            pairs as a row has entries.
    """
    for row in dataset:
        for i in range(len(row)):
            low, high = minmax[i][0], minmax[i][1]
            span = high - low
            row[i] = (row[i] - low) / span if span != 0 else 0.0

In [None]:
# normalize input variables
# minmax holds a [min, max] pair for every column of dataset (the label column
# included); normalize_dataset then rescales all but the last entry of each
# row in place.
minmax = dataset_minmax(dataset)
normalize_dataset(dataset, minmax)

In [None]:
# Seed Python's `random` module: both the fold assignment (randrange) and the
# initial weights (random) are drawn from it, so without a seed every run
# reports different scores.
seed(1)

# Evaluation and training settings.
n_folds = 5      # number of cross-validation folds
l_rate = 0.1     # learning rate
n_epoch = 20     # training epochs per fold
n_hidden = 21    # neurons in the hidden layer

scores = evaluate_algorithm(dataset, back_propagation, n_folds, l_rate, n_epoch, n_hidden)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))