# Classification

# TASK 2: IMPORTING LIBRARIES & MODELS

In [None]:

# Core libraries
import pandas as pd
import numpy as np

# Algorithm libraries & modules
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn import metrics

# Visualization libraries & modules (Optional)
import pydotplus
from IPython.display import Image

# TASK 3: EXPLORING THE DATA

### Data Preparation

In [None]:
# Define input file (CSV read by get_df below)
input_file = 'bank_marketing.csv'

# Define class labels
# NOTE(review): these names are later passed as `class_names` to
# tree.export_graphviz, which assigns them to the target classes in ascending
# order, so 'Deposit' labels the smallest target value -- confirm this matches
# how the `deposit` column is encoded.
df_class_names = ['Deposit','Undeposit']

In [None]:
# Define a function to create our dataframe

def get_df(file):
    """Read the CSV at *file* with pandas and return the resulting DataFrame."""
    return pd.read_csv(file)

# Load the input file into a dataframe
df = get_df(input_file)
# Keep the row and column counts of the loaded table for the cells below
row_no,col_no = df.shape


In [None]:
# Display the loaded dataframe
df

### Question 5: On how many customers will the algorithm be trained and tested?

In [None]:
# Count the rows flagged for training (train == 1) and for testing (train == 0)
train_size = int((df['train'] == 1).sum())
test_size = int((df['train'] == 0).sum())

print(f"Number of customers the algorithm will be trained on: {train_size}")
print(f"Number of customers the algorithm will be tested on: {test_size}")

### Question 6: By how many features is each customer represented?

In [None]:
# The last two columns are left out of the count; the cells below use them as
# the target column and the train/test flag column.
print(f"Number of features each customer is represented by: {col_no-2}")

### Question 7: Which features are categorized as predictors?

In [None]:
# Define a function that shows the features (without the Train column)
def get_feature_names(dataframe):
    """Return the column names of *dataframe* as a list, dropping the last column."""
    return list(dataframe.columns)[:-1]

# Define a function that shows the predictors (without the target column)
def get_predictors_names(dataframe):
    """Return the column names of *dataframe* as a list, dropping the last two columns."""
    return list(dataframe.columns)[:-2]

In [None]:
# Collect every column name except the last one, then display the list
df_features_names = get_feature_names(df)
df_features_names

In [None]:
# Collect the predictor column names and print them one per line
df_predictors_names = get_predictors_names(df)
print("The following features are categorized as predictors:")
for predictor_name in df_predictors_names:
    print(predictor_name)

### Question 8: Which feature is categorized as the target?


In [None]:
# The last entry of the feature list (second-to-last dataframe column) is shown as the target
df_features_names[-1:]

### Question 9: build a Decision Tree model


# TASK 4: BUILDING THE MODEL

#### Step 1: Saving the data in a multidimensional array

In [None]:
# Define a function that saves the data in an array
def get_nd(dataframe):
    """Return the contents of *dataframe* as an array via its ``values`` attribute."""
    return dataframe.values

# Convert the whole dataframe (all columns) into an array
nd_values = get_nd(df)

# Show the values
nd_values

#### Step 2: Split array into train and test

In [None]:
# Define a function that split the array into train and test data
def get_nd_train_test(ndarray, train_index):
    """Split the rows of *ndarray* into training rows and test rows.

    Args:
        ndarray: 2-D array in which one column holds the train/test flag
            (1 marks a training row, 0 marks a test row).
        train_index: 1-based position of the flag column. The notebook passes
            the total column count, which selects the last column.

    Returns:
        A tuple ``(nd_train, nd_test)`` with the rows whose flag equals 1 and
        the rows whose flag equals 0, respectively.
    """
    # Use the argument rather than the notebook-level `col_no` global, so the
    # function works on any array and does not depend on cell execution order.
    flag_column = ndarray[:, train_index - 1]
    nd_train = ndarray[flag_column == 1]
    nd_test = ndarray[flag_column == 0]
    return nd_train, nd_test

# Split the array into train and test rows; col_no is passed as train_index
nd_train_data, nd_test_data = get_nd_train_test (nd_values, col_no)

# Show the train data
nd_train_data

#### Step 3: Predictors and label separations<br><br>(X - includes the predictors columns; Y - includes the label (target) column)

In [None]:
# Define a function that separates the predictors columns from the label (target) column
def get_XY(ndarray):
    """Return ``(X, Y)``: all columns but the last two, and the second-to-last column."""
    predictors, label = ndarray[:, :-2], ndarray[:, -2]
    return predictors, label

# Separate predictors (X) from the label (Y) for the train rows and for the test rows
x_train, y_train = get_XY (nd_train_data)
x_test, y_test = get_XY (nd_test_data)

#### Step 4: Build model with training data

In [None]:
# Define a function that will create a decision tree that classifies customers (characterized by the predictors) with similar label (target) (Deposit / Undeposit)
def get_clf(X, Y, criterion="gini", min_impurity_decrease=0.00065, random_state=None):
    """Fit a decision-tree classifier on the training data and return it.

    Args:
        X: Predictor values, one row per training sample.
        Y: Target label for each row of X.
        criterion: Split-quality measure forwarded to DecisionTreeClassifier.
            Defaults to "gini", the value previously hard-coded here.
        min_impurity_decrease: Minimum impurity decrease forwarded to
            DecisionTreeClassifier. Defaults to 0.00065, the value previously
            hard-coded here.
        random_state: Optional seed forwarded to DecisionTreeClassifier. The
            default None keeps the library's default behavior; pass an int
            to make repeated runs reproducible.

    Returns:
        The fitted DecisionTreeClassifier.
    """
    clf = DecisionTreeClassifier(
        criterion=criterion,
        min_impurity_decrease=min_impurity_decrease,
        random_state=random_state,
    )
    clf.fit(X, Y)
    return clf

# Fit the decision tree on the training predictors and labels
dt_clf = get_clf(x_train,y_train)


#### Step 5: Make predictions for testing data

In [None]:
# Define a function that takes the trained model and test it on unseen data (i.e., test data)
def get_pred(classifier, X):
    """Return whatever ``classifier.predict`` produces for the samples in *X*."""
    return classifier.predict(X)

# Predict labels for the test predictors with the fitted tree
dt_y_pred = get_pred(dt_clf, x_test)

### Visualize the Model (Optional)

In [None]:
# Define a function that transform the model results from numerical values into a graph
def get_graph (classifier, predictors_names):
    """Export *classifier* to DOT text and parse it into a pydotplus graph.

    The nodes are labelled with *predictors_names* as feature names and with
    the notebook-level ``df_class_names`` as class names.
    """
    dot_source = tree.export_graphviz(
        classifier,
        out_file=None,  # return the DOT text instead of writing a file
        feature_names=predictors_names,
        class_names=df_class_names,
    )
    return pydotplus.graph_from_dot_data(dot_source)

# Build the graph of the fitted tree, labelled with the predictor names
dt_graph = get_graph(dt_clf,df_predictors_names)

In [None]:
# Define a function that saves the graph into PDF and JPEG files
def save_graph(graph, basename='bank'):
    """Write *graph* to ``<basename>.pdf`` and ``<basename>.jpg``.

    Args:
        graph: Object exposing ``write_pdf`` and ``write_jpg`` methods, each
            taking the output path.
        basename: Output path without extension. Defaults to ``'bank'``,
            which produces the file names that were previously hard-coded.
    """
    graph.write_pdf(f'{basename}.pdf')
    graph.write_jpg(f'{basename}.jpg')
    
# Write the decision-tree graph to disk
save_graph(dt_graph)

In [None]:
# Define a function that creates an image of the decision tree graph
def get_image(graph):
    """Wrap the output of ``graph.create_png()`` in an IPython ``Image`` and return it."""
    return Image(graph.create_png())

# Create an image of the decision-tree graph
graph_image = get_image(dt_graph)

# Show the image
graph_image

### Question 10: Some of the predictors that were included in the data are not presented in the decision tree. Explain why (provide a verbal answer).

Because no split on those predictors reduces the (weighted) Gini impurity by at least the value of "min_impurity_decrease" (0.00065), so the tree never selects them for a split.


### Question 11: The Gini value shown in the first tree node of the decision tree is 0.209. Explain what this value represents (provide a verbal answer).

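It is the Gini impurity of the root node, i.e. of all the training customers before the split on 'season': the probability that a randomly selected training customer would be classified incorrectly if a label were assigned at random according to the class distribution in that node.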

### Question 12: The attribute that was selected for the root of the tree is ‘season’. Provide a detailed calculation of the Gini index for this attribute (Hint: you can use Excel, Python, or perform a manual calculation).

In [None]:
# Weighted Gini index of the 'season' attribute over the training rows:
# for each season value, compute the Gini impurity of `deposit` within that
# season and weight it by the share of training rows that fall in the season.
train_df = df[df['train'] == 1]
train_total = len(train_df)

# Take the season codes from the data instead of assuming 1..4: a season that
# is absent from the training rows would otherwise cause a division by zero,
# and a season with any other code would be silently left out.
seasons = sorted(train_df['season'].dropna().unique())
seasons_ginis = []
for season in seasons:
    season_occurrences = train_df[train_df['season'] == season]
    total_occurrences = len(season_occurrences)
    true_ratio = (season_occurrences['deposit'] == 1).sum() / total_occurrences
    false_ratio = (season_occurrences['deposit'] == 0).sum() / total_occurrences
    # Gini impurity of this season, weighted by the season's share of the rows
    gini = (1 - (true_ratio ** 2) - (false_ratio ** 2)) * total_occurrences / train_total
    seasons_ginis.append(gini)

gini_for_seasons = np.sum(seasons_ginis)
print(f"Seasons ginis: {gini_for_seasons}")

### Question 13: Below are two new customers that will be reached by the bank's campaign. Based on the decision tree, determine for each of them whether they will deposit money or not (provide a verbal and detailed answer): ● A customer that is older than 65, will be reached by cellular phone, at the end of a week during the Winter or Spring. ● A customer that is older than 25, will be reached by telephone, at the beginning of a week during the Summer or Autumn.

a. We traverse the tree from the root: because the customer is reached during the Winter or Spring, we go from the root to the left subtree. Then, because the customer is older than 65, we go to the right and reach a leaf whose prediction is 'Undeposit'.
b. We traverse the tree from the root: because the customer is reached during the Autumn, we go from the root to the right subtree. Then, because the customer is reached at the beginning of the week, we go to the left. Then, because the customer is 80 years old, we go to the right twice and reach a leaf whose prediction is 'Undeposit'.

### TASK 5: Evaluate The Model

### Question 14: Use the Accuracy measure to evaluate the model you have created in Task 4.


In [None]:
# Define a function that calculates the accuracy
def get_accuracy(test, pred):
    """Return ``metrics.accuracy_score`` of the predictions *pred* against the labels *test*."""
    return metrics.accuracy_score(test, pred)

# Score the test-set predictions against the test labels
dt_clf_accuracy = get_accuracy(y_test,dt_y_pred)

# Show the Accuracy value
dt_clf_accuracy

### Question 15: Describe, in the context of the model you have built, the meaning of the numerical result you have received for the Accuracy measure (provide a verbal answer using terms such as classification matrix, True Positive, etc.)

The accuracy score represents the proportion of correct predictions made by the model. It measures how well the model performs in classifying instances correctly. A high accuracy score means that the model has made a large number of correct predictions relative to the total number of instances. It is calculated using a classification matrix that includes true positives, true negatives, false positives, and false negatives. However, it's important to consider other evaluation metrics, such as precision, recall, and F1 score, for a more comprehensive understanding of the model's performance.

### Question 16: Describe one prominent disadvantage of the Accuracy measure you have calculated.

One prominent disadvantage of the accuracy measure is that it can be misleading when dealing with imbalanced datasets. An imbalanced dataset is one where the number of instances in different classes is significantly unequal. In such cases, the accuracy score may be high even if the model is performing poorly in predicting the minority class.

For example, consider a dataset with 95% instances belonging to Class A and only 5% instances belonging to Class B. A model that simply predicts all instances as Class A would achieve an accuracy of 95%. However, this high accuracy does not reflect the model's ability to correctly identify instances of Class B, which may be of greater interest or importance.

### Question 17: Explain how measures of Sensitivity (TPR) and Specificity (TNR) can be used practically and wisely in the context of the bank's marketing campaign (provide a verbal answer, no need to calculate the measures).

In the context of a bank's marketing campaign, sensitivity (TPR) measures the effectiveness of targeting the desired customers who are likely to respond positively. It indicates the campaign's ability to reach and engage the target audience, maximizing the chances of attracting potential customers. Specificity (TNR) measures the efficiency of avoiding customers who are less likely to respond positively, minimizing wasteful expenditures and maintaining a positive customer experience. By considering both sensitivity and specificity, the bank can strike a balance between maximizing reach and targeting the right audience, leading to a successful marketing campaign.




