In [6]:
# Decision Tree Classification
# Credit: https://medium.com/@shuv.sdr/decision-tree-classification-in-python-b1e59205949c  
# Dataset: https://www.kaggle.com/datasets/rakeshrau/social-network-ads


## **Step 1: Load the Dataset and Libraries**

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import ListedColormap
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import confusion_matrix
#from google.colab import files
#files.upload()

df_net = pd.read_csv('https://www.kaggle.com/datasets/rakeshrau/social-network-ads')
#df_net = pd.read_csv('/content/Social_Network_Ads.csv')
df_net.head()

## Step 2: Exploratory Data Analysis (EDA)
#Exploring data, preprocessing, and visualization




ParserError: Error tokenizing data. C error: Expected 1 fields in line 9, saw 2


Step 2: Exploratory Data Analysis (EDA)
Exploring data, preprocessing, and visualization

In [None]:
# User ID's are useless, so we can drop them
df_net.drop(columns = ['User ID'], inplace=True)
df_net.head()


In [None]:
# Describe data
df_net.describe()

In [None]:
# Salary distribution
sns.histplot(df_net['EstimatedSalary'])


In [None]:
# Label encoding
le = LabelEncoder()
df_net['Gender']= le.fit_transform(df_net['Gender'])


In [None]:
# Correlation matrix
df_net.corr()
sns.heatmap(df_net.corr(), annot=True)


In [None]:
#We see that gender is not correlated to the other attributes, so let's drop it

# drop gender column, as it was found to be not correlated
df_net.drop(columns=['Gender'], inplace=True)
df_net.head()

In [None]:
## Step 3: Split the data for our model

# Split data into dependent/independent variables
# Age and EstimatedSalary are independent variables
# Purchased is the dependent variable
X = df_net.iloc[:, :-1].values # [rows, cols]
y = df_net.iloc[:, -1].values

In [None]:
# Split data into test/train set
'''ORDER MATTERS!!!!'''
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 1) # 25% of data is test, 75% is training



In [None]:
## Step 4: Feature Scaling
Normalizing the range of the features to improve the performance of our ML model
*   Normalization scales the values in the range from 0 to 1
*   Standardization transforms the values so their mean is 0 and their standard deviation is 1

In [None]:


# transform data
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)


In [None]:
## Step 5: Train model

**We will be using a classifier model since our data is not continuous and our outcomes are "yes" or "no"**

classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

In [None]:
# Visualize prediction results on training set
X_set, y_set = sc.inverse_transform(X_train), y_train
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 10, stop = X_set[:, 0].max() + 10, step = 1),
                     np.arange(start = X_set[:, 1].min() - 1000, stop = X_set[:, 1].max() + 1000, step = 1))
plt.contourf(X1, X2, classifier.predict(sc.transform(np.array([X1.ravel(), X2.ravel()]).T)).reshape(X1.shape),
             alpha = 0.75, cmap = ListedColormap(['red', 'green']))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1], c = ListedColormap(['red', 'green'])(i), label = j)
plt.title('Training set')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()

In [None]:
# Visualize prediction results on test set
X_set, y_set = sc.inverse_transform(X_test), y_test
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 10, stop = X_set[:, 0].max() + 10, step = 1),
                     np.arange(start = X_set[:, 1].min() - 1000, stop = X_set[:, 1].max() + 1000, step = 1))
plt.contourf(X1, X2, classifier.predict(sc.transform(np.array([X1.ravel(), X2.ravel()]).T)).reshape(X1.shape),
             alpha = 0.75, cmap = ListedColormap(['red', 'green']))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1], c = ListedColormap(['red', 'green'])(i), label = j)
plt.title('Test set')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()

In [None]:
## Step 7: Evaluating the Model

# Accuracy on our test set
accuracy_score(y_test, y_pred)
(y_test == y_pred).mean()

In [None]:
Accuracy is useful but it is important to look at other metrics to better assess! Here are 2 other metrics:


1.   Confusion Matrix
2.   Precision-Recall curve

Confusion Matrix summarizes the performance of our classifer (# of correct and incorrect predictions). Desired results: high TP and TN rates, low FP and FN rates

Precision-Recall curves use two metrics, precision and recall.  
Precision measures the accuracy of positive predictions.   
Formula: $\frac{\text{number of true positives}}{\text{sum of true positive and false positive predictions}}$  
 Recall measures the ability of the model to find all positives instances.  
Formula: $\frac{\text{number of true positive predictions}}{\text{sum of true positive and false negative predictions}}$ 

In [None]:
# Confusion matrix
cf_matrix = confusion_matrix(y_test, y_pred)
sns.heatmap(cf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False)


In [None]:
# Plot Precision-Recall Curve
y_pred_proba = classifier.predict_proba(X_test)[:,1]
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)

fig, ax = plt.subplots(figsize=(6,6))
ax.plot(recall, precision, label='Decision Tree Classification', color = 'firebrick')
ax.set_title('Precision-Recall Curve')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
plt.box(False)
ax.legend();