In [None]:
# Importing the libraries
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.svm import LinearSVC

In [None]:
# Location of the creditcard.csv dataset used throughout this notebook
url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ML0101EN-SkillsNetwork/labs/Module%203/data/creditcard.csv"

# Read the CSV straight from the URL and preview the first five rows
raw_data = pd.read_csv(filepath_or_buffer=url)
raw_data.head(n=5)

In [None]:
# Count the rows of each target class once, so that every label is taken from
# the same Series as its count. Series.unique() returns values in order of
# appearance while value_counts() sorts by frequency, so building labels and
# sizes from the two separately can attach a label to the wrong slice.
class_counts = raw_data.Class.value_counts()

# Labels of the target classes, in the same order as the counts below
labels = class_counts.index.to_numpy()

# Number of rows in each target class
size = class_counts.values

# Plot the class distribution as a pie chart
fig, ax = plt.subplots()

ax.pie(size, labels=labels, autopct='%1.1f%%')
ax.set_title("Value of the Target classes")
plt.show()

In [None]:
# Correlation of every column with the target, with the target's own entry removed
correlation_values = (
    raw_data
    .corr()
    .loc[:, 'Class']
    .drop(labels='Class')
)

# Horizontal bar chart of the per-feature correlations
correlation_values.plot.barh(figsize=(10, 6))

In [None]:
# Standardize the feature columns (positions 1-29) for better model performance.
# The scaling is applied to a copy so that `raw_data` keeps the values that were
# loaded and displayed earlier, and re-running this cell always starts from the
# same input.
scaled_data = raw_data.copy()
scaled_data.iloc[:, 1:30] = StandardScaler().fit_transform(raw_data.iloc[:, 1:30])
data_matrix = scaled_data.values

# NOTE(review): the scaler is fitted on all rows before the train/test split
# that happens later in the notebook, so test-set statistics influence the
# training features.

# X: features, we remove Time (column 0) from the features
X = data_matrix[:, 1:30]

# y: target class (column 30)
y = data_matrix[:, 30]

# Normalize each sample (row) to unit L1 norm
X = normalize(X, norm='l1')

In [None]:
# Hold out 30% of the samples as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Per-sample weights that compensate for the imbalance in the training target
w_train = compute_sample_weight(class_weight='balanced', y=y_train)

In [None]:
# Decision tree limited to depth 4, with a fixed seed
dt = DecisionTreeClassifier(
    max_depth=4,
    random_state=35,
)

# Fit on the training split, applying the balancing sample weights
dt.fit(X=X_train, y=y_train, sample_weight=w_train)

##### Building an SVM model

In [None]:
# Linear SVM with hinge loss, no intercept term, and 'balanced' class weights
svm = LinearSVC(
    class_weight='balanced',
    random_state=31,
    loss='hinge',
    fit_intercept=False,
)

# Fit the SVM on the training split
svm.fit(X=X_train, y=y_train)

In [None]:
# Predicted class probabilities for the test samples; column 1 is kept as the
# score for the fraudulent class
y_pred_dt = dt.predict_proba(X=X_test)[:, 1]

Using these probabilities, we can compute the Area Under the Receiver Operating Characteristic Curve (ROC-AUC) score as a metric of model performance.
The ROC-AUC score measures the model's ability to distinguish the positive and negative classes across all possible probability thresholds. The higher the score, the better the model separates the two classes.


In [None]:
# ROC-AUC of the decision tree on the test split
roc_auc_dt = roc_auc_score(y_true=y_test, y_score=y_pred_dt)
print(f"Decision Tree ROC-AUC Score : {roc_auc_dt:.3f}")

### Evaluating the Support Vector Machine

In [None]:
# Decision scores of the SVM for the test samples. decision_function returns
# signed confidence scores rather than probabilities; larger values favour the
# positive class.
y_pred_svm = svm.decision_function(X=X_test)

In [None]:
# ROC-AUC of the SVM on the test split, computed from its decision scores
roc_auc_svm = roc_auc_score(y_true=y_test, y_score=y_pred_svm)
print(f"SVM ROC-AUC score: {roc_auc_svm:.3f}")

# TOP 6 features in correlation with the target variable


In [None]:
# Rank the features by the magnitude of their correlation with the target
# (the absolute value discards the sign) and keep the six strongest
correlation_values = (
    raw_data.corr()['Class']
    .drop('Class')
    .abs()
    .sort_values(ascending=False)
    .head(6)
)
correlation_values

In [None]:
# Column positions in data_matrix of the six selected features
# NOTE(review): presumably these are the top-6 features listed by the previous
# cell; verify against its output. The columns come straight from data_matrix,
# so unlike the first experiment they are not L1-normalized.
top_feature_columns = [3, 10, 12, 14, 16, 17]
X = data_matrix[:, top_feature_columns]


In [None]:
# Split the six-feature data into training and test sets (30% test, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Per-sample weights that compensate for the imbalance in the training target
w_train = compute_sample_weight(class_weight='balanced', y=y_train)

In [None]:
# Depth-4 decision tree fitted on the six-feature training split with balancing weights
dt = DecisionTreeClassifier(
    max_depth=4,
    random_state=35,
)
dt.fit(X=X_train, y=y_train, sample_weight=w_train)

In [None]:
# Predicted class probabilities for the test samples; column 1 is kept as the score
y_predict_ = dt.predict_proba(X=X_test)[:, 1]

In [None]:
# Score the six-feature decision tree on the held-out test split using ROC-AUC.
# The printed message previously misspelled "Decision" and wrote "ROC_AUC",
# unlike the other score messages in this notebook.
roc_auc_dt = roc_auc_score(y_test, y_predict_)
print("Decision Tree ROC-AUC score is {0:.3f}".format(roc_auc_dt))

The same evaluation for the SVM, using the top 6 features:


In [None]:
# Linear SVM (hinge loss, no intercept, 'balanced' class weights) on the six-feature training split
svms = LinearSVC(
    class_weight='balanced',
    random_state=35,
    loss='hinge',
    fit_intercept=False,
)
svms.fit(X=X_train, y=y_train)

In [None]:
# Signed decision scores (not probabilities) of the SVM for the test samples
y_predict_svm = svms.decision_function(X=X_test)

In [None]:
# ROC-AUC of the six-feature SVM on the test split
roc_auc_svms = roc_auc_score(y_true=y_test, y_score=y_predict_svm)
print(f"SVM ROC-AUC score: {roc_auc_svms:.3f}")

Insights:  
1. With a larger set of features, the SVM performed better in comparison to the Decision Tree.
2. The Decision Tree benefited from feature selection and performed better.
3. SVMs may require higher feature dimensionality to create an efficient decision hyperplane.