In [None]:
# Importing libraries

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, roc_curve, auc
from sklearn.svm import SVC

# NOTE(review): this silences every warning for the whole session, including
# scikit-learn convergence/deprecation warnings - consider narrowing the filter.
warnings.filterwarnings('ignore')

# Setting the maximum number of displayed columns to 'None' (unlimited)
pd.set_option("display.max_columns", None)

# Setting the maximum number of displayed rows to 'None' (unlimited).
# The full option name is used: abbreviated keys only work while they stay
# unambiguous and can break when pandas adds similarly named options.
pd.set_option("display.max_rows", None)

In [None]:
# Reading the clean_openFDA CSV file (path is relative to the notebook's working directory)
df_openFDA = pd.read_csv('clean_openFDA.csv')

# Preview the first rows of the loaded frame
df_openFDA.head()

In [None]:
# Checking dataframe structure: column names, non-null counts and dtypes
df_openFDA.info()

In [None]:
# Checking number of rows and columns; .shape is a (rows, columns) tuple
df_openFDA.shape

In [None]:
# Dropping "Unnamed: 0", "state" and the three cleaned free-text columns
columns_to_drop = [
    'Unnamed: 0',
    'state',
    'cleaned_reason_for_recall',
    'cleaned_product_description',
    'cleaned_action',
]
df_openFDA = df_openFDA.drop(columns=columns_to_drop)

# Checking dataframe again
df_openFDA.head()

## Data Preparation

- Object to numerical columns
- OneHotEncoding & labelEncoding

In [None]:
# Re-check the remaining columns and their dtypes before encoding
df_openFDA.info()

In [None]:
# Device-class values whose rows are excluded from the dataset
values_to_remove = ['U', 'N', 'f']

# Boolean mask of the rows to keep, then select them
keep_mask = ~df_openFDA['openfda.device_class'].isin(values_to_remove)
df_openFDA = df_openFDA.loc[keep_mask]

In [None]:
# Number of rows per remaining device class (target distribution after filtering)
df_openFDA['openfda.device_class'].value_counts()

In [None]:
# Preview the filtered frame
df_openFDA.head()

In [None]:
# Let's decide and perform OneHotEncoding on columns.
# Every source column gets its OWN prefix so that the dummy columns stay uniquely
# named after concatenation (e.g. year posted vs. year terminated).

RCD_dum = pd.get_dummies(df_openFDA['root_cause_description'], prefix='RCD', dtype=int)
Distribution_dum = pd.get_dummies(df_openFDA['distribution'], prefix='distribution', dtype=int)
EMP_dum = pd.get_dummies(df_openFDA['event_month_posted'], prefix='EMP', dtype=int)
EYP_dum = pd.get_dummies(df_openFDA['event_year_posted'], prefix='EYP', dtype=int)

DOW_dum = pd.get_dummies(df_openFDA['day_of_week_posted'], prefix='DOWP', dtype=int)
EMT_dum = pd.get_dummies(df_openFDA['event_month_terminated'], prefix='EMT', dtype=int)
EYT_dum = pd.get_dummies(df_openFDA['event_year_terminated'], prefix='EYT', dtype=int)
DOWT_dum = pd.get_dummies(df_openFDA['day_of_week_terminated'], prefix='DOWT', dtype=int)


# Append all indicator columns to the frame; the source columns are still present here
df_openFDA = pd.concat([df_openFDA,
                        RCD_dum,
                        Distribution_dum,
                        EMP_dum,
                        EYP_dum,
                        DOW_dum,
                        EMT_dum,
                        EYT_dum,
                        DOWT_dum], axis=1)

In [None]:
# Preview the frame with the one-hot columns appended
df_openFDA.head()

In [None]:
# Source columns that have been one-hot encoded and are no longer needed
encoded_source_columns = [
    'root_cause_description',
    'event_month_posted',
    'event_year_posted',
    'day_of_week_posted',
    'event_month_terminated',
    'event_year_terminated',
    'day_of_week_terminated',
    'distribution',
]
df_openFDA = df_openFDA.drop(columns=encoded_source_columns)

df_openFDA.head(20)

## Naive Bayes classifier

In [None]:
# Target is the device class; every other column is used as a feature
target_column = 'openfda.device_class'
y = df_openFDA[target_column]
X = df_openFDA.drop(columns=[target_column])

# Hold out 20% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Report the shape of each split, in the order train features/labels, test features/labels
for split_name, split_data in (('X_train', X_train),
                               ('y_train', y_train),
                               ('X_test', X_test),
                               ('y_test', y_test)):
    print(f'{split_name} shape {split_data.shape}')

In [None]:
# First 10 training feature rows
X_train.head(10)

In [None]:
# First 10 test feature rows
X_test.head(10)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-feature min/max on the training split only, then apply the same
# scaling to both splits. From here on X_train / X_test are NumPy arrays.
scaler = MinMaxScaler().fit(X_train)

X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
from sklearn.naive_bayes import MultinomialNB

from sklearn.utils.class_weight import compute_sample_weight

# One weight per training row, 'balanced' so that each class contributes equally
weights = compute_sample_weight(class_weight='balanced', y=y_train)

# Multinomial Naive Bayes with default hyper-parameters
multi_NB = MultinomialNB()

# Fit with the per-row weights; the fitted estimator is the cell output
multi_NB.fit(
    X_train,
    y_train,
    sample_weight=weights,
)

In [None]:
# Naive Bayes predictions for the test split
y_pred = multi_NB.predict(X_test)

y_pred

In [None]:
from sklearn.metrics import accuracy_score

# Share of test rows whose predicted class equals the true class
nb_test_accuracy = accuracy_score(y_test, y_pred)
print(f'Model accuracy score: {nb_test_accuracy:0.4f}')

In [None]:
# Naive Bayes predictions for the training split
y_pred_train = multi_NB.predict(X_train)

y_pred_train

In [None]:
import numpy as np

# Distinct predicted classes on the training split and how often each occurs
unique_values, counts = np.unique(y_pred_train, return_counts=True)

# Map each predicted class to its count
value_counts = {value: count for value, count in zip(unique_values, counts)}

# Print the value counts
for value, count in value_counts.items():
    print(f'Value: {value}, Count: {count}')

In [None]:
# Count the occurrences of each class
class_counts = df_openFDA['openfda.device_class'].value_counts()

# Extract class labels and their respective counts
class_labels = class_counts.index
counts = class_counts.values

# One colour per bar (you can customize the colors)
colors = ['skyblue', 'salmon', 'lightgreen']

# Bar chart of the class distribution, drawn through the explicit Axes interface
fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(class_labels, counts, color=colors)
ax.set_xlabel('Class Labels')
ax.set_ylabel('Count')
ax.set_title('Class Distribution')
ax.tick_params(axis='x', labelrotation=0)  # keep x-axis labels horizontal
plt.show()


In [None]:
# Mean accuracy of the Naive Bayes model on the training and test splits

nb_train_score = multi_NB.score(X_train, y_train)
print(f'Training set score: {nb_train_score:.4f}')

nb_test_score = multi_NB.score(X_test, y_test)
print(f'Test set score: {nb_test_score:.4f}')

In [None]:
# Confusion matrix of the Naive Bayes test predictions
# (rows: true class, columns: predicted class)

from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)

cm

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for the Naive Bayes test predictions
print(classification_report(y_test, y_pred))

In [None]:
# Import necessary libraries
import numpy as np
from sklearn.metrics import confusion_matrix

# Define the number of classes (replace this with your number of classes)
num_classes = 3

# Calculate the confusion matrix (rows: true class, columns: predicted class).
# It is bound to `cm`, NOT to `confusion_matrix`: rebinding the function name to an
# array would make every later confusion_matrix(...) call (and a re-run of this
# cell) fail with "'numpy.ndarray' object is not callable".
cm = confusion_matrix(y_test, y_pred)

# One-vs-rest counts for every class at once
tp_per_class = np.diag(cm)                      # predicted k and truly k
fp_per_class = cm.sum(axis=0) - tp_per_class    # predicted k but truly another class
fn_per_class = cm.sum(axis=1) - tp_per_class    # truly k but predicted another class
tn_per_class = cm.sum() - (tp_per_class + fp_per_class + fn_per_class)

# Dictionaries holding TP, TN, FP, FN for each class (in confusion-matrix order)
class_1, class_2, class_3 = (
    {
        "TP": tp_per_class[class_idx],
        "TN": tn_per_class[class_idx],
        "FP": fp_per_class[class_idx],
        "FN": fn_per_class[class_idx],
    }
    for class_idx in range(num_classes)
)

# Now, you have dictionaries class_1, class_2, and class_3 with key-value pairs
print("Class 1:")
print(class_1)

print("Class 2:")
print(class_2)

print("Class 3:")
print(class_3)


In [None]:
# TP + TN + FP + FN of a single class already covers every test sample once,
# so each of these three sums equals the total number of test samples.
class_1_cm = class_1['TP'] + class_1['TN'] + class_1['FP'] + class_1['FN']
class_2_cm = class_2['TP'] + class_2['TN'] + class_2['FP'] + class_2['FN']
class_3_cm = class_3['TP'] + class_3['TN'] + class_3['FP'] + class_3['FN']

# Calculate classification accuracy: correctly classified samples / all samples.
# The denominator is the sample count ONCE; adding the three per-class totals
# would divide by three times the sample count.
total_samples = class_1_cm
accuracy = (class_1['TP'] + class_2['TP'] + class_3['TP']) / total_samples

# Calculate classification error
error = 1 - accuracy

# Calculate precision, recall, true positive rate, false positive rate, and specificity for each class
def calculate_metrics(class_dict):
    """Derive one-vs-rest metrics from a class's confusion counts.

    Parameters
    ----------
    class_dict : mapping
        Must provide the keys 'TP', 'TN', 'FP' and 'FN'.

    Returns
    -------
    tuple
        (precision, recall, true_positive_rate, false_positive_rate, specificity).
        A ratio whose denominator is zero (e.g. precision of a class that was
        never predicted) is reported as 0.0 instead of raising or yielding NaN.
    """
    def _ratio(numerator, denominator):
        # Guard the undefined case so one empty class does not break the report
        return numerator / denominator if denominator else 0.0

    tp, tn, fp, fn = (class_dict[key] for key in ('TP', 'TN', 'FP', 'FN'))

    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)
    true_positive_rate = recall  # TPR is recall by definition
    false_positive_rate = _ratio(fp, tn + fp)
    specificity = _ratio(tn, tn + fp)
    return precision, recall, true_positive_rate, false_positive_rate, specificity

# Metric tuples per class: (precision, recall, TPR, FPR, specificity)
class_1_metrics, class_2_metrics, class_3_metrics = (
    calculate_metrics(class_counts_dict)
    for class_counts_dict in (class_1, class_2, class_3)
)

# Print the calculated metrics
print("Classification Accuracy:", accuracy)
print()
print("Classification Error:", error)

# One report per class, each preceded by a blank line
for class_number, class_metrics in enumerate(
        (class_1_metrics, class_2_metrics, class_3_metrics), start=1):
    precision, recall, tpr, fpr, specificity = class_metrics
    print()
    print(f"Class {class_number} Metrics \n Precision = {precision} \n Recall = {recall} \n True Positive Rate = {tpr} \n False Positive Rate = {fpr} \n Specificity = {specificity} \n")


## Decision Tree Classifier

In [None]:
# Rebuild features/target from the encoded frame, so the tree sees unscaled data
target_column = 'openfda.device_class'
y = df_openFDA[target_column]
X = df_openFDA.drop(columns=[target_column])

# Same 80/20 split and seed as used for the Naive Bayes model
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Report the shape of each split, in the order train features/labels, test features/labels
for split_name, split_data in (('X_train', X_train),
                               ('y_train', y_train),
                               ('X_test', X_test),
                               ('y_test', y_test)):
    print(f'{split_name} shape {split_data.shape}')

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Shallow (depth-3) baseline tree. The seed is fixed like for every other tree in
# this notebook, so the fitted tree and its scores are reproducible across runs.
dt = DecisionTreeClassifier(max_depth=3, random_state=42)
dt.fit(X_train, y_train)

In [None]:
#%pip install pydotplus
#%pip install python-graphviz

In [None]:
# Importing required packages for visualization
from IPython.display import Image  
from six import StringIO
from sklearn.tree import export_graphviz
import pydotplus
import graphviz

In [None]:
# plotting tree with max_depth=3
dot_data = StringIO()

# feature_names is passed as a plain list (some scikit-learn releases reject a
# pandas Index here) and class_names comes from the fitted tree, so the labels
# always match the classes the model actually learned, in its own order.
export_graphviz(dt, out_file=dot_data, filled=True, rounded=False,
                feature_names=list(X.columns),
                class_names=[str(label) for label in dt.classes_])

graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())

In [None]:
# Predictions of the depth-3 tree on both splits
y_train_pred = dt.predict(X_train)
y_test_pred = dt.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Training-set accuracy (printed) and confusion matrix (cell output) of the depth-3 tree
print(accuracy_score(y_train, y_train_pred))
confusion_matrix(y_train, y_train_pred)

In [None]:
# Test-set accuracy (printed) and confusion matrix (cell output) of the depth-3 tree
print(accuracy_score(y_test, y_test_pred))
confusion_matrix(y_test, y_test_pred)

In [None]:
def get_dt_graph(dt_classifier):
    """Build a pydotplus graph for a fitted decision tree.

    Parameters
    ----------
    dt_classifier : fitted tree classifier
        Must expose ``classes_`` and have been trained on the columns of the
        notebook-level feature frame ``X`` (read as a global).

    Returns
    -------
    The graph parsed by ``pydotplus.graph_from_dot_data``; call ``create_png()``
    on it to render an image.
    """
    dot_data = StringIO()

    # feature_names is passed as a plain list (some scikit-learn releases reject a
    # pandas Index) and class_names is taken from the classifier itself, so the
    # labels follow the classes this particular model learned.
    export_graphviz(dt_classifier, out_file=dot_data, filled=True, rounded=False,
                    feature_names=list(X.columns),
                    class_names=[str(label) for label in dt_classifier.classes_])
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    return graph

In [None]:
def evaluate_model(dt_classifier):
    """Print accuracy and confusion matrix of a fitted classifier on both splits.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
    The training report is printed first, then a separator line, then the test
    report. Nothing is returned.
    """
    # Predict both splits up front, before anything is printed
    train_predictions, test_predictions = (
        dt_classifier.predict(features) for features in (X_train, X_test)
    )

    reports = (
        ("Train Set Performance", y_train, train_predictions),
        ("Test Set Performance", y_test, test_predictions),
    )
    for position, (heading, true_labels, predicted_labels) in enumerate(reports):
        if position > 0:
            print("--"*25)  # separator between the two reports
        print(heading)
        print(accuracy_score(true_labels, predicted_labels))
        print(confusion_matrix(true_labels, predicted_labels))

In [None]:
# Render the depth-3 tree through the get_dt_graph helper
gph = get_dt_graph(dt)
Image(gph.create_png())

In [None]:
# Variant 1: limit the tree depth to 4 levels
dt_depth = DecisionTreeClassifier(max_depth = 4, random_state = 42)
dt_depth.fit(X_train, y_train)

In [None]:
# Render the depth-4 tree
gph = get_dt_graph(dt_depth)
Image(gph.create_png())

In [None]:
# Train/test accuracy and confusion matrices of the depth-4 tree
evaluate_model(dt_depth)

In [None]:
# Variant 2: unlimited depth, but a node needs at least 10 samples to be split
dt_min_split = DecisionTreeClassifier(min_samples_split =10, random_state = 42)
dt_min_split.fit(X_train, y_train)

In [None]:
# Render the min_samples_split tree; the image is displayed at 1000x800
gph = get_dt_graph(dt_min_split)
Image(gph.create_png(), width=1000, height=800)

In [None]:
# Train/test accuracy and confusion matrices of the min_samples_split tree
evaluate_model(dt_min_split)

In [None]:
# Variant 3: every leaf must contain at least 5 samples
dt_min_leaf = DecisionTreeClassifier(min_samples_leaf = 5, random_state = 42)
dt_min_leaf.fit(X_train, y_train)

In [None]:
# Render the min_samples_leaf tree; the image is displayed at 1000x1000
gph = get_dt_graph(dt_min_leaf)
Image(gph.create_png(), width=1000, height=1000)

In [None]:
# Train/test accuracy and confusion matrices of the min_samples_leaf tree
evaluate_model(dt_min_leaf)

In [None]:
# Variant 4: at least 20 samples per leaf, splits chosen by entropy instead of the default criterion
dt_min_leaf_entropy = DecisionTreeClassifier(min_samples_leaf = 20, random_state = 42, criterion = "entropy")
dt_min_leaf_entropy.fit(X_train, y_train)

In [None]:
# Render the entropy-based tree
gph = get_dt_graph(dt_min_leaf_entropy)
Image(gph.create_png())

In [None]:
# Train/test accuracy and confusion matrices of the entropy-based tree
evaluate_model(dt_min_leaf_entropy)

In [None]:
# Unfitted base estimator for the grid search below (rebinds `dt`, replacing the depth-3 tree)
dt = DecisionTreeClassifier(random_state = 42)

In [None]:
# Hyper-parameter grid: 3 depths x 4 leaf sizes x 2 criteria = 24 candidate trees
params = {
    "max_depth" : [2,3,5],
    "min_samples_leaf" : [5,8,12,15],
    "criterion" : ['gini', 'entropy']    
}

In [None]:
from sklearn.model_selection import GridSearchCV
# Exhaustive search over `params` with 4-fold cross-validation, scored by accuracy;
# n_jobs = -1 uses all available cores, verbose = 1 prints a progress summary
grid_search = GridSearchCV(estimator = dt,
            param_grid = params,
            cv = 4,
            n_jobs = -1,
            verbose = 1,
            scoring = 'accuracy')

In [None]:
%%time
# Run the search on the training split only; %%time (must stay on the first line) reports the cost
grid_search.fit(X_train, y_train)

In [None]:
# Cross-validation results as a frame, one row per parameter combination
cv_df = pd.DataFrame(grid_search.cv_results_)
cv_df.head()

In [None]:
# (number of parameter combinations, number of result columns)
cv_df.shape

In [None]:
# The five parameter combinations with the highest mean cross-validated score
cv_df.nlargest(5, 'mean_test_score')

In [None]:
# Mean cross-validated accuracy of the best parameter combination
grid_search.best_score_

In [None]:
# Estimator refit with the best parameter combination
grid_search.best_estimator_

In [None]:
# Keep the tuned tree under its own name for the evaluation below
dt_best = grid_search.best_estimator_

In [None]:
# Train/test accuracy and confusion matrices of the tuned tree
evaluate_model(dt_best)

In [None]:
# Render the tuned tree
gph=get_dt_graph(dt_best)
Image(gph.create_png())

In [None]:
# Predictions of the tuned tree on both splits (rebinds y_train_pred / y_test_pred)
y_train_pred = dt_best.predict(X_train)
y_test_pred = dt_best.predict(X_test)

In [None]:
# Accuracy and confusion matrix of the tuned tree on each split,
# computed from the predictions stored in y_train_pred / y_test_pred
print("Train Set Performance")
print(accuracy_score(y_train, y_train_pred))
print(confusion_matrix(y_train, y_train_pred))
print("--"*25)
print("Test Set Performance")
print(accuracy_score(y_test, y_test_pred))
print(confusion_matrix(y_test, y_test_pred))

In [None]:
# Confusion matrix of the tuned decision tree's test predictions
# (rows: true class, columns: predicted class)

from sklearn.metrics import confusion_matrix

# y_test_pred holds the decision-tree predictions; y_pred still contains the
# predictions of the model trained earlier in the notebook.
cm = confusion_matrix(y_test, y_test_pred)

cm

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 of the tuned decision tree. Uses y_test_pred
# (the tree's predictions), not y_pred, which belongs to the earlier model.
print(classification_report(y_test, y_test_pred))

In [None]:
# Import necessary libraries
import numpy as np
from sklearn.metrics import confusion_matrix

# Define the number of classes (replace this with your number of classes)
num_classes = 3

# Calculate the confusion matrix of the tuned decision tree (rows: true class,
# columns: predicted class). Two deliberate choices:
#  * y_test_pred is used - y_pred still holds the earlier model's predictions;
#  * the result is bound to `cm`, NOT to `confusion_matrix`: rebinding the function
#    name to an array makes every later confusion_matrix(...) call fail with
#    "'numpy.ndarray' object is not callable".
cm = confusion_matrix(y_test, y_test_pred)

# One-vs-rest counts for every class at once
tp_per_class = np.diag(cm)                      # predicted k and truly k
fp_per_class = cm.sum(axis=0) - tp_per_class    # predicted k but truly another class
fn_per_class = cm.sum(axis=1) - tp_per_class    # truly k but predicted another class
tn_per_class = cm.sum() - (tp_per_class + fp_per_class + fn_per_class)

# Dictionaries holding TP, TN, FP, FN for each class (in confusion-matrix order)
class_1, class_2, class_3 = (
    {
        "TP": tp_per_class[class_idx],
        "TN": tn_per_class[class_idx],
        "FP": fp_per_class[class_idx],
        "FN": fn_per_class[class_idx],
    }
    for class_idx in range(num_classes)
)

# Now, you have dictionaries class_1, class_2, and class_3 with key-value pairs
print("Class 1:")
print(class_1)

print("Class 2:")
print(class_2)

print("Class 3:")
print(class_3)


In [None]:
# Mean accuracy of the tuned tree on the training and test splits
print('Training set score: {:.4f}'.format(dt_best.score(X_train, y_train)))

print('Test set score: {:.4f}'.format(dt_best.score(X_test, y_test)))

# Calculate classification accuracy: correctly classified samples / all samples.
# TP + TN + FP + FN of a single class already covers every test sample once, so
# it is used as the denominator; summing it over all three classes would divide
# by three times the sample count.
total_samples = class_1['TP'] + class_1['TN'] + class_1['FP'] + class_1['FN']
accuracy = (class_1['TP'] + class_2['TP'] + class_3['TP']) / total_samples

# Calculate classification error
error = 1 - accuracy

# Calculate precision, recall, true positive rate, false positive rate, and specificity for each class
def calculate_metrics(class_dict):
    """Derive one-vs-rest metrics from a class's confusion counts.

    Parameters
    ----------
    class_dict : mapping
        Must provide the keys 'TP', 'TN', 'FP' and 'FN'.

    Returns
    -------
    tuple
        (precision, recall, true_positive_rate, false_positive_rate, specificity).
        A ratio whose denominator is zero (e.g. precision of a class that was
        never predicted) is reported as 0.0 instead of raising or yielding NaN.
    """
    def _ratio(numerator, denominator):
        # Guard the undefined case so one empty class does not break the report
        return numerator / denominator if denominator else 0.0

    tp, tn, fp, fn = (class_dict[key] for key in ('TP', 'TN', 'FP', 'FN'))

    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)
    true_positive_rate = recall  # TPR is recall by definition
    false_positive_rate = _ratio(fp, tn + fp)
    specificity = _ratio(tn, tn + fp)
    return precision, recall, true_positive_rate, false_positive_rate, specificity

# Metric tuples per class: (precision, recall, TPR, FPR, specificity)
class_1_metrics, class_2_metrics, class_3_metrics = (
    calculate_metrics(class_counts_dict)
    for class_counts_dict in (class_1, class_2, class_3)
)

# Print the calculated metrics
print("Classification Accuracy:", accuracy)
print()
print("Classification Error:", error)

# One report per class, each preceded by a blank line
for class_number, class_metrics in enumerate(
        (class_1_metrics, class_2_metrics, class_3_metrics), start=1):
    precision, recall, tpr, fpr, specificity = class_metrics
    print()
    print(f"Class {class_number} Metrics \n Precision = {precision} \n Recall = {recall} \n True Positive Rate = {tpr} \n False Positive Rate = {fpr} \n Specificity = {specificity} \n")


## Support Vector Machines

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# 2D point
point_2d = np.array([2, 3])

# Polynomial kernel parameters (kept for reference; the explicit expansion below
# is written out by hand and does not read them)
r = 1
d = 2

# Degree-2 expansion of (x1, x2): constant, linear, squared and cross terms
x1, x2 = point_2d
point_transformed = np.array([1, x1, x2, x1**2, x1*x2, x2**2])

# Side-by-side figure: original point on the left, transformed point on the right
fig = plt.figure(figsize=(10, 5))

# Original 2D point
ax1 = fig.add_subplot(1, 2, 1)
ax1.scatter(x1, x2, c='blue', marker='o', label='Original 2D Point')
ax1.set(title='Original 2D Point', xlabel='X1', ylabel='X2')
ax1.legend()

# Transformed 6D point - only three of its six coordinates can be drawn:
# x1 (index 1), x1*x2 (index 4) and x2 (index 2)
ax2 = fig.add_subplot(1, 2, 2, projection='3d')
ax2.scatter(point_transformed[1], point_transformed[4], point_transformed[2],
            c='red', marker='o', label='Transformed 6D Point')
ax2.set(title='Transformed 6D Point', xlabel='X1', ylabel='X1 * X2', zlabel='X2')
ax2.legend()

plt.tight_layout()
plt.show()


In [None]:
# Use a random subset of at most 1000 rows (adjust if needed); the size is capped
# at the number of available rows so sampling cannot fail on a smaller frame
subset_size = 1000
data_subset = df_openFDA.sample(n=min(subset_size, len(df_openFDA)), random_state=42)

# Extract features and labels from the subset
X_subset = data_subset.drop('openfda.device_class', axis=1)
y_subset = data_subset['openfda.device_class']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_subset, y_subset, test_size=0.2, random_state=42)

# Standardize features (important for SVM); statistics come from the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# SVM modeling with different kernels and costs
kernels = ['linear', 'poly', 'rbf']
costs = [0.1, 1, 10]

# Fixed label order taken from the whole subset, so every confusion matrix has the
# same rows/columns even when a class is missing from the test split
classes = sorted(y_subset.unique())

# Accuracy of every (kernel, cost) model, collected during the single training
# pass so that nothing has to be retrained for the summary below
accuracies = {}

# Create subplots for confusion matrices
fig, axes = plt.subplots(nrows=len(kernels), ncols=len(costs), figsize=(15, 12))

for i, kernel in enumerate(kernels):
    for j, cost in enumerate(costs):
        # random_state only seeds the internal probability calibration;
        # predict() itself does not depend on it
        model = SVC(kernel=kernel, C=cost, probability=True, random_state=42)
        model.fit(X_train_scaled, y_train)

        # Predictions
        y_pred = model.predict(X_test_scaled)

        # One confusion matrix, used for both the printout and the heatmap
        cm = confusion_matrix(y_test, y_pred, labels=classes)
        accuracies[(kernel, cost)] = accuracy_score(y_test, y_pred)

        # Evaluate and print results
        print(f"Kernel: {kernel}, Cost: {cost}")
        print("Confusion Matrix:\n", cm)
        print("Accuracy:", accuracies[(kernel, cost)])

        # Plot confusion matrix heatmap
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=classes, yticklabels=classes, ax=axes[i, j])
        axes[i, j].set_title(f"Kernel: {kernel}, Cost: {cost}")
        axes[i, j].set_xlabel("Predicted Label")
        axes[i, j].set_ylabel("True Label")

# Lay out the grid once, after all panels are drawn
fig.tight_layout()
plt.show()

# Print accuracies for all combinations (same order as the training loop)
print("\nAccuracies:")
for (kernel, cost), model_accuracy in accuracies.items():
    print(f"Kernel: {kernel}, Cost: {cost}, Accuracy: {model_accuracy}")
