# Classificatiemodellen

In [None]:
import pandas as pd
import pyodbc
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

# Connectie maken met de database

In [None]:
# Server instance and database name used to build the connection string
DB = dict(
    servername='(local)\\SQLEXPRESS',
    database='DEDS_DataWarehouse',
)

In [None]:
# Build the ODBC connection string from the DB settings.
connection_string = (
    f"DRIVER={{SQL Server}};SERVER={DB['servername']};"
    f"DATABASE={DB['database']};Trusted_Connection=yes"
)

# pyodbc.connect() signals failure by raising pyodbc.Error; it never returns a
# falsy object, so an `if export_conn:` test cannot detect a failed connection.
# Report the failure and re-raise so the original error details stay visible.
try:
    export_conn = pyodbc.connect(connection_string)
except pyodbc.Error:
    print("Connection with database is not established")
    raise

# Create a cursor from the connection
export_cursor = export_conn.cursor()

print("Connection with database is established")

# Tabellen inlezen

In [None]:
# Read the Returned_item rows where CURRENT_VALUE = 1
returned_item_query = "SELECT * FROM Returned_item WHERE CURRENT_VALUE = 1"

# Run the query; the column names are taken from the cursor description
returned_item_result = export_cursor.execute(returned_item_query)
returned_item_columns = [meta[0] for meta in returned_item_result.description]
returned_item_fetch = returned_item_result.fetchall()

# Build the DataFrame and drop the CURRENT_VALUE and LAST_UPDATED columns
returned_item = pd.DataFrame.from_records(
    returned_item_fetch,
    columns=returned_item_columns,
).drop(columns=['CURRENT_VALUE', 'LAST_UPDATED'])

returned_item

In [None]:
# Read the Order_details rows where CURRENT_VALUE = 1
order_details_query = "SELECT * FROM Order_details WHERE CURRENT_VALUE = 1"

# Run the query; the column names are taken from the cursor description
order_details_result = export_cursor.execute(order_details_query)
order_details_columns = [meta[0] for meta in order_details_result.description]
order_details_fetch = order_details_result.fetchall()

# Build the DataFrame and drop the CURRENT_VALUE and LAST_UPDATED columns
order_details = pd.DataFrame.from_records(
    order_details_fetch,
    columns=order_details_columns,
).drop(columns=['CURRENT_VALUE', 'LAST_UPDATED'])

order_details

In [None]:
# Read the Unit rows where CURRENT_VALUE = 1
unit_query = "SELECT * FROM Unit WHERE CURRENT_VALUE = 1"

# Run the query; the column names are taken from the cursor description
unit_result = export_cursor.execute(unit_query)
unit_columns = [meta[0] for meta in unit_result.description]
unit_fetch = unit_result.fetchall()

# Build the DataFrame and drop the CURRENT_VALUE and LAST_UPDATED columns
unit = pd.DataFrame.from_records(
    unit_fetch,
    columns=unit_columns,
).drop(columns=['CURRENT_VALUE', 'LAST_UPDATED'])

unit

# Databaseconnectie sluiten

In [None]:
# All tables are loaded: close the cursor first, then the connection
for _db_handle in (export_cursor, export_conn):
    _db_handle.close()

# Data samenvoegen

In [None]:
# Inner join: combine each unit row with the order details that reference it
unit_orders = unit.merge(
    order_details,
    left_on='UNIT_SK',
    right_on='ORDER_DETAILS_UNIT_ID_unit',
)

# Right join: keep every order detail row, with the returned-item columns
# filled in only where a matching return exists
data = returned_item.merge(
    unit_orders,
    how='right',
    left_on='RETURNED_ITEM_ORDER_DETAIL_CODE',
    right_on='ORDER_DETAILS_SK',
)

data

# Data voorbereiden

In [None]:
# Drop every column whose name contains 'SK'
sk_columns = data.columns[data.columns.str.contains('SK', regex=False)]
data.drop(columns=sk_columns, inplace=True)

# Drop the remaining columns that are not used further on
drop_columns = [
    'RETURNED_ITEM_RETURN_REASON_description_en',
    'ORDER_DETAILS_UNIT_ID_unit',
    'RETURNED_ITEM_code',
    'ORDER_DETAILS_code',
    'ORDER_DETAILS_ORDER_NUMBER_order',
    'RETURNED_ITEM_ORDER_DETAIL_CODE',
    'RETURNED_ITEM_DATE',
    'RETURNED_ITEM_QUANTITY',
    'RETURNED_ITEM_RETURNED_ITEMS_TOTAL_PRICE',
    'UNIT_id',
    'ORDER_DETAILS_TOTAL_COST_total',
    'ORDER_DETAILS_TOTAL_MARGIN_margin',
]
data.drop(columns=drop_columns, inplace=True)

data

In [None]:
# Return reason code: missing values are replaced by -1 before the column is
# turned into a categorical
data['RETURNED_ITEM_RETURN_REASON_code'] = (
    data['RETURNED_ITEM_RETURN_REASON_code'].fillna(-1).astype('category')
)

# Cost, price and sale columns become floats
for float_column in ('UNIT_COST_cost', 'UNIT_PRICE_price', 'UNIT_SALE_sale'):
    data[float_column] = data[float_column].astype('float')

# Quantity and product number become integers
for int_column in ('ORDER_DETAILS_QUANTITY_quantity', 'ORDER_DETAILS_PRODUCT_NUMBER_product'):
    data[int_column] = data[int_column].astype('int')

data.dtypes

# Classificatiemodel trainen

In [None]:
# Splitting the data: the return reason code is the target, every other
# remaining column is a feature
X = data.drop(columns=['RETURNED_ITEM_RETURN_REASON_code'])
y = data['RETURNED_ITEM_RETURN_REASON_code']

# Splitting with a fixed seed value, so the train/test partition is the same on
# every run (without random_state the split is reshuffled each time)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Training the model. random_state is fixed because the tree builder permutes
# the features at every split; without a seed, equally good splits can be
# picked differently between runs and the tree and its scores change.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Making predictions on the held-out test set
y_pred = model.predict(X_test)

In [None]:
# Convert column names to strings
X.columns = X.columns.astype(str)

# plot_tree is given plain lists of strings: some scikit-learn releases
# validate feature_names/class_names strictly as lists and reject a pandas Index
feature_names = list(X.columns)
class_names = [str(cls) for cls in model.classes_]

# Plot the decision tree on a very large canvas so the nodes stay readable
plt.figure(figsize=(100, 100))
plot_tree(model, filled=True, feature_names=feature_names, class_names=class_names)

# Save the plot as SVG file
plt.savefig('decision_tree.svg', format='svg')

In [None]:
# Score the predictions against the held-out test labels
accuracy = accuracy_score(y_test, y_pred)

# NOTE(review): the target is a categorical reason code, so the squared
# distance between two codes may not be a meaningful error measure - confirm
# that MSE/SSE are really wanted here.
mse = mean_squared_error(y_test, y_pred)
sse = len(y_test) * mse

print(
    f'Accuracy: {accuracy}',
    f'Mean Squared Error: {mse}',
    f'Sum of Squared Errors: {sse}',
    sep='\n',
)

# Confusion matrix

In [None]:
# Importing the confusion matrix helpers
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels

# Class labels in the order used for the rows/columns of the matrix
class_labels = unique_labels(y_test, y_pred)

# Creating the confusion matrix once; both names are kept for later cells
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
cm = conf_matrix

if cm.shape == (2, 2):
    # Binary case: the second label is treated as the positive class
    TN, FP, FN, TP = cm.ravel()

    print("True Positives:", TP)
    print("True Negatives:", TN)
    print("False Positives:", FP)
    print("False Negatives:", FN)
else:
    # Multi-class case: the top-left 2x2 corner says nothing about the other
    # classes, so compute one-vs-rest counts per class instead.
    TP = cm.diagonal()
    FP = cm.sum(axis=0) - TP  # predicted as the class, but actually another
    FN = cm.sum(axis=1) - TP  # actually the class, but predicted as another
    TN = cm.sum() - TP - FP - FN

    for label, tp, tn, fp, fn in zip(class_labels, TP, TN, FP, FN):
        print(f"Class {label}:")
        print("  True Positives:", tp)
        print("  True Negatives:", tn)
        print("  False Positives:", fp)
        print("  False Negatives:", fn)