# Simple model training

In this notebook, we train a very simple model that takes the number of lines of code (`LOC`) and the McCabe complexity (`MCC`) of a module as inputs, and predicts whether that module is defective or not.

In [4]:
# Load the 'camel_1_6' sheet of chapter_16.xlsx into a DataFrame
import pandas as pd

df = pd.read_excel(
    './chapter_16.xlsx',
    sheet_name='camel_1_6',
)

# Leave the first rows as the cell's displayed output
df.head()

Unnamed: 0,ClassName,LOC,MCC,Defect
0,org.apache.camel.AlreadyStoppedException,1,0,0
1,org.apache.camel.AsyncCallback,16,0,1
2,org.apache.camel.AsyncProcessor,14,2,1
3,org.apache.camel.Body,0,0,0
4,org.apache.camel.CamelContext,30,13,1


In [5]:
# instantiate the Decision tree model
from sklearn.tree import DecisionTreeClassifier

# Fix the seed: scikit-learn randomly permutes the features at every split,
# so without random_state two fits on the same data can produce different
# trees when several splits are equally good.
dt = DecisionTreeClassifier(random_state=42)

In [8]:
# split the data into train and test
from sklearn.model_selection import train_test_split

# Use exactly the two inputs the model is meant to see (lines of code and
# McCabe complexity). Selecting them by name, rather than dropping the
# non-feature columns, keeps any other column the sheet may contain
# (e.g. a saved row index) out of the feature matrix.
X = df[['LOC', 'MCC']]

# Target: the Defect column (0/1 in the previewed rows)
y = df['Defect']

# Hold out 30% of the rows for testing. The fixed seed makes the split, and
# therefore the reported accuracy, repeatable across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)



In [9]:
# Train the decision tree on the training split
dt.fit(X=X_train, y=y_train)

In [11]:
# Score the fitted tree on the held-out test split
from sklearn.metrics import accuracy_score

# Predicted Defect labels for the test rows
y_pred = dt.predict(X=X_test)

print(f'Accuracy: {accuracy_score(y_test, y_pred):.2f}')

Accuracy: 0.64


In [12]:
# Persist the trained tree to 'dt.joblib' in the working directory
from joblib import dump

dump(value=dt, filename='dt.joblib')

['dt.joblib']

In [14]:
# Round trip: reload the model saved in dt.joblib and score it on the
# same X_test split (accuracy_score was imported in the evaluation cell)
from joblib import load

# Rebinds dt to the deserialized copy of the model
dt = load(filename='dt.joblib')

y_pred = dt.predict(X=X_test)

print(f'Accuracy: {accuracy_score(y_test, y_pred):.2f}')

Accuracy: 0.64
