Import Libraries


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

Import DataSet

In [2]:
# Load the Excel file
data = pd.read_excel('dataset.xlsx')

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,Department,Machine,Date,JR number,JR time,J start time,J finish time,Break down time,Production Delay,Repair done by,Repair done by2,Repair done by3,Remark
0,,,,NaT,,,,,,,,,,
1,,PMG,Bosch,2019-01-04,9332.0,20:30:00,20:30:00,20:50:00,00:20:00,00:20:00,Neelakumara,,,Repaired seal leak
2,,PMG,Bosch,2019-01-04,9332.0,12:45:00,12:45:00,01:10:00,,00:25:00,Neelakumara,Kariyawasam,,Repaired low speed & emty pouch
3,,Oil Filling,pump,2019-01-04,,20:00:00,20:00:00,21:00:00,01:00:00,,Neelakumara,Antony,,Replaced rubber mount (702 tank pump)
4,,PMG,M/puff,2019-01-04,10441.0,05:30:00,05:30:00,06:00:00,00:30:00,,Neelakumara,Antony,,Changed product line for m/puff


In [4]:
# Printing the shape of the dataframe
data.shape

(500, 14)

In [6]:
# Total number of columns in the dataset
data.columns

Index(['Unnamed: 0', 'Department', 'Machine', 'Date ', 'JR number', 'JR time',
       'J start time', 'J finish time ', 'Break down time ',
       'Production Delay', 'Repair done by', 'Repair done by2',
       'Repair done by3', 'Remark'],
      dtype='object')

In [8]:
# Information about the dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Unnamed: 0        0 non-null      float64       
 1   Department        162 non-null    object        
 2   Machine           160 non-null    object        
 3   Date              162 non-null    datetime64[ns]
 4   JR number         160 non-null    float64       
 5   JR time           149 non-null    object        
 6   J start time      149 non-null    object        
 7   J finish time     149 non-null    object        
 8   Break down time   489 non-null    object        
 9   Production Delay  25 non-null     object        
 10  Repair done by    131 non-null    object        
 11  Repair done by2   152 non-null    object        
 12  Repair done by3   39 non-null     object        
 13  Remark            161 non-null    object        
dtypes: datetime64[ns](1), floa

In [10]:
# Checking if there is some null values or not
data.isnull()

Unnamed: 0.1,Unnamed: 0,Department,Machine,Date,JR number,JR time,J start time,J finish time,Break down time,Production Delay,Repair done by,Repair done by2,Repair done by3,Remark
0,True,True,True,True,True,True,True,True,True,True,True,True,True,True
1,True,False,False,False,False,False,False,False,False,False,False,True,True,False
2,True,False,False,False,False,False,False,False,True,False,False,False,True,False
3,True,False,False,False,True,False,False,False,False,True,False,False,True,False
4,True,False,False,False,False,False,False,False,False,True,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,True,True,True,True,True,True,True,True,False,True,True,True,True,True
496,True,True,True,True,True,True,True,True,False,True,True,True,True,True
497,True,True,True,True,True,True,True,True,False,True,True,True,True,True
498,True,True,True,True,True,True,True,True,False,True,True,True,True,True


In [11]:
# Checking if there is some null values or not
data.isnull().sum()

Unnamed: 0          500
Department          338
Machine             340
Date                338
JR number           340
JR time             351
J start time        351
J finish time       351
Break down time      11
Production Delay    475
Repair done by      369
Repair done by2     348
Repair done by3     461
Remark              339
dtype: int64

In [13]:
# Dropping less important features
df = data[['Department','Machine','Repair done by2']]
df.head()

Unnamed: 0,Department,Machine,Repair done by2
0,,,
1,PMG,Bosch,
2,PMG,Bosch,Kariyawasam
3,Oil Filling,pump,Antony
4,PMG,M/puff,Antony


In [14]:
# Dropping null values
df = df.dropna()

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150 entries, 2 to 162
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Department       150 non-null    object
 1   Machine          150 non-null    object
 2   Repair done by2  150 non-null    object
dtypes: object(3)
memory usage: 4.7+ KB


In [82]:
X = data[['Department', 'Machine']]
Y = data['Repair done by2']
Y.unique()

array([nan, 'Kariyawasam', 'Antony', 'Chinthaka', 'Malaka', 'Anura',
       'Rohana', 'chamidu', 'Gihan', 'Sahan', 'Thilan', 'Zoysa'],
      dtype=object)

In [84]:
# Perform label encoding on the target variable
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(Y)

In [86]:
label_mapping = pd.DataFrame({'Repairer Name': Y, 'Encoded Value': y})
label_mapping = label_mapping.drop_duplicates().sort_values('Encoded Value').reset_index(drop=True)

print(label_mapping)

   Repairer Name  Encoded Value
0         Antony              0
1          Anura              1
2      Chinthaka              2
3          Gihan              3
4    Kariyawasam              4
5         Malaka              5
6         Rohana              6
7          Sahan              7
8         Thilan              8
9          Zoysa              9
10       chamidu             10
11           NaN             11


In [94]:


# Perform one-hot encoding on the features
X = pd.get_dummies(X)
training_columns = list(X.columns)


# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the classification algorithm (Decision Tree classifier in this example)
clf = DecisionTreeClassifier()

# Train the model
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Decode the predicted labels back to original names
y_pred_decoded = label_encoder.inverse_transform(y_pred)

# Calculate accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.67


In [88]:
from sklearn.metrics import classification_report

# Convert target names to strings
target_names = label_encoder.classes_.astype(str)

# Calculate precision, recall, and F1-score
classification_report = classification_report(y_test, y_pred, target_names=target_names)
print(classification_report)


              precision    recall  f1-score   support

      Antony       0.00      0.00      0.00         8
       Anura       0.14      0.20      0.17         5
   Chinthaka       0.00      0.00      0.00         2
       Gihan       0.00      0.00      0.00         2
 Kariyawasam       0.00      0.00      0.00         7
      Malaka       0.00      0.00      0.00         2
      Rohana       0.00      0.00      0.00         2
       Sahan       0.00      0.00      0.00         0
      Thilan       0.00      0.00      0.00         1
       Zoysa       0.00      0.00      0.00         2
     chamidu       0.00      0.00      0.00         1
         nan       1.00      0.97      0.99        68

    accuracy                           0.67       100
   macro avg       0.10      0.10      0.10       100
weighted avg       0.69      0.67      0.68       100



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [95]:
def predict_repairer(department, machine, clf, label_encoder, training_columns):
    # Prepare the input data for prediction
    input_data = pd.DataFrame({'Department': [department], 'Machine': [machine]})

    # Perform one-hot encoding on the input data
    input_data_encoded = pd.get_dummies(input_data)

    # Align the input data columns with the training columns
    input_data_encoded = input_data_encoded.reindex(columns=training_columns, fill_value=0)

    # Make predictions on the input data
    predicted_label = clf.predict(input_data_encoded)

    return predicted_label[0]



In [98]:
predicted_label = predict_repairer('PMG', '501 Pump', clf, label_encoder, training_columns)
predicted_repairer = label_encoder.inverse_transform([predicted_label])[0]
print('Predicted Repairer:', predicted_repairer)


Predicted Repairer: Anura
