<a href="https://colab.research.google.com/github/SPraveen69/Data_Model/blob/main/model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Import the Lib**

In [498]:
import pandas as pd
from google.colab import drive
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

### **Uploading the Dataset**

In [499]:
# Mount Google Drive
drive.mount('/content/drive')

# Set the path to the CSV file in your Google Drive
file_path = '/content/drive/MyDrive/BDX Dataset/medical center details.xlsx'

# Load the CSV file into a DataFrame
data = pd.read_excel(file_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### **Data Preprocessing**

In [500]:
data.shape

(7285, 49)

In [501]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7285 entries, 0 to 7284
Data columns (total 49 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   Date                         7285 non-null   object        
 1   EPF No                       6816 non-null   object        
 2   Time                         0 non-null      float64       
 3   Employee No                  6270 non-null   float64       
 4   Name With Initials           7273 non-null   object        
 5   Calling Name                 7274 non-null   object        
 6   Department                   7114 non-null   object        
 7   Roster                       6267 non-null   object        
 8   Shift                        6361 non-null   object        
 9   Symptoms                     6514 non-null   object        
 10  Reporting Type               7191 non-null   object        
 11  Treatments                   337 non-null  

In [502]:
# replace unwanted values with nan
# data = data.replace('#DIV/0!', np.nan)

# returns the total number of missing values for each column
data.isnull().sum()

Unnamed: 0,0
Date,0
EPF No,469
Time,7285
Employee No,1015
Name With Initials,12
Calling Name,11
Department,171
Roster,1018
Shift,924
Symptoms,771


In [503]:
data

Unnamed: 0,Date,EPF No,Time,Employee No,Name With Initials,Calling Name,Department,Roster,Shift,Symptoms,...,Eno,Complain,Communicable Diseases,Hospital Stay/ Home Stay,Return to work Date,Plant,Contact Number,ID,Item Type,Path
0,26/06/2024 11:18,26813,,1008008.0,Y.M.W.G.Mallika,Mallika,Manufacturing,Team G - Shift B,B,body Pain,...,,,,,NaT,,,10000,Item,sites/BEKTMedicalCenter/Lists/Patient_Details_...
1,26/06/2024 11:18,,,,Sujeewa,Sujeewa,Third party,,,Back Pain,...,,,,,NaT,,,10001,Item,sites/BEKTMedicalCenter/Lists/Patient_Details_...
2,26/06/2024 14:08,28616,,1045160.0,W.D.ASHOKAMALA,Dilhani,Manufacturing,Team 11 - Shift A,A,Eye Pain,...,,,,,NaT,,,10002,Item,sites/BEKTMedicalCenter/Lists/Patient_Details_...
3,26/06/2024 14:09,29057,,1084133.0,H.M.C.D.Herath,Chamila,Manufacturing,Team 23 - Shift A,A,Cold,...,,,,,NaT,,,10003,Item,sites/BEKTMedicalCenter/Lists/Patient_Details_...
4,26/06/2024 14:09,23185,,98753.0,R.Jelastina,Jelastina,Manufacturing,Team 09 - Shift A,A,Cold,...,,,,,NaT,,,10004,Item,sites/BEKTMedicalCenter/Lists/Patient_Details_...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7280,26/06/2024 09:41,28062,,1031581.0,J.A.M.S.Jayasingha,Maduwanthi,Manufacturing,Team 12 - Shift B,B,Tooth Pain,...,,,,,NaT,,,9995,Item,sites/BEKTMedicalCenter/Lists/Patient_Details_...
7281,26/06/2024 09:42,29529,,,Susanthi,Susanthi,Manufacturing,,,"Cold ,Fever",...,,,,,NaT,,,9996,Item,sites/BEKTMedicalCenter/Lists/Patient_Details_...
7282,26/06/2024 09:45,29569,,,Milinda,Milinda,Cutting,,,Cough,...,,,,,NaT,,,9997,Item,sites/BEKTMedicalCenter/Lists/Patient_Details_...
7283,26/06/2024 09:47,,,,Kugandya,Kugandya,Manufacturing,,,"Allergies,Rash",...,,,,,NaT,,,9998,Item,sites/BEKTMedicalCenter/Lists/Patient_Details_...


In [504]:
#retrieve all columns of data
data.columns

Index(['Date', 'EPF No', 'Time', 'Employee No', 'Name With Initials',
       'Calling Name', 'Department', 'Roster', 'Shift', 'Symptoms',
       'Reporting Type', 'Treatments', 'Paracetamol', 'Vitamin B', 'Vitamin C',
       ' Digene Tablet', 'Crape Bandage', 'Samahan', 'Jeewani',
       'Bed Rest In time', 'Bed Rest Out time', 'Entered By', 'Hospital Visit',
       'Hospital Name', 'Date of Visit', 'Visited Nurse ', 'Remark', 'Reason',
       'Brandix Employee/Contractor', 'Created', 'Created By', 'Modified By',
       'Modified', 'Hospital Refer/ Sent Home', 'BP ', 'Pulse ', 'Temperature',
       'SpO2', 'Patient Type', 'Eno', 'Complain', 'Communicable Diseases',
       'Hospital Stay/ Home Stay', 'Return to work Date', 'Plant',
       'Contact Number', 'ID', 'Item Type', 'Path'],
      dtype='object')

In [505]:
# drop unwanted columns in data
columns = ['Date','EPF No','Department','Shift','Symptoms']
data = data[columns]

In [506]:
#remove all the null row values
data = data.dropna()
data

Unnamed: 0,Date,EPF No,Department,Shift,Symptoms
0,26/06/2024 11:18,26813,Manufacturing,B,body Pain
2,26/06/2024 14:08,28616,Manufacturing,A,Eye Pain
3,26/06/2024 14:09,29057,Manufacturing,A,Cold
4,26/06/2024 14:09,23185,Manufacturing,A,Cold
5,26/06/2024 14:10,23887,Manufacturing,A,Cold
...,...,...,...,...,...
7276,26/06/2024 09:26,21603,Technical,G,Loose Motion
7278,26/06/2024 09:33,21182,Manufacturing,B,Cold
7279,26/06/2024 09:40,12864,Manufacturing,B,Hand Pain
7280,26/06/2024 09:41,28062,Manufacturing,B,Tooth Pain


In [507]:
# Count the occurrences of each department and synptoms
department_counts = data['Department'].value_counts()
symptoms_counts = data['Symptoms'].value_counts()

# Print the results
print("Department Counts:")
print(department_counts)

print("\nSymptoms Counts:")
print(symptoms_counts)

Department Counts:
Department
Manufacturing                   3385
Finishing                        474
Warehouse                        339
Quality Assurance                323
Cutting                          275
Training School                  149
RM Inspection                     78
Machine Maintenance               77
Technical                         73
Medical Centre                    59
Central Inspection Facility       50
Pilot Zone                        43
Facility Management               37
Industrial Engineering            31
Operations                        24
General Administration            16
Indirect                          13
Finishing & Packing               12
Planning                          11
Occupational Health & Safety       9
Works HR                           9
Direct                             8
Packing                            7
HR                                 4
Executive                          3
HR & Administration                3
Training

In [508]:
# Encode categorical variables
label_encoders = {}
for column in ['Department', 'Symptoms']:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

In [509]:
import re
# Function to clean date strings
def clean_date(date_str):

    date_str = re.sub(r'\.', ':', date_str)
    date_str = re.sub(r'(\d{1,2})([ap]m)', r'\1 \2', date_str)  # Handle am/pm without space
    return date_str

In [510]:
# Apply the function to the Date column
data['Date'] = data['Date'].apply(clean_date)

# Convert the cleaned Date column to datetime
data['Date'] = pd.to_datetime(data['Date'], errors='coerce', infer_datetime_format=True)
data['year'] = data['Date'].dt.year
data['day_of_week'] = data['Date'].dt.dayofweek
data['month'] = data['Date'].dt.month

  data['Date'] = pd.to_datetime(data['Date'], errors='coerce', infer_datetime_format=True)
  data['Date'] = pd.to_datetime(data['Date'], errors='coerce', infer_datetime_format=True)


In [511]:
data


Unnamed: 0,Date,EPF No,Department,Shift,Symptoms,year,day_of_week,month
0,2024-06-26 11:18:00,26813,24,B,373,2024.0,2.0,6.0
2,2024-06-26 14:08:00,28616,24,A,92,2024.0,2.0,6.0
3,2024-06-26 14:09:00,29057,24,A,69,2024.0,2.0,6.0
4,2024-06-26 14:09:00,23185,24,A,69,2024.0,2.0,6.0
5,2024-06-26 14:10:00,23887,24,A,69,2024.0,2.0,6.0
...,...,...,...,...,...,...,...,...
7276,2024-06-26 09:26:00,21603,39,G,207,2024.0,2.0,6.0
7278,2024-06-26 09:33:00,21182,24,B,69,2024.0,2.0,6.0
7279,2024-06-26 09:40:00,12864,24,B,148,2024.0,2.0,6.0
7280,2024-06-26 09:41:00,28062,24,B,323,2024.0,2.0,6.0


In [512]:
# Aggregate symptoms count by department and date
aggregated_data = data.groupby(['year', 'month', 'day_of_week', 'Department']).agg({
    'Symptoms': lambda x: x.mode()[0],  # Most common symptom for that day
}).reset_index()


In [513]:
# Create lag features for symptoms
aggregated_data['symptom_lag_1'] = aggregated_data.groupby('Department')['Symptoms'].shift(1)
aggregated_data['symptom_lag_2'] = aggregated_data.groupby('Department')['Symptoms'].shift(2)
aggregated_data.dropna(inplace=True)

In [514]:
aggregated_data

Unnamed: 0,year,month,day_of_week,Department,Symptoms,symptom_lag_1,symptom_lag_2
13,2024.0,5.0,1.0,24,69,69.0,69.0
20,2024.0,5.0,2.0,1,125,155.0,155.0
22,2024.0,5.0,2.0,9,69,155.0,373.0
24,2024.0,5.0,2.0,24,69,69.0,69.0
29,2024.0,5.0,2.0,34,148,69.0,69.0
...,...,...,...,...,...,...,...
188,2024.0,7.0,5.0,24,155,155.0,69.0
189,2024.0,7.0,5.0,34,69,69.0,125.0
190,2024.0,7.0,5.0,35,155,69.0,125.0
191,2024.0,7.0,5.0,40,155,117.0,69.0


### Sepearting the features and target

In [515]:
# Define features and target variable
X = aggregated_data[['year', 'month', 'day_of_week', 'Department', 'symptom_lag_1', 'symptom_lag_2']]
y = aggregated_data['Symptoms']


# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=80)


### Predicting symptoms using randon forest

In [516]:
# Initialize and train the model
model = RandomForestClassifier(n_estimators=1000, random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(classification_report(y_test, y_pred))


Accuracy: 0.4772727272727273
              precision    recall  f1-score   support

          43       0.00      0.00      0.00         0
          69       0.53      0.79      0.63        24
         125       0.00      0.00      0.00         5
         148       0.00      0.00      0.00         1
         155       0.33      0.22      0.27         9
         264       0.00      0.00      0.00         1
         266       0.00      0.00      0.00         1
         307       0.00      0.00      0.00         1
         323       0.00      0.00      0.00         2

    accuracy                           0.48        44
   macro avg       0.10      0.11      0.10        44
weighted avg       0.36      0.48      0.40        44



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### User Input

In [527]:
from datetime import datetime, timedelta

department = 'Manufacturing'
input_date = datetime.strptime('2024-08-06', '%Y-%m-%d')

# Create a DataFrame with the same structure as the training data
input_data = pd.DataFrame({
    'year': [input_date.year],
    'month': [input_date.month],
    'day_of_week': [input_date.weekday()],
    'Department': [label_encoders['Department'].transform([department])[0]],
    'symptom_lag_1': [0],
    'symptom_lag_2': [0]
})

predicted_symptoms = model.predict(input_data)
print(f'Predicted symptoms for {department} on {input_date.date()}: {label_encoders["Symptoms"].inverse_transform(predicted_symptoms)[0]}')


Predicted symptoms for Manufacturing on 2024-08-06: Cold
