In [1]:
#Import the required modules
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the raw records from the CSV file into a DataFrame and preview the first rows.
DATASET_PATH = 'dataset.csv'
physio = pd.read_csv(DATASET_PATH)
physio.head()

Unnamed: 0,Age,Affected body part,Diagnosis,Duration of pain,Intensity of pain,Treatment approach,Duration of treatment
0,24.0,knee,patella fracture,chronic,9 on nprs,manual and mechanical,22 days
1,35.0,knee,patella fracture,chronic,6 on nprs,manual,40 days
2,26.0,knee,patella fracture,chronic,8 on nprs,mechanical,31 days
3,21.0,knee,patella fracture,subacute,7 on nprs,manual and mechanical,15 days
4,48.0,knee,patella fracture,subacute,8 on nprs,manual and mechanical,17 days


In [3]:
# 'Intensity of pain' and 'Duration of treatment' hold strings such as
# "9 on nprs" and "22 days"; keep only the leading number, as an integer.
def _leading_int(text):
    """Return the first whitespace-separated token of ``text`` converted to int."""
    return int(text.split()[0])

for column in ('Intensity of pain', 'Duration of treatment'):
    physio[column] = physio[column].apply(_leading_int)
physio.head()

Unnamed: 0,Age,Affected body part,Diagnosis,Duration of pain,Intensity of pain,Treatment approach,Duration of treatment
0,24.0,knee,patella fracture,chronic,9,manual and mechanical,22
1,35.0,knee,patella fracture,chronic,6,manual,40
2,26.0,knee,patella fracture,chronic,8,mechanical,31
3,21.0,knee,patella fracture,subacute,7,manual and mechanical,15
4,48.0,knee,patella fracture,subacute,8,manual and mechanical,17


In [4]:
# One-hot encode the diagnosis (the first category is dropped and acts as the
# all-zeros baseline), append the indicator columns and remove the text column.
diagnosis = pd.get_dummies(physio['Diagnosis'], drop_first=True)
physio = pd.concat([physio, diagnosis], axis=1).drop(columns=['Diagnosis'])
physio.head()

Unnamed: 0,Age,Affected body part,Duration of pain,Intensity of pain,Treatment approach,Duration of treatment,frozen shoulder,jennis elbow,lumbar radiculopathy,osteoarthritis,patella fracture,plantar fasciitis
0,24.0,knee,chronic,9,manual and mechanical,22,0,0,0,0,1,0
1,35.0,knee,chronic,6,manual,40,0,0,0,0,1,0
2,26.0,knee,chronic,8,mechanical,31,0,0,0,0,1,0
3,21.0,knee,subacute,7,manual and mechanical,15,0,0,0,0,1,0
4,48.0,knee,subacute,8,manual and mechanical,17,0,0,0,0,1,0


In [5]:
# Encode treatment approach and pain duration as integers.
#
# The encoded Series is assigned back to its column rather than calling
# Series.replace(..., inplace=True) on physio[col]: that chained in-place call
# operates on a temporary object and leaves physio unchanged when pandas
# Copy-on-Write is active (and the resulting warning is hidden by the global
# warnings filter set at the top of the notebook).
TREATMENT_APPROACH_CODES = {'manual': 0, 'mechanical': 1, 'manual and mechanical': 2}
PAIN_DURATION_CODES = {'acute': 0, 'subacute': 1, 'chronic': 2}

# infer_objects() makes the conversion to an integer dtype explicit instead of
# relying on replace() to downcast the result implicitly.
physio['Treatment approach'] = (
    physio['Treatment approach'].replace(TREATMENT_APPROACH_CODES).infer_objects()
)
physio['Duration of pain'] = (
    physio['Duration of pain'].replace(PAIN_DURATION_CODES).infer_objects()
)
physio.head()

Unnamed: 0,Age,Affected body part,Duration of pain,Intensity of pain,Treatment approach,Duration of treatment,frozen shoulder,jennis elbow,lumbar radiculopathy,osteoarthritis,patella fracture,plantar fasciitis
0,24.0,knee,2,9,2,22,0,0,0,0,1,0
1,35.0,knee,2,6,0,40,0,0,0,0,1,0
2,26.0,knee,2,8,1,31,0,0,0,0,1,0
3,21.0,knee,1,7,2,15,0,0,0,0,1,0
4,48.0,knee,1,8,2,17,0,0,0,0,1,0


In [6]:
# Count the missing entries in every column.
physio.isna().sum()

Age                      46
Affected body part        0
Duration of pain          0
Intensity of pain         0
Treatment approach        0
Duration of treatment     0
frozen shoulder           0
jennis elbow              0
lumbar radiculopathy      0
osteoarthritis            0
patella fracture          0
plantar fasciitis         0
dtype: int64

In [7]:
# 'Age' has many missing values, so it is dropped.
# 'Affected body part' is dropped as well: the prediction is based on the diagnosis.
physio = physio.drop(columns=['Age', 'Affected body part'])
physio.head()

Unnamed: 0,Duration of pain,Intensity of pain,Treatment approach,Duration of treatment,frozen shoulder,jennis elbow,lumbar radiculopathy,osteoarthritis,patella fracture,plantar fasciitis
0,2,9,2,22,0,0,0,0,1,0
1,2,6,0,40,0,0,0,0,1,0
2,2,8,1,31,0,0,0,0,1,0
3,1,7,2,15,0,0,0,0,1,0
4,1,8,2,17,0,0,0,0,1,0


In [8]:
#Check for outliers
def detect_outliers_zscore(data, thres=3):
    """Return the values of ``data`` whose absolute z-score exceeds ``thres``.

    The z-score of a value is ``(value - mean) / std``, with the mean and the
    standard deviation computed over ``data`` via ``np.mean`` / ``np.std``.

    Parameters
    ----------
    data : sized iterable of numbers (e.g. a list or a pandas Series)
        The observations to scan.
    thres : float, default 3
        Values with ``abs(z_score) > thres`` are reported.

    Returns
    -------
    list
        The outlying values in their original order. A new list is built on
        every call, so results from one call (or one column) never leak into
        the next.
    """
    # Nothing to score in an empty input.
    if len(data) == 0:
        return []
    mean = np.mean(data)
    std = np.std(data)
    # All values identical: no value deviates from the mean, and dividing by a
    # zero standard deviation would only produce NaNs.
    if std == 0:
        return []
    return [value for value in data if np.abs((value - mean) / std) > thres]

# Print the z-score outliers found in each of the first four columns.
for col in physio.columns[:4]:
    found = detect_outliers_zscore(physio[col])
    print(f'Outliers in {col} column : {found}')

Outliers in Duration of pain column : []
Outliers in Intensity of pain column : []
Outliers in Treatment approach column : []
Outliers in Duration of treatment column : [40, 31, 38, 42, 38, 40, 45]


In [9]:
# The flagged outliers are kept in the data.
# Separate the target from the features, then hold out 20% of the rows for testing.
from sklearn.model_selection import train_test_split
y = physio['Duration of treatment']
x = physio.drop(columns=['Duration of treatment'])
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=101)

In [10]:
# Create a LinearRegression estimator and fit it on the training split.
from sklearn.linear_model import LinearRegression
model = LinearRegression().fit(xtrain, ytrain)
# Show the fitted estimator as the cell output.
model

LinearRegression()

In [11]:
# Run the fitted model on the training split and on the testing split.
pred_Train, pred_Test = (model.predict(split) for split in (xtrain, xtest))

In [12]:
# Round every prediction to the nearest whole number.
pred_Train = list(map(round, pred_Train))
pred_Test = list(map(round, pred_Test))

In [13]:
# Report MAE / MSE / RMSE of the rounded predictions on both splits, followed by
# the model's score on each split.
from sklearn.metrics import mean_absolute_error, mean_squared_error

def _print_errors(title, y_true, y_pred):
    """Print MAE, MSE and RMSE of ``y_pred`` against ``y_true`` under ``title``."""
    # scikit-learn metrics take (y_true, y_pred) in that order.
    mse = mean_squared_error(y_true, y_pred)
    print(title)
    print('MAE =', mean_absolute_error(y_true, y_pred))
    print('MSE =', mse)
    # RMSE is derived from the MSE computed above instead of recomputing it.
    print('RMSE =', mse ** 0.5)

_print_errors('Training Error:', ytrain, pred_Train)
_print_errors('Testing Error:', ytest, pred_Test)
# model.score predicts from the features itself, so unlike the errors above it
# does not use the rounded predictions.
print('\nTraining Score :', model.score(xtrain, ytrain))
print('Testing Score :', model.score(xtest, ytest))

Training Error:
MAE = 1.6708333333333334
MSE = 6.595833333333333
RMSE = 2.5682354513037415
Testing Error:
MAE = 1.5737704918032787
MSE = 4.032786885245901
RMSE = 2.008179993239127

Training Score : 0.8357240983738086
Testing Score : 0.8611903473654254


In [None]:
#The data needs to be in the following format for prediction
'''
Duration of pain:
    acute: 0
    subacute: 1
    chronic: 2

Intensity of pain:
    On a scale of 0-10

Treatment Approach:
    manual: 0
    mechanical: 1
    manual and mechanical: 2
    
So the final format of input for different diagnosis is:
    Cervical Radiculopathy: [Duration, Intensity, Approach, 0, 0, 0, 0, 0, 0]
    Frozen Shoulder:        [Duration, Intensity, Approach, 1, 0, 0, 0, 0, 0]
    Jennis Elbow:           [Duration, Intensity, Approach, 0, 1, 0, 0, 0, 0]
    Lumbar Radiculopathy:   [Duration, Intensity, Approach, 0, 0, 1, 0, 0, 0]
    Osteoarthritis:         [Duration, Intensity, Approach, 0, 0, 0, 1, 0, 0]
    Patella Fracture:       [Duration, Intensity, Approach, 0, 0, 0, 0, 1, 0]
    Plantar Fasciitis:      [Duration, Intensity, Approach, 0, 0, 0, 0, 0, 1]
'''