# I.  Simple Linear Model using Quantile Regression

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os

In [2]:
from statsmodels.formula.api import quantreg

In [3]:
train_df = pd.read_csv("../input/osic-pulmonary-fibrosis-progression/train.csv")
test_df = pd.read_csv("../input/osic-pulmonary-fibrosis-progression/test.csv")

In [4]:
train_df.head()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus
0,ID00007637202177411956430,-4,2315,58.253649,79,Male,Ex-smoker
1,ID00007637202177411956430,5,2214,55.712129,79,Male,Ex-smoker
2,ID00007637202177411956430,7,2061,51.862104,79,Male,Ex-smoker
3,ID00007637202177411956430,9,2144,53.950679,79,Male,Ex-smoker
4,ID00007637202177411956430,11,2069,52.063412,79,Male,Ex-smoker


In [5]:
test_df.head()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus
0,ID00419637202311204720264,6,3020,70.186855,73,Male,Ex-smoker
1,ID00421637202311550012437,15,2739,82.045291,68,Male,Ex-smoker
2,ID00422637202311677017371,6,1930,76.672493,73,Male,Ex-smoker
3,ID00423637202312137826377,17,3294,79.258903,72,Male,Ex-smoker
4,ID00426637202313170790466,0,2925,71.824968,73,Male,Never smoked


In [6]:
submission_df = pd.read_csv("../input/osic-pulmonary-fibrosis-progression/sample_submission.csv")
submission_df.head()

Unnamed: 0,Patient_Week,FVC,Confidence
0,ID00419637202311204720264_-12,2000,100
1,ID00421637202311550012437_-12,2000,100
2,ID00422637202311677017371_-12,2000,100
3,ID00423637202312137826377_-12,2000,100
4,ID00426637202313170790466_-12,2000,100


In [7]:
submission_df['Weeks'] = submission_df['Patient_Week'].apply( lambda x: int(x.split('_')[-1]) )
submission_df['Patient'] = submission_df['Patient_Week'].apply( lambda x: (x.split('_')[0]) )
submission_df.head()

Unnamed: 0,Patient_Week,FVC,Confidence,Weeks,Patient
0,ID00419637202311204720264_-12,2000,100,-12,ID00419637202311204720264
1,ID00421637202311550012437_-12,2000,100,-12,ID00421637202311550012437
2,ID00422637202311677017371_-12,2000,100,-12,ID00422637202311677017371
3,ID00423637202312137826377_-12,2000,100,-12,ID00423637202312137826377
4,ID00426637202313170790466_-12,2000,100,-12,ID00426637202313170790466


In [8]:
print("Number of patients in the Training set: ", train_df['Patient'].nunique())
print("Number of patients in the Testing set: ", test_df['Patient'].nunique())
print("Number of patients in the Submission set: ", submission_df['Patient'].nunique())

Number of patients in the Training set:  176
Number of patients in the Testing set:  5
Number of patients in the Submission set:  5


### Aggregating FVC according to Sex, Smoking Status
### Aggregating FVC according to Sex, Age

In [9]:
train_df.groupby(['Sex', 'SmokingStatus'])['FVC'].agg(['mean', 'std', 'count'])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,count
Sex,SmokingStatus,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,Currently smokes,2868.833333,71.124787,18
Female,Ex-smoker,1901.906542,450.294485,107
Female,Never smoked,1775.985,584.208226,200
Male,Currently smokes,3300.828125,694.108554,64
Male,Ex-smoker,2886.024705,769.105923,931
Male,Never smoked,2878.034934,667.921434,229


In [10]:
train_df.groupby(['Sex', 'Age'])['FVC'].agg(['mean', 'std', 'count'])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,count
Sex,Age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,49,2915.333333,49.52777,9
Female,52,1501.333333,160.199875,9
Female,55,1709.0,130.055192,18
Female,56,1782.777778,114.376328,9
Female,57,1328.0,204.937185,9
Female,58,1818.111111,117.547272,9
Female,61,1455.722222,108.010787,18
Female,62,1731.058824,287.478797,17
Female,63,2201.875,165.148797,8
Female,64,3096.333333,121.47119,9


### Categorizing/Factorizing

In [11]:
# Easy method to categorize variables -- Factorize
train_df['Sex'] = pd.factorize(train_df['Sex'])[0]
train_df['SmokingStatus'] = pd.factorize(train_df['SmokingStatus'])[0]
train_df.tail(20)

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus
1529,ID00422637202311677017371,35,1862,73.971079,73,0,0
1530,ID00422637202311677017371,47,1713,68.051804,73,0,0
1531,ID00423637202312137826377,17,3294,79.258903,72,0,0
1532,ID00423637202312137826377,18,2777,66.819057,72,0,0
1533,ID00423637202312137826377,19,2700,64.966314,72,0,0
1534,ID00423637202312137826377,21,3014,72.521655,72,0,0
1535,ID00423637202312137826377,23,2661,64.027911,72,0,0
1536,ID00423637202312137826377,30,2778,66.843118,72,0,0
1537,ID00423637202312137826377,42,2516,60.53898,72,0,0
1538,ID00423637202312137826377,53,2432,58.517806,72,0,0


In [12]:
test_df['Sex'] = pd.factorize(test_df['Sex'])[0]
test_df['SmokingStatus'] = pd.factorize(test_df['SmokingStatus'])[0]
test_df

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus
0,ID00419637202311204720264,6,3020,70.186855,73,0,0
1,ID00421637202311550012437,15,2739,82.045291,68,0,0
2,ID00422637202311677017371,6,1930,76.672493,73,0,0
3,ID00423637202312137826377,17,3294,79.258903,72,0,0
4,ID00426637202313170790466,0,2925,71.824968,73,0,1


### Normalizing

Since FVC is something we need to predict, we go on normalizing all other independent variables

In [13]:
train_df['Percent'] = (train_df['Percent']-np.mean(train_df['Percent']))/np.std(train_df['Percent'])
#train_df['Weeks'] = (train_df['Weeks']-np.mean(train_df['Weeks']))/np.std(train_df['Weeks'])
train_df['Age'] = (train_df['Age']-np.mean(train_df['Age']))/np.std(train_df['Age'])
train_df['Sex'] = (train_df['Sex']-np.mean(train_df['Sex']))/np.std(train_df['Sex'])
train_df['SmokingStatus'] = (train_df['SmokingStatus']-np.mean(train_df['SmokingStatus']))/np.std(train_df['SmokingStatus'])

In [14]:
test_df['Percent'] = (test_df['Percent']-np.mean(test_df['Percent']))/np.std(test_df['Percent'])
#test_df['Weeks'] = (test_df['Weeks']-np.mean(test_df['Weeks']))/np.std(test_df['Weeks'])
test_df['Age'] = (test_df['Age']-np.mean(test_df['Age']))/np.std(test_df['Age'])
# don't normalize Sex since all in test set are male  -- normalizing this variable gives NaN 
#test_df['Sex'] = (test_df['Sex']-np.mean(test_df['Sex']))/np.std(test_df['Sex'])
test_df['SmokingStatus'] = (test_df['SmokingStatus']-np.mean(test_df['SmokingStatus']))/np.std(test_df['SmokingStatus'])

In [15]:
train_df.head()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus
0,ID00007637202177411956430,-4,2315,-0.979923,1.674174,-0.515289,-0.654482
1,ID00007637202177411956430,5,2214,-1.108174,1.674174,-0.515289,-0.654482
2,ID00007637202177411956430,7,2061,-1.302454,1.674174,-0.515289,-0.654482
3,ID00007637202177411956430,9,2144,-1.19706,1.674174,-0.515289,-0.654482
4,ID00007637202177411956430,11,2069,-1.292296,1.674174,-0.515289,-0.654482


In [16]:
test_df

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus
0,ID00419637202311204720264,6,3020,-1.306936,0.618853,0,-0.5
1,ID00421637202311550012437,15,2739,1.360182,-1.9597,0,-0.5
2,ID00422637202311677017371,6,1930,0.151769,0.618853,0,-0.5
3,ID00423637202312137826377,17,3294,0.733487,0.103142,0,-0.5
4,ID00426637202313170790466,0,2925,-0.938503,0.618853,0,2.0


### Using Quantile Regression Model

In [17]:
from statsmodels.formula.api import quantreg

modelL = quantreg('FVC ~ Weeks+Percent+Age+Sex+SmokingStatus', train_df).fit(q=0.15)
model = quantreg('FVC ~ Weeks+Percent+Age+Sex+SmokingStatus', train_df).fit(q=0.50)
modelU = quantreg('FVC ~ Weeks+Percent+Age+Sex+SmokingStatus', train_df).fit(q=0.85)

In [18]:
model.summary()

0,1,2,3
Dep. Variable:,FVC,Pseudo R-squared:,0.6093
Model:,QuantReg,Bandwidth:,119.3
Method:,Least Squares,Sparsity:,647.3
Date:,"Thu, 20 May 2021",No. Observations:,1549.0
Time:,13:43:59,Df Residuals:,1543.0
,,Df Model:,5.0

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2715.9319,13.994,194.080,0.000,2688.483,2743.381
Weeks,-0.6357,0.355,-1.789,0.074,-1.333,0.061
Percent,644.9273,8.413,76.659,0.000,628.425,661.429
Age,-117.1388,8.314,-14.089,0.000,-133.448,-100.830
Sex,-419.5776,8.677,-48.357,0.000,-436.597,-402.558
SmokingStatus,-8.8332,8.738,-1.011,0.312,-25.973,8.306


In [19]:
y_true = test_df[['FVC']]
y_true

Unnamed: 0,FVC
0,3020
1,2739
2,1930
3,3294
4,2925


In [20]:
test_df = test_df.drop(['FVC'], axis=1)
test_df

Unnamed: 0,Patient,Weeks,Percent,Age,Sex,SmokingStatus
0,ID00419637202311204720264,6,-1.306936,0.618853,0,-0.5
1,ID00421637202311550012437,15,1.360182,-1.9597,0,-0.5
2,ID00422637202311677017371,6,0.151769,0.618853,0,-0.5
3,ID00423637202312137826377,17,0.733487,0.103142,0,-0.5
4,ID00426637202313170790466,0,-0.938503,0.618853,0,2.0


In [21]:
test_df['FVC_PredL'] = modelL.predict(test_df).values
test_df['FVC_Pred'] = model.predict(test_df).values
test_df['FVC_PredU'] = modelU.predict(test_df).values

# applying the quantile absolute deviation formula
test_df['FVC_Pred_std'] = 0.5*np.abs(test_df['FVC_Pred']-test_df['FVC_PredL']) + 0.5*np.abs(test_df['FVC_PredU']-test_df['FVC_Pred'])

# Confidence of prediction
test_df['Confidence'] = np.abs(test_df['FVC_PredU'] - test_df['FVC_PredL'])/2
test_df

Unnamed: 0,Patient,Weeks,Percent,Age,Sex,SmokingStatus,FVC_PredL,FVC_Pred,FVC_PredU,FVC_Pred_std,Confidence
0,ID00419637202311204720264,6,-1.306936,0.618853,0,-0.5,1546.300455,1801.163894,1974.627332,214.163438,214.163438
1,ID00421637202311550012437,15,1.360182,-1.9597,0,-0.5,3473.053212,3817.587969,4360.645335,443.796062,443.796062
2,ID00422637202311677017371,6,0.151769,0.618853,0,-0.5,2410.514507,2741.922638,3077.065564,333.275528,333.275528
3,ID00423637202312137826377,17,0.733487,0.103142,0,-0.5,2815.695348,3170.504748,3591.434881,387.869766,387.869766
4,ID00426637202313170790466,0,-0.938503,0.618853,0,2.0,1628.175711,2020.507858,2100.507565,236.165927,236.165927
