In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import os
from pathlib import Path
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
le_1 = LabelEncoder()
le_2 = LabelEncoder()

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sub = pd.read_csv('sample_submission.csv')

In [3]:
trainData = []
for p in train['Patient'].unique():
    patientData = train[train['Patient'] == p]
    firstMeasure = list(patientData.iloc[0, :].values)
    for i, week in enumerate(patientData['Weeks'].iloc[1:]):
        fvc = patientData.iloc[i, 2]
        trainDataPoint = firstMeasure + [week, fvc]
        trainData.append(trainDataPoint)
trainData = pd.DataFrame(trainData)

trainData.columns = ['PatientID', 'first_week', 'first_FVC', 'first_Percent', 'Age', 'Sex', 'SmokingStatus'] + ['target_week', 'target_FVC']
trainData['delta_week'] = trainData['target_week'] - trainData['first_week']
trainData.drop(columns = ['first_Percent', 'target_week', 'first_week'], inplace = True)

In [4]:
# create testing data
subSplit = np.array(list(sub['Patient_Week'].apply(lambda x: x.split('_')).values))
testData = []
for p in np.unique(subSplit[:, 0]):
    patientData = test[test['Patient'] == p]
    firstMeasure = list(patientData.iloc[0, :].values)
    for week in subSplit[subSplit[:, 0] == p, 1]:
        testDataPoint = firstMeasure + [week]
        testData.append(testDataPoint)
testData = pd.DataFrame(testData)
testData.columns = ['PatientID', 'first_week', 'first_FVC', 'first_Percent', 'Age', 'Sex', 'SmokingStatus'] + ['target_week']

testData['delta_week'] = testData['target_week'].map(int) - testData['first_week']
testData.drop(columns = ['first_Percent', 'first_week'], inplace = True)

In [5]:
# fe engineering
# trainData.drop(columns = ['PatientID'], inplace = True)
# testData.drop(columns = ['PatientID'], inplace = True)

trainData['Sex'] = le_1.fit_transform(trainData['Sex'])
testData['Sex'] = le_1.transform(testData['Sex'])

trainData['SmokingStatus'] = le_2.fit_transform(trainData['SmokingStatus'])
testData['SmokingStatus'] = le_2.transform(testData['SmokingStatus'])
trainData

Unnamed: 0,PatientID,first_FVC,Age,Sex,SmokingStatus,target_FVC,delta_week
0,ID00007637202177411956430,2315,79,1,1,2315,9
1,ID00007637202177411956430,2315,79,1,1,2214,11
2,ID00007637202177411956430,2315,79,1,1,2061,13
3,ID00007637202177411956430,2315,79,1,1,2144,15
4,ID00007637202177411956430,2315,79,1,1,2069,21
...,...,...,...,...,...,...,...
1368,ID00426637202313170790466,2925,73,1,2,2976,13
1369,ID00426637202313170790466,2925,73,1,2,2712,19
1370,ID00426637202313170790466,2925,73,1,2,2978,31
1371,ID00426637202313170790466,2925,73,1,2,2908,43


In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor

model_lr = LinearRegression()
model_lr.fit(trainData.drop(columns = ['PatientID', 'target_FVC']), trainData['target_FVC'])
prediction = model_lr.predict(testData.drop(columns = ['PatientID', 'target_week']))
trainData

Unnamed: 0,PatientID,first_FVC,Age,Sex,SmokingStatus,target_FVC,delta_week
0,ID00007637202177411956430,2315,79,1,1,2315,9
1,ID00007637202177411956430,2315,79,1,1,2214,11
2,ID00007637202177411956430,2315,79,1,1,2061,13
3,ID00007637202177411956430,2315,79,1,1,2144,15
4,ID00007637202177411956430,2315,79,1,1,2069,21
...,...,...,...,...,...,...,...
1368,ID00426637202313170790466,2925,73,1,2,2976,13
1369,ID00426637202313170790466,2925,73,1,2,2712,19
1370,ID00426637202313170790466,2925,73,1,2,2978,31
1371,ID00426637202313170790466,2925,73,1,2,2908,43


In [7]:
testData

Unnamed: 0,PatientID,first_FVC,Age,Sex,SmokingStatus,target_week,delta_week
0,ID00419637202311204720264,3020,73,1,1,-12,-18
1,ID00419637202311204720264,3020,73,1,1,-11,-17
2,ID00419637202311204720264,3020,73,1,1,-10,-16
3,ID00419637202311204720264,3020,73,1,1,-9,-15
4,ID00419637202311204720264,3020,73,1,1,-8,-14
...,...,...,...,...,...,...,...
725,ID00426637202313170790466,2925,73,1,2,129,129
726,ID00426637202313170790466,2925,73,1,2,130,130
727,ID00426637202313170790466,2925,73,1,2,131,131
728,ID00426637202313170790466,2925,73,1,2,132,132


In [8]:
import pickle

In [9]:
pickle.dump(model_lr,open('model_lr.pkl','wb'))
pickle.dump(le_1,open('le_1.pkl','wb'))
pickle.dump(le_2,open('le_2.pkl','wb'))