In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Importing the data and exploratory data analysis

In [6]:
# Read the train and test CSV files (paths relative to the working directory)
# into two DataFrames in a single unpacking step.
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [7]:
# Preview the first five rows of the training frame.
train_data.head(n=5)

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus
0,ID00007637202177411956430,-4,2315,58.253649,79,Male,Ex-smoker
1,ID00007637202177411956430,5,2214,55.712129,79,Male,Ex-smoker
2,ID00007637202177411956430,7,2061,51.862104,79,Male,Ex-smoker
3,ID00007637202177411956430,9,2144,53.950679,79,Male,Ex-smoker
4,ID00007637202177411956430,11,2069,52.063412,79,Male,Ex-smoker


### Let's check what each column represents
- Patient : the ID of a patient admitted to hospital
- Weeks : the follow-up week of the measurement (can be negative, e.g. -4 in the first row above)
- FVC : the recorded lung capacity in milliliters
- Percent : a computed field which approximates the patient's FVC as a percent of the typical FVC for a person of similar characteristics
- Age : the patient's age
- Sex : the patient's sex
- SmokingStatus : the patient's smoking status (Ex-smoker, Never smoked, or Currently smokes)

In [18]:
# Summarise the training DataFrame: number of entries, column dtypes,
# non-null counts per column and memory usage.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1549 entries, 0 to 1548
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Patient        1549 non-null   object 
 1   Weeks          1549 non-null   int64  
 2   FVC            1549 non-null   int64  
 3   Percent        1549 non-null   float64
 4   Age            1549 non-null   int64  
 5   Sex            1549 non-null   object 
 6   SmokingStatus  1549 non-null   object 
dtypes: float64(1), int64(3), object(3)
memory usage: 84.8+ KB


In [19]:
# Summarise the test DataFrame: number of entries, column dtypes,
# non-null counts per column and memory usage.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Patient        5 non-null      object 
 1   Weeks          5 non-null      int64  
 2   FVC            5 non-null      int64  
 3   Percent        5 non-null      float64
 4   Age            5 non-null      int64  
 5   Sex            5 non-null      object 
 6   SmokingStatus  5 non-null      object 
dtypes: float64(1), int64(3), object(3)
memory usage: 408.0+ bytes


In [20]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the
# numeric columns of the training data.
train_data.describe()

Unnamed: 0,Weeks,FVC,Percent,Age
count,1549.0,1549.0,1549.0,1549.0
mean,31.861846,2690.479019,77.672654,67.188509
std,23.24755,832.770959,19.823261,7.057395
min,-5.0,827.0,28.877577,49.0
25%,12.0,2109.0,62.8327,63.0
50%,28.0,2641.0,75.676937,68.0
75%,47.0,3171.0,88.621065,72.0
max,133.0,6399.0,153.145378,88.0


In [21]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the
# numeric columns of the test data.
test_data.describe()

Unnamed: 0,Weeks,FVC,Percent,Age
count,5.0,5.0,5.0,5.0
mean,8.8,2781.6,75.997702,71.8
std,7.049823,516.475846,4.97096,2.167948
min,0.0,1930.0,70.186855,68.0
25%,6.0,2739.0,71.824968,72.0
50%,6.0,2925.0,76.672493,73.0
75%,15.0,3020.0,79.258903,73.0
max,17.0,3294.0,82.045291,73.0


In [22]:
# Check whether there are any missing values in the train dataset:
# count the missing entries in each column.
train_data.isna().sum()

Patient          0
Weeks            0
FVC              0
Percent          0
Age              0
Sex              0
SmokingStatus    0
dtype: int64

In [23]:
# Same missing-value check for the test dataset: missing entries per column.
test_data.isna().sum()

Patient          0
Weeks            0
FVC              0
Percent          0
Age              0
Sex              0
SmokingStatus    0
dtype: int64

- No null values are present in either the train or the test set

In [24]:
# Mean, standard deviation and record count of FVC for every
# (Sex, SmokingStatus) group, ordered by Sex and then count, both descending.
(
    train_data
    .groupby(['Sex', 'SmokingStatus'])['FVC']
    .agg(['mean', 'std', 'count'])
    .sort_values(by=['Sex', 'count'], ascending=False)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,count
Sex,SmokingStatus,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Male,Ex-smoker,2886.024705,769.105923,931
Male,Never smoked,2878.034934,667.921434,229
Male,Currently smokes,3300.828125,694.108554,64
Female,Never smoked,1775.985,584.208226,200
Female,Ex-smoker,1901.906542,450.294485,107
Female,Currently smokes,2868.833333,71.124787,18


- Mean FVC is highest among patients who currently smoke, for both sexes. However, that group is small (64 male and 18 female records), so we cannot generalise that people who smoke have a higher FVC.

In [25]:
# Mean, standard deviation and record count of FVC for every
# (Sex, SmokingStatus) group in the test data, ordered by Sex and then count,
# both descending.
(
    test_data
    .groupby(['Sex', 'SmokingStatus'])['FVC']
    .agg(['mean', 'std', 'count'])
    .sort_values(by=['Sex', 'count'], ascending=False)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,count
Sex,SmokingStatus,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Male,Ex-smoker,2745.75,589.147619,4
Male,Never smoked,2925.0,,1


#### Counting the number of records and unique patients in both the train and test sets

In [35]:
# Train data: count() returns the number of non-null entries in the Patient
# column, i.e. the number of rows, not the number of distinct patients.
print(
    'train_data patient count is : {}'.format(
        train_data['Patient'].count()
    )
)

train_data patient count is : 1549


In [36]:
# Test data: count() returns the number of non-null entries in the Patient
# column, i.e. the number of rows, not the number of distinct patients.
print(
    'test_data patient count is : {}'.format(
        test_data['Patient'].count()
    )
)

test_data patient count is : 5


In [39]:
# Number of distinct patient IDs in the training data.
print(
    'train_data unique patient count is : {}'.format(
        train_data['Patient'].nunique()
    )
)

train_data unique patient count is : 176


In [41]:
# Number of distinct patient IDs in the test data.
print(
    'test_data unique patient count is : {}'.format(
        test_data['Patient'].nunique()
    )
)

test_data unique patient count is : 5
