##  5. Filling missing values with a Regression Model :

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('Titanic_Kaggle.csv', usecols=['Survived','Pclass','Sex','Age','SibSp','Parch','Fare'])
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,male,22.0,1,0,7.25
1,1,1,female,38.0,1,0,71.2833
2,1,3,female,26.0,0,0,7.925
3,1,1,female,35.0,1,0,53.1
4,0,3,male,35.0,0,0,8.05


## 1. Check missing values :

In [3]:
data.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
dtype: int64

### Note :
> * We have missing values in __'Age'__ variable.
> * Before going further we will change the categorical variable into numerical by __Label-Encoding__
> * So it would be easier for operations.

## 2. Label-Encoding :

In [5]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
data['Sex'] = le.fit_transform(data['Sex'])

new_data = data

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    int32  
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
dtypes: float64(2), int32(1), int64(4)
memory usage: 45.4 KB


## 3. Building Linear Regression Model : 📈
> * In this case, the null values in one column are filled by fitting a __regression model__ using other columns in the dataset.
> * i.e in this case the regression model will contain all the columns except Age in X and Age in Y. (since we are filling missing values in Age column by fitting a linear model)
> * Then after filling the values in the Age column, then we will use logistic regression to calculate accuracy.

In [6]:
from sklearn.linear_model import LinearRegression
linear = LinearRegression()

data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,1,22.0,1,0,7.25
1,1,1,0,38.0,1,0,71.2833
2,1,3,0,26.0,0,0,7.925
3,1,1,0,35.0,1,0,53.1
4,0,3,1,35.0,0,0,8.05


In [9]:
x_test_df = data[data['Age'].isnull()==True]
x_train_df = data[data['Age'].isnull()==False]

y_train = x_train_df['Age']
x_train_df.drop('Age', axis = 1, inplace = True)

linear.fit(x_train_df, y_train)

x_test_df.drop('Age', axis=1, inplace=True)

pred = linear.predict(x_test_df)

x_test_df['Age'] = pred

In [10]:
x_test_df.head()

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Fare,Age
5,0,3,1,0,0,8.4583,29.070801
17,1,2,1,0,0,13.0,30.108333
19,1,3,0,0,0,7.225,22.446851
26,0,3,1,0,0,7.225,29.089273
28,1,3,0,0,0,7.8792,22.437052


In [11]:
x_train_df['Age'] = y_train

y_train = x_train_df['Survived']

x_train_df.drop('Survived', axis=1, inplace=True)

## 4. Logistic-Regression :

In [12]:
from sklearn.linear_model import LogisticRegression
logistic = LogisticRegression()

logistic.fit(x_train_df, y_train)

LogisticRegression()

In [13]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

LogisticRegression()

In [14]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [15]:
y_test = x_test_df['Survived']
x_test_df.drop('Survived', axis=1, inplace=True)

pred2 = logistic.predict(x_test_df)
print('Accuracy:', round(accuracy_score(pred2, y_test)*100,2),'%')

Accuracy: 83.62 %


In [16]:
cm = confusion_matrix(y_test, pred2)
print(cm)

[[112  13]
 [ 16  36]]


# Note : 📝
>* We got accuracy of 83.62%.
> * Then we pick this model as it gives more accuracy.
>* In confusion matrix if you check first diagonal, i.e 112, 36 are our accurate predictions and 13, 16 are our errors.
>* We can also use models  KNN for filling the missing values. But sometimes, using models for imputation can result in overfitting the data.
>* __But you have to understand that There is no perfect way for filling the missing values in a dataset.__
>* Each of the methods that we've used, may work well with different types of datasets. 
>* You have to experiment through different methods, to check which method works the best for your dataset.