In [54]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
import warnings

In [55]:
warnings.filterwarnings("ignore")

In [56]:
df = pd.read_csv("Assets/Titanic_Dataset.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [57]:
df['Sex'] = LabelEncoder().fit_transform(df['Sex'])
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,S


In [58]:
df['Embarked'] = LabelEncoder().fit_transform(df['Embarked'])
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,0
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,2
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,2


In [59]:
df = df.drop(columns = ['Name'], axis = 1)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,1,22.0,1,0,A/5 21171,7.25,,2
1,2,1,1,0,38.0,1,0,PC 17599,71.2833,C85,0
2,3,1,3,0,26.0,0,0,STON/O2. 3101282,7.925,,2
3,4,1,1,0,35.0,1,0,113803,53.1,C123,2
4,5,0,3,1,35.0,0,0,373450,8.05,,2


In [60]:
df = df.drop(columns = ['Ticket'], axis = 1)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,1,0,3,1,22.0,1,0,7.25,,2
1,2,1,1,0,38.0,1,0,71.2833,C85,0
2,3,1,3,0,26.0,0,0,7.925,,2
3,4,1,1,0,35.0,1,0,53.1,C123,2
4,5,0,3,1,35.0,0,0,8.05,,2


In [61]:
df = df.drop(columns = ['Cabin'], axis = 1)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,1,22.0,1,0,7.25,2
1,2,1,1,0,38.0,1,0,71.2833,0
2,3,1,3,0,26.0,0,0,7.925,2
3,4,1,1,0,35.0,1,0,53.1,2
4,5,0,3,1,35.0,0,0,8.05,2


In [62]:
df = df[["Survived", "Pclass", "Age","Sex", "SibSp", "Parch", "Fare"]]
df.head()

Unnamed: 0,Survived,Pclass,Age,Sex,SibSp,Parch,Fare
0,0,3,22.0,1,1,0,7.25
1,1,1,38.0,0,1,0,71.2833
2,1,3,26.0,0,0,0,7.925
3,1,1,35.0,0,1,0,53.1
4,0,3,35.0,1,0,0,8.05


In [63]:
# Split the data into rows with and without missing Age
known_age = df[df['Age'].notnull()]
unknown_age = df[df['Age'].isnull()]

In [64]:
X_train = known_age.drop(columns = ["Age"])
y_train = known_age["Age"]

In [65]:
X_missing_age = unknown_age.drop(columns = ["Age"])

In [66]:
X_missing_age

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Fare
5,0,3,1,0,0,8.4583
17,1,2,1,0,0,13.0000
19,1,3,0,0,0,7.2250
26,0,3,1,0,0,7.2250
28,1,3,0,0,0,7.8792
...,...,...,...,...,...,...
859,0,3,1,0,0,7.2292
863,0,3,0,8,2,69.5500
868,0,3,1,0,0,9.5000
878,0,3,1,0,0,7.8958


In [67]:
#X_train = known_age.drop(columns = ["Name"], inplace=True)
#X_train = known_age.drop(columns = ["Ticket"], inplace=True)
X_train

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Fare
0,0,3,1,1,0,7.2500
1,1,1,0,1,0,71.2833
2,1,3,0,0,0,7.9250
3,1,1,0,1,0,53.1000
4,0,3,1,0,0,8.0500
...,...,...,...,...,...,...
885,0,3,0,0,5,29.1250
886,0,2,1,0,0,13.0000
887,1,1,0,0,0,30.0000
889,1,1,1,0,0,30.0000


In [68]:
model = LinearRegression()
model.fit(X_train, y_train)

In [74]:
model.coef_

array([-6.93305353, -8.03861316, -0.29063071, -4.00072406, -0.8209266 ,
       -0.01497836])

In [75]:
model.intercept_

53.60396234398061

In [69]:
# Step 3: Predict missing Age values
predicted_ages = model.predict(X_missing_age)

In [70]:
predicted_ages

array([29.07080066, 30.10833306, 22.44685065, 29.08927347, 22.43705181,
       29.07922599, 32.43692984, 22.43898701, 22.15615704, 29.07922599,
       29.07691632, 24.96460346, 22.43898701, 20.8713251 , 37.80993305,
       44.85950626, 17.21443083, 29.07922599, 29.07691632, 22.43842532,
       29.07691632, 29.07691632, 29.07922599, 22.14798185, 18.1926178 ,
       29.07691632, 29.08140983, 17.39852793, 27.61791252, 29.08796287,
       29.06774208, -5.49189866, 36.98755908, 44.88640441, 15.9929439 ,
       -5.20126796, 37.01068094, 44.52580031, 18.32218064, 29.08140983,
       22.43898701, -5.49189866, 25.08068578, 29.07922599, 16.2835746 ,
       29.37503621, 25.27089854, 18.32218064, 29.08889901, 37.44600929,
       29.08140983, 29.37204054, 44.81038921, 22.43898701, 37.23610531,
       44.88528103, 44.85950626, 37.88482487, 22.43898701, 13.91474356,
       30.40869971, 29.07691632, 36.97144531, -5.49189866, 14.20537427,
       32.62971335, 29.07922599, 18.31319362, 44.75047576, 29.08

In [71]:
df.loc[df['Age'].isnull(), 'Age'] = predicted_ages

In [73]:
df.isnull().sum()

Survived    0
Pclass      0
Age         0
Sex         0
SibSp       0
Parch       0
Fare        0
dtype: int64

In [76]:
df.to_csv("Assets/predict_age_titanic.csv",header= True)