In [289]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import Binarizer

In [290]:
# Read only the training columns used downstream. Sex is shortened while
# loading: 'male' -> 'M', 'female' -> 'F', any other value is kept as-is.
df = pd.read_csv(
    'train.csv',
    usecols=['Sex', 'SibSp', 'Age', 'Survived', 'Pclass', 'Embarked', 'Parch', 'PassengerId'],
    converters={'Sex': lambda raw: {'male': 'M', 'female': 'F'}.get(raw, raw)},
)
# Percentage of missing values per column.
df.isna().mean() * 100

PassengerId     0.000000
Survived        0.000000
Pclass          0.000000
Sex             0.000000
Age            19.865320
SibSp           0.000000
Parch           0.000000
Embarked        0.224467
dtype: float64

In [291]:
# Embarked is missing in only ~0.22% of rows, so those rows are simply dropped.
df = df.dropna(subset=['Embarked'])

In [292]:
# Inspect the training frame after removing the rows without an Embarked value.
df

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,1,0,3,M,22.0,1,0,S
1,2,1,1,F,38.0,1,0,C
2,3,1,3,F,26.0,0,0,S
3,4,1,1,F,35.0,1,0,S
4,5,0,3,M,35.0,0,0,S
...,...,...,...,...,...,...,...,...
886,887,0,2,M,27.0,0,0,S
887,888,1,1,F,19.0,0,0,S
888,889,0,3,F,,1,2,S
889,890,1,1,M,26.0,0,0,C


In [293]:
# Re-check the share of missing values (in percent); only Age should still have gaps.
df.isna().mean() * 100

PassengerId     0.000000
Survived        0.000000
Pclass          0.000000
Sex             0.000000
Age            19.910011
SibSp           0.000000
Parch           0.000000
Embarked        0.000000
dtype: float64

In [294]:
# Combine the two relative counts into a single Family feature and discard the
# originals. A new frame is built with assign/drop instead of writing a column
# into the frame returned by dropna, which avoids pandas' SettingWithCopyWarning.
df = (
    df
    .assign(Family=df['SibSp'] + df['Parch'])
    .drop(columns=['SibSp', 'Parch'])
)

In [295]:
# y keeps PassengerId next to the Survived label; X is every remaining column.
y = df[['Survived', 'PassengerId']]
X = df.drop(columns=list(y.columns))

In [296]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)

In [297]:
# Integer-encode Sex. The encoder is fitted on the training split only and then
# reused for every other frame, so the mapping ('F' -> 0, 'M' -> 1) stays consistent.
le = LabelEncoder().fit(X_train['Sex'])
# X_train / X_test are slices produced by train_test_split; rebinding them via
# assign() builds new frames instead of writing into those slices, which avoids
# pandas' SettingWithCopyWarning.
X_train = X_train.assign(Sex=le.transform(X_train['Sex']))
X_test = X_test.assign(Sex=le.transform(X_test['Sex']))

In [298]:
# Preprocessing for the feature frame (columns: Pclass, Sex, Age, Embarked, Family).
# Columns are selected by name rather than by position, so a frame whose columns
# arrive in a different order cannot silently feed the wrong column to a step.
# Output layout: one-hot Embarked, imputed Age, binarized Family, then the
# untouched remainder columns.
# NOTE(review): IterativeImputer is given only the Age column, so it has no other
# feature to model Age from -- confirm whether a plain mean/median fill was intended.
trf1 = ColumnTransformer([
    ('trf3', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore', dtype=np.int32), ['Embarked']),
    ('trf4', IterativeImputer(), ['Age']),
    # Binarizer with its default threshold (0 per the scikit-learn docs -- confirm):
    # turns the Family count into "has any family aboard".
    ('trf5', Binarizer(), ['Family']),
], remainder='passthrough')

In [299]:
# Final classifier: logistic regression with all default settings.
trf2 = LogisticRegression()

In [300]:
# Inspect the training features: Sex is now integer-coded and Family replaces SibSp/Parch.
X_train

Unnamed: 0,Pclass,Sex,Age,Embarked,Family
708,1,0,22.0,S,0
240,3,0,,C,1
382,3,1,32.0,S,0
792,3,0,,S,10
683,3,1,14.0,S,7
...,...,...,...,...,...
107,3,1,,S,0
271,3,1,25.0,S,0
862,1,0,48.0,S,0
436,3,0,21.0,S,4


# Now predict on the test data

In [301]:
# Read the same feature columns from the test file (there is no Survived column
# here). Sex is shortened while loading exactly as for the training data:
# 'male' -> 'M', 'female' -> 'F', any other value is kept as-is.
test = pd.read_csv(
    'test.csv',
    usecols=['Sex', 'SibSp', 'Age', 'Pclass', 'Embarked', 'Parch', 'PassengerId'],
    converters={'Sex': lambda raw: {'male': 'M', 'female': 'F'}.get(raw, raw)},
)

In [302]:
# Inspect the raw test frame as loaded.
test

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Embarked
0,892,3,M,34.5,0,0,Q
1,893,3,F,47.0,1,0,S
2,894,2,M,62.0,0,0,Q
3,895,3,M,27.0,0,0,S
4,896,3,F,22.0,1,1,S
...,...,...,...,...,...,...,...
413,1305,3,M,,0,0,S
414,1306,1,F,39.0,0,0,C
415,1307,3,M,38.5,0,0,S
416,1308,3,M,,0,0,S


In [303]:
# Encode Sex with the encoder fitted on the training split (transform only, no refit).
test = test.assign(Sex=le.transform(test['Sex']))

In [304]:
# Set the passenger ids aside for the submission file, then remove them from the
# feature frame so they are not fed to the model.
test_final = test.loc[:, 'PassengerId']
test = test.drop('PassengerId', axis=1)
# Number of test passengers.
test_final.shape

(418,)

In [305]:
# Build the Family feature the same way as for the training data and discard
# the two source columns; Family ends up as the last column.
test = (
    test
    .assign(Family=test['SibSp'] + test['Parch'])
    .drop(columns=['SibSp', 'Parch'])
)

In [306]:
# Inspect the test features, which now have the same columns as X_train.
test

Unnamed: 0,Pclass,Sex,Age,Embarked,Family
0,3,1,34.5,Q,0
1,3,0,47.0,S,1
2,2,1,62.0,Q,0
3,3,1,27.0,S,0
4,3,0,22.0,S,2
...,...,...,...,...,...
413,3,1,,S,0
414,1,0,39.0,C,0
415,3,1,38.5,S,0
416,3,1,,S,0


In [307]:
# Fit the preprocessing on the training split only, so nothing is learned from
# the held-out or test rows.
trf1.fit(X_train)

In [320]:
# Apply the fitted preprocessing to the training split first, then the held-out split.
X_train_trf, X_test_trf = (trf1.transform(split) for split in (X_train, X_test))

In [321]:
# Train the classifier on the preprocessed training features. y_train also
# carries PassengerId, so only its Survived column is passed as the target.
trf2.fit(X_train_trf,y_train['Survived'])

In [323]:
# Predict survival for the held-out split.
y_predict = trf2.predict(X_test_trf)

In [324]:
# Hold-out accuracy. scikit-learn metrics take (y_true, y_pred) in that order;
# accuracy happens to be symmetric, but keeping the documented order avoids
# wrong results if this call is ever swapped for an asymmetric metric.
accuracy_score(y_test['Survived'], y_predict)

0.7808988764044944

In [325]:
# Apply the already-fitted preprocessing to the test features (transform only, no refit).
test_trf = trf1.transform(test)

In [326]:
# Predict survival for the test passengers.
Survived = trf2.predict(test_trf)

In [327]:
# Wrap the predictions in a one-column DataFrame named 'Survived'.
# Note: this rebinds Survived from the raw predictions to a DataFrame.
Survived = pd.DataFrame(Survived, columns  = ['Survived'])
# Preview the predictions.
Survived

Unnamed: 0,Survived
0,0
1,0
2,0
3,0
4,1
...,...
413,0
414,1
415,0
416,0


In [328]:
# Pair each PassengerId with its prediction by row position.
# pd.concat(axis=1) aligns on index labels, so both indexes are reset first;
# otherwise ids and predictions would be mismatched (or padded with NaN) whenever
# the test frame does not carry a default 0..n-1 index.
assert len(test_final) == len(Survived), "expected exactly one prediction per test passenger"
Submission_final = pd.concat(
    [test_final.reset_index(drop=True), Survived.reset_index(drop=True)],
    axis=1,
)

In [329]:
# Inspect the submission table (PassengerId, Survived).
Submission_final

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [319]:
# Write the submission file without the DataFrame index column.
# NOTE(review): this cell's execution count (319) is lower than that of the cells
# that build Submission_final (328/329), so the saved file may come from an
# earlier run -- re-run this cell last.
Submission_final.to_csv('Submission_final.csv', index =False)