In [1]:
import numpy as np 
import pandas as pd 
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import os
# List every file available under the Kaggle input directory, one path per line.
input_files = [
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_files:
    print(path)

# Load the labelled training set and the unlabelled competition test set.
data = pd.read_csv('/kaggle/input/titanic-machine-learning-from-disaster/train.csv')
test = pd.read_csv('/kaggle/input/titanic-machine-learning-from-disaster/test.csv')
# Keep the test ids aside: clean() drops 'PassengerId', but the submission
# file written at the end of the notebook needs them.
test_ids = test['PassengerId']

/kaggle/input/titanic-machine-learning-from-disaster/train.csv
/kaggle/input/titanic-machine-learning-from-disaster/test.csv


In [2]:
def clean(data):
    """Drop identifier-like columns and impute missing values.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the columns "Ticket", "Cabin", "Name",
        "PassengerId", "Age", "Fare", "Parch", "SibSp" and "Embarked".

    Returns
    -------
    pandas.DataFrame
        A new frame without the four dropped columns, where missing values in
        'Age', 'Fare', 'Parch' and 'SibSp' are replaced by that column's
        median (computed on the frame passed in) and missing 'Embarked'
        values are replaced by the placeholder category 'U'.
        The input frame is not modified.

    Raises
    ------
    KeyError
        If any of the columns listed above is absent.
    """
    # drop() returns a new frame, so the caller's frame is left untouched.
    data = data.drop(columns=["Ticket", "Cabin", "Name", "PassengerId"])

    # Assign the filled column back explicitly. `data[col].fillna(...,
    # inplace=True)` operates on a temporary selection and does not update the
    # frame when pandas Copy-on-Write is active.
    cols = ['Age', 'Fare', 'Parch', 'SibSp']
    for col in cols:
        data[col] = data[col].fillna(data[col].median())

    data['Embarked'] = data['Embarked'].fillna('U')
    return data

In [3]:
# Replace both raw frames with their cleaned versions.
# NOTE: this cell is not re-runnable on its own; clean() drops columns that
# are no longer present after the first pass, so a second run raises KeyError.
data, test = clean(data), clean(test)

# Remaining missing values per column: test frame first, then training frame.
print(test.isnull().sum(), data.isnull().sum(), sep='\n')

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64
Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64


In [4]:
# Turn the categorical columns into integer codes.
# A single encoder instance is refit for every column, so once the loop has
# finished `le` only holds the mapping of the last column processed.
le = LabelEncoder()
cols = ['Sex', 'Embarked']

for col in cols:
    # Learn the categories from the training frame only, then reuse that
    # mapping for the test frame so both share the same codes.
    train_codes = le.fit_transform(data[col])
    test_codes = le.transform(test[col])
    data[col], test[col] = train_codes, test_codes

    # Show the learned categories; position in the array is the integer code.
    print(le.classes_)

['female' 'male']
['C' 'Q' 'S' 'U']


In [5]:
# Unfitted estimator instance.
# NOTE(review): `lr` appears unused in the visible cells -- the model that is
# actually trained is `clf`; confirm before removing.
lr = LogisticRegression()

# Separate the feature matrix from the target column.
X = data.drop(columns='Survived')
y = data['Survived']

# Hold out 20% of the labelled rows for evaluation; the seed is fixed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Sanity check: count of missing values per training feature.
print(X_train.isnull().sum())

# Defensive imputation of 'Age' after the split. clean() has already filled
# this column, so on the data shown here it is a no-op.
# - Uses non-inplace fillna and rebinds the names: calling
#   `frame['Age'].fillna(..., inplace=True)` acts on a temporary selection and
#   does not update the frame when pandas Copy-on-Write is active.
# - The fill value comes from the training split only, so no statistic of the
#   holdout split is used when preparing the holdout features.
age_fill = X_train['Age'].mean()
X_train = X_train.fillna({'Age': age_fill})
X_test = X_test.fillna({'Age': age_fill})

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64


In [7]:
# Build the classifier with a fixed seed and a raised iteration cap, then fit
# it on the training split. fit() returns the estimator itself, so `clf` ends
# up bound to the fitted model.
clf = LogisticRegression(max_iter=1000, random_state=0)
clf = clf.fit(X_train, y_train)

In [8]:
# Predict on the held-out split and score against its true labels.
predictions = clf.predict(X_test)
# scikit-learn metrics take (y_true, y_pred). Accuracy gives the same value
# either way, but keeping the documented order avoids wrong results if this
# call is later swapped for an asymmetric metric such as f1_score.
test_score = accuracy_score(y_test, predictions)

# Accuracy as a percentage, truncated (not rounded) to one decimal place.
print(int(test_score * 1000) / 10)

81.0


In [9]:
# Predict survival for the competition test set with the fitted model.
submission_predictions = clf.predict(test)

# Pair each prediction with the passenger id saved before cleaning.
df = pd.DataFrame({'PassengerId': test_ids.values}).assign(
    Survived=submission_predictions
)

# Write the submission file without the DataFrame index column.
df.to_csv('./submission.csv', index=False)