In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
# Read both CSV files from the current working directory.
titanic_train = pd.read_csv('train.csv')
titanic_test = pd.read_csv('test.csv')

# Number of training rows; used later to split the combined frame back apart.
n_train = len(titanic_train)
# Kept aside so the submission can be keyed by the original test IDs.
test_passenger_ids = titanic_test['PassengerId']

# Stack train on top of test with a fresh 0..N-1 index so that both parts
# go through exactly the same preprocessing.
full_df = pd.concat((titanic_train, titanic_test), ignore_index=True)

In [None]:
def preprocessing(df):
    """Turn a raw passenger frame into an all-numeric feature frame.

    The input is copied first and never modified, so the function can be
    called repeatedly on the same frame (e.g. when a notebook cell is re-run).

    Steps:
      * fill missing ``Embarked`` with the most frequent value;
      * extract a ``Title`` from ``Name`` (the word directly before a dot) and
        collapse the uncommon ones into ``'Rare'``;
      * fill missing ``Age`` / ``Fare`` with the median of the passenger's
        title group, falling back to the overall column median when the group
        has no observed value or no title could be extracted;
      * derive ``Deck`` as the first character of ``Cabin`` (``'U'`` when the
        cabin is missing);
      * drop ``Name``, ``Ticket`` and ``Cabin``;
      * one-hot encode ``Embarked``, ``Deck`` and ``Title`` as 0/1 integers;
      * replace ``SibSp`` / ``Parch`` with ``FamilySize = SibSp + Parch + 1``;
      * map ``Sex`` to 0 (male) / 1 (female).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Name``, ``Sex``, ``Age``, ``SibSp``,
        ``Parch``, ``Ticket``, ``Fare``, ``Cabin`` and ``Embarked``. Any other
        columns are passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and index as ``df``.
    """
    # Work on a private copy: the caller's frame must stay intact.
    df = df.copy()

    # mode() is empty when the column holds no values at all; skip the fill then.
    embarked_mode = df['Embarked'].mode()
    if not embarked_mode.empty:
        df['Embarked'] = df['Embarked'].fillna(embarked_mode.iloc[0])

    rare_titles = ['Capt', 'Col', 'Countess', 'Don', 'Dr', 'Jonkheer', 'Lady',
                   'Major', 'Mlle', 'Mme', 'Ms', 'Rev', 'Sir', 'Dona']
    df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    df['Title'] = df['Title'].replace(rare_titles, 'Rare')

    for column in ('Age', 'Fare'):
        # Per-title median first; rows whose title group has no observed value
        # (or whose title is missing) would stay NaN, so fall back to the
        # overall median of the observed values.
        title_median = df.groupby('Title')[column].transform('median')
        overall_median = df[column].median()
        df[column] = df[column].fillna(title_median).fillna(overall_median)

    # 'U' stands in for a missing cabin.
    df['Deck'] = df['Cabin'].fillna('U').str[0]

    df = df.drop(columns=['Name', 'Ticket', 'Cabin'])

    # Replace each categorical column with 0/1 indicator columns appended at
    # the end, in this order: Embarked_*, Deck_*, Title_*.
    for column in ('Embarked', 'Deck', 'Title'):
        indicators = pd.get_dummies(df[column], prefix=column).astype(int)
        df = pd.concat([df.drop(columns=column), indicators], axis=1)

    # +1 counts the passenger themselves.
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df = df.drop(columns=['SibSp', 'Parch'])

    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

    return df

In [None]:
# Work on a copy so full_df stays exactly as loaded and this cell can be
# re-run safely, whatever preprocessing does to the frame it is given.
processed_df = preprocessing(full_df.copy())

# The first n_train rows came from the training file, the rest from the test file.
train_df = processed_df.iloc[:n_train]
test_df = processed_df.iloc[n_train:]

# Guard against preprocessing adding or removing rows, which would shift the split.
assert len(train_df) == n_train, "training split has the wrong number of rows"
assert len(train_df) + len(test_df) == len(full_df), "preprocessing changed the row count"

In [None]:
# Features and target from the labelled rows; PassengerId is dropped along
# with the target so it is not used as a feature.
X = train_df.drop(['Survived', 'PassengerId'], axis=1)
y = train_df['Survived']

# Hold out 20% of the labelled rows for a quick validation estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)

# Same max_iter as final_model (the one fitted on all labelled rows), so the
# score printed below describes the configuration used for the submission.
model = LogisticRegression(max_iter=500, random_state=3)
model.fit(X_train, y_train)

# Deliberately not called `predictions`: that name holds the test-set
# predictions written to the submission file, and re-running this cell
# must not overwrite them.
val_predictions = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, val_predictions)}")
print(classification_report(y_test, val_predictions))

In [None]:
# Refit on every labelled row (no hold-out) before predicting the test rows.
y_full_train = train_df['Survived']
X_full_train = train_df.drop(columns=['Survived', 'PassengerId'])
final_model = LogisticRegression(max_iter=500, random_state=11).fit(X_full_train, y_full_train)

# The test rows carry a Survived column too (a by-product of concatenating
# train and test), so it is dropped together with PassengerId.
X_full_test = test_df.drop(columns=['PassengerId', 'Survived'])
predictions = final_model.predict(X_full_test)
print(predictions)

In [None]:
# Pair each test PassengerId with its predicted label, store the label as an
# integer, and write the result without the index column.
submission_df = pd.DataFrame(
    {'PassengerId': test_passenger_ids, 'Survived': predictions}
).astype({'Survived': int})
submission_df.to_csv('submission.csv', index=False)