In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [2]:
data = fetch_openml("titanic", version=1, as_frame= True)
df = data.frame

In [3]:
print(data.DESCR)

**Author**: Frank E. Harrell Jr., Thomas Cason  
**Source**: [Vanderbilt Biostatistics](http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.html)  
**Please cite**:   

The original Titanic dataset, describing the survival status of individual passengers on the Titanic. The titanic data does not contain information from the crew, but it does contain actual ages of half of the passengers. The principal source for data about Titanic passengers is the Encyclopedia Titanica. The datasets used here were begun by a variety of researchers. One of the original sources is Eaton & Haas (1994) Titanic: Triumph and Tragedy, Patrick Stephens Ltd, which includes a passenger list created by many researchers and edited by Michael A. Findlay.

Thomas Cason of UVa has greatly updated and improved the titanic data frame using the Encyclopedia Titanica and created the dataset here. Some duplicate passengers have been dropped, many errors corrected, many missing ages filled in, and new variable

In [4]:
print(df.columns)

Index(['pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket',
       'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest'],
      dtype='object')


In [5]:
df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [6]:
df.isnull().sum()

pclass          0
survived        0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64

In [7]:
df.drop(columns=['name', 'cabin', 'body', 'home.dest', 'ticket', 'boat'], inplace=True)

In [8]:
def missing_val(df):
    for col in df.columns:
        if df[col].dtype == 'object' or df[col].dtype.name == 'category':
            df[col] = df[col].fillna(df[col].mode()[0])
        else:
            df[col] = df[col].fillna(df[col].mean())
missing_val(df)

In [9]:
df.isnull().sum()

pclass      0
survived    0
sex         0
age         0
sibsp       0
parch       0
fare        0
embarked    0
dtype: int64

In [10]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

encoder = OneHotEncoder(drop='first', sparse_output=False) 
encoded = encoder.fit_transform(df[['sex']])
encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(['sex']))

# Combine with original df (after dropping original 'sex' column)
df = df.drop('sex', axis=1)
df = pd.concat([df, encoded_df], axis=1)

print(df.head())


   pclass survived      age  sibsp  parch      fare embarked  sex_male
0       1        1  29.0000      0      0  211.3375        S       0.0
1       1        1   0.9167      1      2  151.5500        S       1.0
2       1        0   2.0000      1      2  151.5500        S       0.0
3       1        0  30.0000      1      2  151.5500        S       1.0
4       1        0  25.0000      1      2  151.5500        S       0.0


In [11]:
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded = encoder.fit_transform(df[['embarked']])
encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(['embarked']))

df = df.drop('embarked', axis=1)
df = pd.concat([df, encoded_df], axis=1)
print(df.head())

   pclass survived      age  sibsp  parch      fare  sex_male  embarked_Q  \
0       1        1  29.0000      0      0  211.3375       0.0         0.0   
1       1        1   0.9167      1      2  151.5500       1.0         0.0   
2       1        0   2.0000      1      2  151.5500       0.0         0.0   
3       1        0  30.0000      1      2  151.5500       1.0         0.0   
4       1        0  25.0000      1      2  151.5500       0.0         0.0   

   embarked_S  
0         1.0  
1         1.0  
2         1.0  
3         1.0  
4         1.0  


In [12]:
df.dtypes

pclass           int64
survived      category
age            float64
sibsp            int64
parch            int64
fare           float64
sex_male       float64
embarked_Q     float64
embarked_S     float64
dtype: object

In [13]:
X = df.drop(columns='survived') # all the features except the survived 
y= df['survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [14]:
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
result = pd.DataFrame({
    'Actual': y_test,
    'Predicted': y_pred
}) 
print(result.head())

     Actual Predicted
201       0         0
115       0         0
255       1         1
1103      0         0
195       1         1


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
