In [77]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.impute import SimpleImputer

In [78]:
df = pd.read_csv("/content/Titanic-Dataset_logicalregression.csv")

df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [79]:
df.isnull().sum()

Unnamed: 0,0
PassengerId,0
Survived,0
Pclass,0
Name,0
Sex,0
Age,177
SibSp,0
Parch,0
Ticket,0
Fare,0


In [13]:
print('Percent of missing "Cabin" records is %.2f%%' %((df['Cabin'].isnull().sum()/df.shape[0])*100))
print('Percent of missing "Age" records is %.2f%%' %((df['Age'].isnull().sum()/df.shape[0])*100))

Percent of missing "Cabin" records is 77.10%
Percent of missing "Age" records is 19.87%


In [75]:
median_age = df["Age"].median(skipna=True)
print('The median of "Age" is %.2f' %(median_age))
df["Age"].fillna(median_age, inplace=True)
df

The median of "Age" is 28.00


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Age"].fillna(median_age, inplace=True)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,28.0,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [76]:
if 'Cabin' in df.columns:
    df.drop('Cabin', axis=1, inplace=True)
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,S
...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,28.0,1,2,W./C. 6607,23.4500,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C


In [25]:
df["Embarked"].fillna(df['Embarked'].value_counts().idxmax(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Embarked"].fillna(df['Embarked'].value_counts().idxmax(), inplace=True)


In [31]:
df.isnull().sum()

Unnamed: 0,0
PassengerId,0
Survived,0
Name,0
Age,0
SibSp,0
Parch,0
Ticket,0
Fare,0
TravelAlone,0
IsMinor,0


In [65]:
columns_to_drop = ['Name', 'Ticket', 'PassengerId',  'Fare', 'Pclass_2','Pclass_3']
existing_columns_to_drop = [col for col in columns_to_drop if col in df.columns]
if existing_columns_to_drop:
    df.drop(existing_columns_to_drop, axis=1, inplace=True)

In [66]:
if 'Survived' in df.columns:
    X = df.drop('Survived', axis=1)
    y = df['Survived']
else:
    print("Error: 'Survived' column not found in the DataFrame.")

    X = None


In [67]:
if X is not None:

    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [68]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [69]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, y_train)

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

y_pred = model.predict(X_test)

print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("\nConfus   ion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy Score: 0.7985074626865671

Confusion Matrix:
 [[137  20]
 [ 34  77]]

Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.87      0.84       157
           1       0.79      0.69      0.74       111

    accuracy                           0.80       268
   macro avg       0.80      0.78      0.79       268
weighted avg       0.80      0.80      0.80       268



In [70]:


print("Please enter the details of the passenger:")


sex_input = input("Sex (male/female): ").strip().lower()

sex_male = 1 if sex_input == 'male' else 0


age = float(input("Age (e.g., 28): "))
sibsp = int(input("Number of siblings/spouses aboard (SibSp): "))
parch = int(input("Number of parents/children aboard (Parch): "))



embarked_input = input("Port of embarkation (Q = Queenstown, S = Southampton, C = Cherbourg): ").strip().upper()


travel_alone = 1 if (sibsp + parch) == 0 else 0


embarked_Q = 1 if embarked_input == 'Q' else 0
embarked_S = 1 if embarked_input == 'S' else 0

user_data = pd.DataFrame([[
    age,
    sibsp,
    parch,
    travel_alone,
    embarked_Q,
    embarked_S,
    sex_male
]], columns=['Age', 'SibSp', 'Parch',  'TravelAlone',  'Embarked_Q', 'Embarked_S', 'Sex_male'])



try:
    user_data_scaled = scaler.transform(user_data)


    prediction = model.predict(user_data_scaled)


    print("Prediction:  Survived!" if prediction[0] == 1 else "Prediction:  Did not survive.")

except NameError as e:
    print(f"Error: {e}. Make sure 'scaler' and 'model' objects are defined by running the previous cells.")
except ValueError as e:
     print(f"Error: {e}. Ensure the input features match the features used for training.")

Please enter the details of the passenger:
Sex (male/female): male
Age (e.g., 28): 22
Number of siblings/spouses aboard (SibSp): 1
Number of parents/children aboard (Parch): 0
Port of embarkation (Q = Queenstown, S = Southampton, C = Cherbourg): s
Prediction:  Did not survive.
