In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score
from sklearn.preprocessing import LabelEncoder

# Data Processing

In [2]:
# Load the dataset.
# The CSV location defaults to the original hosted path but can be redirected
# with the TITANIC_CSV environment variable, so the notebook is not tied to a
# machine where /work exists.
DATA_PATH = os.environ.get('TITANIC_CSV', '/work/Titanic-Dataset.csv')
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first rows of the freshly loaded data
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Preprocessing
# Encoding categorical data
label_encoder = LabelEncoder()
# LabelEncoder numbers the classes in sorted order, so female -> 0 and male -> 1
df['Sex'] = label_encoder.fit_transform(df['Sex'])  # Convert 'Sex' to numerical

# Handling missing values
# Assign the filled column back instead of calling fillna(..., inplace=True) on
# df[...]: that chained in-place form operates on an intermediate object and,
# under pandas Copy-on-Write, leaves the original frame unchanged.
df['Embarked'] = df['Embarked'].fillna('S')  # Fill missing 'Embarked' with 'S' for Southampton
df['Age'] = df['Age'].fillna(df['Age'].median())  # Replace missing 'Age' with median value

# One-hot encode the port of embarkation into Embarked_C / Embarked_Q / Embarked_S.
# dtype=int yields 0/1 integers directly, so no separate True/False -> 0/1 cast is needed.
df = pd.get_dummies(df, columns=['Embarked'], dtype=int)

In [5]:
# Re-inspect the frame to confirm the encoded 'Sex' and one-hot 'Embarked_*' columns
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,0,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,1,0,0
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,0,0,1
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,0,0,1


1. Select the features you intend to use as independent variables and identify your target (dependent) variable. Split the data into training and testing sets. Create a logistic regression classifier and fit the model.

In [6]:
# Dependent variable: the survival flag the model has to predict
y = df['Survived']
# Independent variables: the numeric / encoded passenger attributes
feature_columns = [
    'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
    'Embarked_C', 'Embarked_Q', 'Embarked_S',
]
X = df[feature_columns]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Logistic regression classifier with scikit-learn's default settings
# (in the saved run of this cell the solver reports reaching its iteration limit)
clf = LogisticRegression()

# Fit the model on the training portion
clf.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


2. Utilize your model to make predictions on the testing data, calculate evaluation metrics such as accuracy and recall, and print the results.

In [7]:
# Predict survival for the held-out passengers
y_pred = clf.predict(X_test)

# Score the predictions against the true labels
accuracy = accuracy_score(y_test, y_pred)  # fraction of correct predictions
recall = recall_score(y_test, y_pred)      # share of actual survivors predicted as survivors

# Report both metrics
print('Accuracy:', accuracy)
print('Recall:', recall)

Accuracy: 0.8044692737430168
Recall: 0.7297297297297297


3. Display the theta parameter values.

In [8]:
# Display theta parameter values
# clf.coef_ is printed as a single row holding one weight per feature column of X.
# NOTE(review): the bias term is stored separately in clf.intercept_ and is not
# shown here - confirm whether the task expects it as part of theta.
theta = clf.coef_
print('Theta:', theta)

Theta: [[-0.81348658 -2.49646632 -0.02524414 -0.24483755 -0.12245511  0.00323061
   1.23133623  0.68274734  0.64198736]]


4. Create a DataFrame with 3 records (for 3 persons), use your model to make predictions, and print the predicted results using text descriptions such as 'survived' and 'not survived'.

In [9]:
# Three hypothetical passengers described with the same feature columns as X
new_data = pd.DataFrame({
    'Pclass': [3, 1, 2],
    'Sex': [1, 0, 0],  # 1 for male, 0 for female
    'Age': [22, 38, 26],
    'SibSp': [1, 1, 0],
    'Parch': [0, 0, 0],
    'Fare': [7.25, 71.2833, 7.925],
    'Embarked_C': [0, 1, 0],
    'Embarked_Q': [0, 0, 0],
    'Embarked_S': [1, 0, 1]
})

# Ask the fitted model for one class prediction per passenger
predictions = clf.predict(new_data)

# Translate each numeric class into a readable label
predicted_results = []
for pred in predictions:
    predicted_results.append('survived' if pred == 1 else 'not survived')

# Report the outcomes, numbering the passengers from 1
for i, result in enumerate(predicted_results, start=1):
    print(f'Person {i}: {result}')

Person 1: not survived
Person 2: survived
Person 3: survived


5. Alter the training/testing split fraction and the maximum number of iterations of the logistic regression model, then observe and print the different outcomes.

Experiment with different training/testing split ratios

In [10]:
def evaludate_predicted_result(model=None):
    """Predict survival for three sample passengers and print the outcomes.

    Args:
        model: Fitted classifier exposing ``predict``. When omitted, the
            notebook-level ``clf`` is used, which keeps existing zero-argument
            calls working. Pass the model under evaluation explicitly so the
            printed predictions belong to that model rather than to whatever
            ``clf`` is bound to at notebook scope.

    Returns:
        None. One ``Person <n>: survived`` / ``Person <n>: not survived`` line
        is printed per sample passenger.
    """
    if model is None:
        # Backward-compatible fallback: resolve the global classifier at call time.
        model = clf

    # The three sample passengers, using the feature columns the model was trained on
    new_data = pd.DataFrame({
        'Pclass': [3, 1, 2],
        'Sex': [1, 0, 0],  # 1 for male, 0 for female
        'Age': [22, 38, 26],
        'SibSp': [1, 1, 0],
        'Parch': [0, 0, 0],
        'Fare': [7.25, 71.2833, 7.925],
        'Embarked_C': [0, 1, 0],
        'Embarked_Q': [0, 0, 0],
        'Embarked_S': [1, 0, 1]
    })

    # Making predictions on the new data
    predictions = model.predict(new_data)

    # Interpreting and printing the predictions (class 1 means survived)
    for i, pred in enumerate(predictions, start=1):
        result = 'survived' if pred == 1 else 'not survived'
        print(f'Person {i}: {result}')

In [11]:
def evaluate_split_ratio(X, y, test_size=0.2, random_state=42):
    """Train and evaluate a logistic regression model for one train/test split.

    Args:
        X: Feature table passed to ``train_test_split``. It is assumed to hold
            the nine feature columns used for the sample passengers below.
        y: Target values aligned with ``X``.
        test_size: Fraction of the rows held out for testing.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        Tuple ``(accuracy, recall, theta)`` where ``theta`` is the ``coef_``
        array of the model fitted inside this call.
    """
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Create a logistic regression classifier and fit it on the training part
    clf = LogisticRegression()
    clf.fit(X_train, y_train)

    # Make predictions on the held-out part
    y_pred = clf.predict(X_test)

    # Evaluate the model
    print('================================')
    print('Split ratio:', test_size)
    ## Accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print('Accuracy:', accuracy)

    ## Recall
    recall = recall_score(y_test, y_pred)
    print('Recall:', recall)

    # Display theta parameter values
    theta = clf.coef_
    print('Theta:', theta)

    # Predict the three sample passengers with the model fitted in THIS call.
    # A zero-argument evaludate_predicted_result() call cannot see the local
    # `clf`; it would predict with the notebook-level classifier instead, so
    # every split ratio would report the same model's predictions.
    sample_passengers = pd.DataFrame({
        'Pclass': [3, 1, 2],
        'Sex': [1, 0, 0],  # 1 for male, 0 for female
        'Age': [22, 38, 26],
        'SibSp': [1, 1, 0],
        'Parch': [0, 0, 0],
        'Fare': [7.25, 71.2833, 7.925],
        'Embarked_C': [0, 1, 0],
        'Embarked_Q': [0, 0, 0],
        'Embarked_S': [1, 0, 1]
    })
    for i, pred in enumerate(clf.predict(sample_passengers), start=1):
        result = 'survived' if pred == 1 else 'not survived'
        print(f'Person {i}: {result}')

    return accuracy, recall, theta

Experiment with the maximum number of iterations of the logistic regression model

In [12]:
def evaludate_iteration_logistic_regression_model(X_train, X_test, y_train, y_test, max_iter=100):
    """Train and evaluate a logistic regression model for one ``max_iter`` value.

    Args:
        X_train: Training features. Assumed to hold the nine feature columns
            used for the sample passengers below.
        X_test: Held-out features used for scoring.
        y_train: Training targets aligned with ``X_train``.
        y_test: Held-out targets aligned with ``X_test``.
        max_iter: Iteration limit passed to ``LogisticRegression``.

    Returns:
        Tuple ``(accuracy, recall, theta)`` where ``theta`` is the ``coef_``
        array of the model fitted inside this call.
    """
    # Create a logistic regression classifier with the requested iteration cap
    clf = LogisticRegression(max_iter=max_iter)

    # Fit the model
    clf.fit(X_train, y_train)

    # Make predictions
    y_pred = clf.predict(X_test)

    # Evaluate the model
    print('Maximum iteration:', max_iter)
    print('================================')
    ## Accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print('Accuracy:', accuracy)

    ## Recall
    recall = recall_score(y_test, y_pred)
    print('Recall:', recall)

    # Display theta parameter values
    theta = clf.coef_
    print('Theta:', theta)

    # Predict the three sample passengers with the model fitted in THIS call.
    # A zero-argument evaludate_predicted_result() call cannot see the local
    # `clf`; it would predict with the notebook-level classifier instead, so
    # every max_iter value would report the same model's predictions.
    sample_passengers = pd.DataFrame({
        'Pclass': [3, 1, 2],
        'Sex': [1, 0, 0],  # 1 for male, 0 for female
        'Age': [22, 38, 26],
        'SibSp': [1, 1, 0],
        'Parch': [0, 0, 0],
        'Fare': [7.25, 71.2833, 7.925],
        'Embarked_C': [0, 1, 0],
        'Embarked_Q': [0, 0, 0],
        'Embarked_S': [1, 0, 1]
    })
    for i, pred in enumerate(clf.predict(sample_passengers), start=1):
        result = 'survived' if pred == 1 else 'not survived'
        print(f'Person {i}: {result}')

    return accuracy, recall, theta

In [15]:
# Experiment 1: evaluate the model for several test-set fractions
split_ratios = [0.1, 0.2, 0.3, 0.4, 0.5]
results_split = []

print("EXPERIMENT 1: VARYING TRAINING/TESTING SPLIT RATIOS")
for ratio in split_ratios:
    acc, rec, theta = evaluate_split_ratio(X, y, test_size=ratio)
    results_split.append((ratio, acc, rec, theta))

# Create a DataFrame from the results list.
# The name `df` is deliberately not reused: it holds the Titanic data, and
# rebinding it here would break the preprocessing / feature-selection cells
# if they are run again afterwards.
results_split_df = pd.DataFrame(results_split, columns=['Split Ratio', 'Accuracy', 'Recall', 'Theta'])

# Export the DataFrame to a CSV file without the index column
results_split_df.to_csv('experiment1_results.csv', index=False)
# Read the file back so the exported table is displayed as the cell output
pd.read_csv('experiment1_results.csv')

EXPERIMENT 1: VARYING TRAINING/TESTING SPLIT RATIOS
Split ratio: 0.1
Accuracy: 0.8444444444444444
Recall: 0.8333333333333334
Theta: [[-9.49156527e-01 -2.49250451e+00 -2.82503313e-02 -3.00047601e-01
  -6.62590873e-02  2.40868079e-03  1.11198589e+00  9.41373517e-01
   7.30204263e-01]]
Person 1: not survived
Person 2: survived
Person 3: survived
Split ratio: 0.2
Accuracy: 0.8044692737430168
Recall: 0.7297297297297297
Theta: [[-0.81348658 -2.49646632 -0.02524414 -0.24483755 -0.12245511  0.00323061
   1.23133623  0.68274734  0.64198736]]
Person 1: not survived
Person 2: survived
Person 3: survived
Split ratio: 0.3
Accuracy: 0.8134328358208955
Recall: 0.7297297297297297
Theta: [[-0.87169114 -2.4519397  -0.03133665 -0.27834019 -0.11208717  0.00336619
   1.08873953  0.96670924  0.5605745 ]]
Person 1: not survived
Person 2: survived
Person 3: survived
Split ratio: 0.4
Accuracy: 0.7955182072829131
Recall: 0.6808510638297872
Theta: [[-0.85231032 -2.54213437 -0.02853285 -0.26433618 -0.08143101  0.

Unnamed: 0,Split Ratio,Accuracy,Recall,Theta
0,0.1,0.844444,0.833333,[[-9.49156527e-01 -2.49250451e+00 -2.82503313e...
1,0.2,0.804469,0.72973,[[-0.81348658 -2.49646632 -0.02524414 -0.24483...
2,0.3,0.813433,0.72973,[[-0.87169114 -2.4519397 -0.03133665 -0.27834...
3,0.4,0.795518,0.680851,[[-0.85231032 -2.54213437 -0.02853285 -0.26433...
4,0.5,0.807175,0.709497,[[-8.58604974e-01 -2.40497191e+00 -2.53750571e...


In [16]:
# Experiment 2: evaluate the model for several iteration limits,
# reusing the train/test split created for the baseline model
iteration_values = [100, 200, 300, 400, 500]
results_iteration = []

print("EXPERIMENT 2: VARYING MAXIMUM ITERATION VALUES")
for max_iter in iteration_values:
    acc, rec, theta = evaludate_iteration_logistic_regression_model(X_train, X_test, y_train, y_test, max_iter)
    results_iteration.append((max_iter, acc, rec, theta))

# Create a DataFrame from the results list.
# The name `df` is deliberately not reused: it holds the Titanic data, and
# rebinding it here would break the preprocessing / feature-selection cells
# if they are run again afterwards.
results_iteration_df = pd.DataFrame(results_iteration, columns=['Max Iteration', 'Accuracy', 'Recall', 'Theta'])

# Export the DataFrame to a CSV file without the index column
results_iteration_df.to_csv('experiment2_results.csv', index=False)
# Read the file back so the exported table is displayed as the cell output
pd.read_csv('experiment2_results.csv')

EXPERIMENT 2: VARYING MAXIMUM ITERATION VALUES
Maximum iteration: 100
Accuracy: 0.8044692737430168
Recall: 0.7297297297297297
Theta: [[-0.81348658 -2.49646632 -0.02524414 -0.24483755 -0.12245511  0.00323061
   1.23133623  0.68274734  0.64198736]]
Person 1: not survived
Person 2: survived
Person 3: survived
Maximum iteration: 200
Accuracy: 0.7932960893854749
Recall: 0.7162162162162162
Theta: [[-0.8769898  -2.62707344 -0.02740663 -0.30956262 -0.10599505  0.00327132
   0.32862063  0.21747075 -0.10548095]]
Person 1: not survived
Person 2: survived
Person 3: survived
Maximum iteration: 300
Accuracy: 0.8100558659217877
Recall: 0.7432432432432432
Theta: [[-9.37088012e-01 -2.58923238e+00 -3.05815751e-02 -2.94188315e-01
  -1.08057868e-01  2.54019141e-03  1.81773347e-01  4.26209639e-02
  -2.36975610e-01]]
Person 1: not survived
Person 2: survived
Person 3: survived
Maximum iteration: 400
Accuracy: 0.8100558659217877
Recall: 0.7432432432432432
Theta: [[-9.37088012e-01 -2.58923238e+00 -3.05815751e

Unnamed: 0,Max Iteration,Accuracy,Recall,Theta
0,100,0.804469,0.72973,[[-0.81348658 -2.49646632 -0.02524414 -0.24483...
1,200,0.793296,0.716216,[[-0.8769898 -2.62707344 -0.02740663 -0.30956...
2,300,0.810056,0.743243,[[-9.37088012e-01 -2.58923238e+00 -3.05815751e...
3,400,0.810056,0.743243,[[-9.37088012e-01 -2.58923238e+00 -3.05815751e...
4,500,0.810056,0.743243,[[-9.37088012e-01 -2.58923238e+00 -3.05815751e...


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=85a56391-2891-43eb-a2ac-238501749167' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>