In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve, auc

In [3]:
# Load the dataset
df = pd.read_csv('titanic-2.csv')
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [4]:
# Check for missing values
print(df.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [6]:
# Display basic statistics for the 'Age' feature
print(df['Age'].describe())


count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64


In [8]:
# Find the range of ages
min_age = df['Age'].min()
max_age = df['Age'].max()

print(f"Age range: {min_age} to {max_age} years")


Age range: 0.42 to 80.0 years


In [9]:
# Define fare ranges
fare_bins = [0, 10, 30, 50, 512.33]  # Adjust the maximum value based on your dataset
fare_labels = ['0 to $10', '$10 to $30', '$30 to $50', 'More than $50']

# Categorize fares into bins
df['Fare_Range'] = pd.cut(df['Fare'], bins=fare_bins, labels=fare_labels)

# Display the counts for each fare range
fare_range_counts = df['Fare_Range'].value_counts()
print(fare_range_counts)


Fare_Range
0 to $10         321
$10 to $30       321
More than $50    160
$30 to $50        74
Name: count, dtype: int64


In [10]:
# Display basic statistics for the 'SibSp' feature
print(df['SibSp'].describe())


count    891.000000
mean       0.523008
std        1.102743
min        0.000000
25%        0.000000
50%        0.000000
75%        1.000000
max        8.000000
Name: SibSp, dtype: float64


In [12]:
# Find the maximum number of siblings/spouses
max_sibsp = df['SibSp'].max()

print(f"Maximum number of siblings/spouses: {max_sibsp}")


Maximum number of siblings/spouses: 8


In [13]:
# Check the unique values in the 'SibSp' column
unique_sibsp_values = df['SibSp'].unique()
print("Unique SibSp values:", unique_sibsp_values)


Unique SibSp values: [1 0 3 4 2 5 8]


In [14]:
# Define ranges for SibSp
sibsp_bins = [-1, 0, 1, 2, 3, 4]  # Adjust the bins as necessary
sibsp_labels = ['0', '1', '2', '3', '4+']

# Categorize SibSp values into bins
df['SibSp_Range'] = pd.cut(df['SibSp'], bins=sibsp_bins, labels=sibsp_labels)

# Display the counts for each SibSp range
sibsp_range_counts = df['SibSp_Range'].value_counts()
print(sibsp_range_counts)


SibSp_Range
0     608
1     209
2      28
4+     18
3      16
Name: count, dtype: int64


In [16]:
# Count the number of unique SibSp ranges
num_ranges = df['SibSp_Range'].nunique()
print(f"Number of different SibSp ranges: {num_ranges}")


Number of different SibSp ranges: 5


In [18]:
# Filter passengers who did not survive
not_survived = df[df['Survived'] == 0]


In [19]:
# Get the range of Fare for passengers who did not survive
fare_range = not_survived['Fare'].describe()

# Display the Fare range
print(f"Fare range for passengers who did not survive:")
print(fare_range[['min', 'max']])


Fare range for passengers who did not survive:
min      0.0
max    263.0
Name: Fare, dtype: float64


In [20]:
# Define fare ranges
bins = [-1, 10, 30, 50, float('inf')]  # The last bin includes all fares greater than 50
labels = ['Less than $10', '$10 to $30', '$30 to $50', 'More than $50']

# Create a new column to categorize Fare into ranges
not_survived['Fare_Range'] = pd.cut(not_survived['Fare'], bins=bins, labels=labels)

# Count the number of passengers in each fare range
fare_range_counts = not_survived['Fare_Range'].value_counts()
print(fare_range_counts)


Fare_Range
Less than $10    269
$10 to $30       182
More than $50     51
$30 to $50        47
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  not_survived['Fare_Range'] = pd.cut(not_survived['Fare'], bins=bins, labels=labels)


In [22]:
# Get the unique values in the 'Parch' column
unique_parch_values = df['Parch'].unique()
print("Unique Parch values:", unique_parch_values)

# Get the range of Parch
parch_range = df['Parch'].describe()
print(f"Parch range: Min = {parch_range['min']}, Max = {parch_range['max']}")


Unique Parch values: [0 1 2 5 3 4 6]
Parch range: Min = 0.0, Max = 6.0


In [23]:
# Create a new column for titles
df['Title'] = df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)

# Filter passengers with the title "Mrs."
mrs_passengers = df[df['Title'] == 'Mrs']


  df['Title'] = df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)


In [24]:
# Get the range of Age for passengers with the title "Mrs."
age_range = mrs_passengers['Age'].describe()

# Display the Age range
print(f"Age range for passengers with the title 'Mrs.':")
print(f"Min Age: {age_range['min']}, Max Age: {age_range['max']}")


Age range for passengers with the title 'Mrs.':
Min Age: 14.0, Max Age: 63.0


In [26]:
# Filter passengers with Pclass = 3
third_class_passengers = df[df['Pclass'] == 3]


In [27]:
# Get the statistics for the Fare column for third-class passengers
fare_statistics = third_class_passengers['Fare'].describe()

# Display the statistics
print("Fare Statistics for Third-Class Passengers:")
print(fare_statistics)


Fare Statistics for Third-Class Passengers:
count    491.000000
mean      13.675550
std       11.778142
min        0.000000
25%        7.750000
50%        8.050000
75%       15.500000
max       69.550000
Name: Fare, dtype: float64


In [28]:
# Create a filter for passengers with the title "Mrs."
mrs_passengers = df[df['Name'].str.contains('Mrs.')]


In [29]:
# Get the age range for passengers with the title "Mrs."
min_age = mrs_passengers['Age'].min()
max_age = mrs_passengers['Age'].max()

# Display the results
print(f"Minimum Age: {min_age}, Maximum Age: {max_age}")


Minimum Age: 14.0, Maximum Age: 63.0
