In [6]:
# Import the Necessary Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import seaborn as sns

In [5]:
# Load the combined Titanic dataset bundled with seaborn.
titanic = sns.load_dataset('titanic')

# seaborn provides a single combined table, so a train/test pair is fabricated
# from it. `train` is an independent copy so that later in-place cleaning of
# `train` cannot silently modify `titanic` as well.
train = titanic.copy()

# Dummy test set: a 30% sample of the same rows. It overlaps `train`, so it is
# not a true hold-out. The fixed seed makes the sample identical on every run.
# NOTE(review): a later cell reloads `train`/`test` from train.csv/test.csv and
# replaces both frames — confirm which data source is intended.
test = titanic.sample(frac=0.3, random_state=42)

# LabelEncoder is used further down to encode the categorical feature columns
from sklearn.preprocessing import LabelEncoder


In [None]:
# Read the train and test data sets from CSV files (paths are relative to the
# notebook's working directory).
train = pd.read_csv(filepath_or_buffer='train.csv')
test = pd.read_csv(filepath_or_buffer='test.csv')

In [None]:
# Show the first five rows of the training data as plain text
print(train.head(n=5))

In [None]:
# List the column names of the training data
print(train.columns.to_list())


In [None]:
# Report (rows, columns) for the train and test data
train_shape = train.shape
test_shape = test.shape
print(f"Train data dimensions: {train_shape}")
print(f"Test data dimensions: {test_shape}")

In [None]:
# Count the missing entries per column, first for train and then for test
print("\nMissing values in train data:")
print(train.isna().sum(axis=0))
print("\nMissing values in test data:")
print(test.isna().sum(axis=0))

In [None]:
# Data Visualization
# 2x2 grid of count plots, one panel per categorical column of the train data.
plt.figure(figsize=(15, 10))

count_panels = [
    ('Survived', 'Survival Count'),
    ('Pclass', 'Passenger Class Distribution'),
    ('Sex', 'Gender Distribution'),
    ('Embarked', 'Embarkation Port Distribution'),
]

# Panels are numbered 1..4, filling the grid row by row
for panel_no, (column, panel_title) in enumerate(count_panels, start=1):
    plt.subplot(2, 2, panel_no)
    sns.countplot(x=column, data=train)
    plt.title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
# Distributions of Age, Parch and Fare, drawn side by side in one figure
plt.figure(figsize=(15, 5))

# Age: missing values are removed before the histogram is drawn
age_ax = plt.subplot(1, 3, 1)
sns.histplot(train['Age'].dropna(), bins=30, kde=True, ax=age_ax)
age_ax.set_title('Age Distribution')

# Parch: drawn as a count plot rather than a binned histogram
parch_ax = plt.subplot(1, 3, 2)
sns.countplot(x='Parch', data=train, ax=parch_ax)
parch_ax.set_title('Parch Distribution')

fare_ax = plt.subplot(1, 3, 3)
sns.histplot(train['Fare'], bins=30, kde=True, ax=fare_ax)
fare_ax.set_title('Fare Distribution')

plt.tight_layout()
plt.show()

In [None]:

# Stacked age histogram, coloured by the value of the Survived column
plt.figure(figsize=(10, 6))
sns.histplot(
    data=train,
    x='Age',
    hue='Survived',
    multiple='stack',
    bins=30,
    kde=True,
)
plt.title('Age Distribution by Survival')
plt.show()

In [None]:

# Bar plot of Survived against passenger class, with one bar per sex in each class
plt.figure(figsize=(10, 6))
sns.barplot(
    data=train,
    x='Pclass',
    y='Survived',
    hue='Sex',
)
plt.title('Survival Rate by Class and Gender')
plt.show()


In [None]:

# Stacked age histogram, coloured by the Sex column
plt.figure(figsize=(10, 6))
sns.histplot(
    data=train,
    x='Age',
    hue='Sex',
    multiple='stack',
    bins=30,
    kde=True,
)
plt.title('Age Distribution by Gender')
plt.show()

In [None]:
# Handling Missing Values
# Replace the missing values in the Embarked column with 'S'.
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on the `frame['Embarked']` selection: the
# in-place form is chained assignment, which pandas warns about and which does
# not update the parent frame when Copy-on-Write is active.
train['Embarked'] = train['Embarked'].fillna('S')
test['Embarked'] = test['Embarked'].fillna('S')

In [None]:
# Impute the missing values in the test Fare column with that column's median.
# Assigned back to the column rather than using `fillna(..., inplace=True)` on
# the selection, which is chained assignment and does not reliably update `test`.
test['Fare'] = test['Fare'].fillna(test['Fare'].median())

In [None]:

# Impute missing Age values with random integer ages drawn uniformly from the
# range [mean - std, mean + std) of the ages observed in the training data.
mean_age = train['Age'].mean()
std_age = train['Age'].std()

# The sampler needs integer bounds; truncate explicitly instead of handing it
# floats and relying on implicit conversion.
age_low = int(mean_age - std_age)
age_high = int(mean_age + std_age)

# Seeded generator so the imputed ages (and everything computed from them
# afterwards) are the same on every run of the notebook.
age_rng = np.random.default_rng(42)

null_count = train['Age'].isnull().sum()
random_ages = age_rng.integers(age_low, age_high, size=null_count)
train.loc[train['Age'].isnull(), 'Age'] = random_ages

# The test set is filled from the same training-derived range.
null_count_test = test['Age'].isnull().sum()
random_ages_test = age_rng.integers(age_low, age_high, size=null_count_test)
test.loc[test['Age'].isnull(), 'Age'] = random_ages_test

In [None]:
# Feature Engineering
# FamilySize = SibSp + Parch + 1, added to both the train and test frames
for frame in (train, test):
    frame['FamilySize'] = frame['SibSp'] + frame['Parch'] + 1

In [None]:
# IsAlone: boolean flag that is True where FamilySize equals 1
for frame in (train, test):
    frame['IsAlone'] = frame['FamilySize'].eq(1)

In [None]:
# Has_cabin: boolean flag that is True where a Cabin value is present
for frame in (train, test):
    frame['Has_cabin'] = frame['Cabin'].notnull()

In [None]:
# Drop unnecessary columns
columns_to_drop = ['PassengerId', 'Name', 'Ticket', 'Cabin']

# Rebind to new frames instead of mutating in place, and skip columns that are
# already gone, so re-running this cell does not raise a KeyError.
train = train.drop(columns=columns_to_drop, errors='ignore')
test = test.drop(columns=columns_to_drop, errors='ignore')


In [None]:


# Model Building
# Target is the Survived column; the features are every other column of train
y = train['Survived']
X = train.drop(columns='Survived')


In [None]:


# Label encoding of categorical features
# One encoder per column, kept in `label_encoders` keyed by column name.
categorical_cols = ['Sex', 'Embarked', 'IsAlone', 'Has_cabin']
label_encoders = {name: LabelEncoder() for name in categorical_cols}

for col, le in label_encoders.items():
    # Learn the mapping from the training features only, then apply that same
    # mapping to both the training features and the test frame.
    le.fit(X[col])
    X[col] = le.transform(X[col])
    test[col] = le.transform(test[col])

In [None]:

# Pairwise scatter/distribution plots of every feature, coloured by Survived
features_with_target = pd.concat([X, y], axis=1)
sns.pairplot(features_with_target, hue='Survived')
plt.show()

In [None]:

# Hold out 10% of the rows as a validation set; the fixed random_state makes
# the split repeatable.
split_parts = train_test_split(X, y, test_size=0.1, random_state=42)
X_train, X_val, y_train, y_val = split_parts





In [None]:
# Build a decision tree with a fixed random_state and train it on the training
# split. `fit` returns the estimator, so the construction and fit are chained.
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
dt_model


In [None]:
# Draw the top levels of the fitted tree (max_depth=3 limits what is drawn)
plt.figure(figsize=(20, 10))
plot_tree(
    dt_model,
    filled=True,
    # Pass a plain list: some scikit-learn releases validate `feature_names`
    # as a list and reject a pandas Index.
    feature_names=list(X.columns),
    class_names=['Died', 'Survived'],
    max_depth=3,
)
plt.show()

In [None]:

# Score the fitted tree on the validation split: per-class report, confusion
# matrix, and overall accuracy.
y_pred = dt_model.predict(X_val)

print("Classification Report:")
validation_report = classification_report(y_true=y_val, y_pred=y_pred)
print(validation_report)

print("\nConfusion Matrix:")
validation_confusion = confusion_matrix(y_true=y_val, y_pred=y_pred)
print(validation_confusion)

print(f"\nAccuracy: {accuracy_score(y_val, y_pred):.2f}")