# In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


# Load the Titanic passenger data (CSV hosted in the Stanford CS109 archive).
url = "https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv"
titanic = pd.read_csv(url)

print("First few rows of the dataset:")
print(titanic.head())


print("\nBasic information about the dataset:")
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the summary.
titanic.info()


# Per-column count of missing values.
print("\nMissing values in the dataset:")
print(titanic.isnull().sum())


# Impute missing ages with the column mean.  The result is assigned back
# rather than calling fillna(inplace=True) on the selected column: that
# chained in-place form is deprecated in pandas 2.x and does not update the
# parent frame when Copy-on-Write is enabled.
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].mean())


# 'Sex' is always encoded; 'Embarked' only when the dataset provides it.
categorical_columns = ['Sex']
if 'Embarked' in titanic.columns:
    # Fill gaps with the most frequent value of the column.
    titanic['Embarked'] = titanic['Embarked'].fillna(titanic['Embarked'].mode()[0])
    categorical_columns.append('Embarked')
else:
    print("Column 'Embarked' does not exist in the dataset. Skipping this step.")


# One-hot encode only the categorical columns collected above.  Naming a
# column that is absent in `columns=` makes get_dummies raise KeyError, which
# would defeat the "skip" branch for datasets without 'Embarked'.
titanic = pd.get_dummies(titanic, columns=categorical_columns, drop_first=True)

# Drop free-text / identifier columns; errors='ignore' tolerates datasets
# that lack some of them.
titanic.drop(['Name', 'Ticket', 'Cabin'], axis=1, inplace=True, errors='ignore')


print("\nFirst few rows of the processed dataset:")
print(titanic.head())


# Fail fast when the target column is absent, then split the frame into the
# target vector and the feature matrix.
if 'Survived' not in titanic.columns:
    raise KeyError("Column 'Survived' does not exist in the dataset.")
y = titanic['Survived']
X = titanic.drop(columns='Survived')


# Hold out 20% of the rows for evaluation, using a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


# Fit a 100-tree random forest on the training split (fit() returns the
# estimator itself, so the fitted model is bound directly).
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)


# Predict on the held-out rows and compute the evaluation metrics.
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Each heading is followed by its value on the next line.
print(f"\nAccuracy: {accuracy}")
print("\nConfusion Matrix:", cm, sep="\n")
print("\nClassification Report:", report, sep="\n")