In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [2]:
# Read the Titanic passenger records from CSV into a DataFrame.
csv_path = '/content/Titanic-Dataset.csv'
titanic_df = pd.read_csv(csv_path)

In [4]:
# Preview the leading rows of the dataset under a short heading.
preview = titanic_df.head()
print("First few rows of the Titanic dataset:")
print(preview)

First few rows of the Titanic dataset:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0        

In [5]:
# Preprocessing the data
# List the available columns so the feature selection can be checked.
print(titanic_df.columns)

# Fill missing values.
# Assign the filled column back instead of calling fillna(inplace=True) on
# titanic_df[...]: that form mutates an intermediate Series, which pandas
# warns about and which no longer updates titanic_df under Copy-on-Write.
titanic_df['Age'] = titanic_df['Age'].fillna(titanic_df['Age'].median())
titanic_df['Embarked'] = titanic_df['Embarked'].fillna(titanic_df['Embarked'].mode()[0])

# Convert categorical columns to numerical.
# fit_transform refits the encoder on each call, so after these two lines
# `le` only holds the classes learned from 'Embarked'.
le = LabelEncoder()
titanic_df['Sex'] = le.fit_transform(titanic_df['Sex'])
titanic_df['Embarked'] = le.fit_transform(titanic_df['Embarked'])

# Define features (every column except the target) and the target.
X = titanic_df.drop(columns='Survived')
y = titanic_df['Survived']

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


In [8]:
# Inspect the dtypes of the candidate feature columns.
# Read them from titanic_df rather than X_train: X_train is only created by the
# split below and is then overwritten with the scaler's output, so
# X_train.dtypes fails on a fresh kernel and on any re-run of this cell.
print(titanic_df.drop(columns='Survived').dtypes)

# Drop the target and the non-numeric columns from X, then split the data.
# 'Name', 'Ticket' and 'Cabin' are the object-dtype columns in the listing above.
# NOTE(review): 'PassengerId' is still part of X; it looks like a row identifier — confirm it should be a feature.
X = titanic_df.drop(columns=['Survived', 'Name', 'Ticket', 'Cabin'])
y = titanic_df['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Now scale the features.
# The scaler is fitted on the training split only and then applied to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

PassengerId      int64
Pclass           int64
Name            object
Sex              int64
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked         int64
dtype: object


In [9]:
# Configure a random forest (100 trees, fixed seed) and fit it on the training split.
forest_params = {"n_estimators": 100, "random_state": 42}
model = RandomForestClassifier(**forest_params)
model.fit(X_train, y_train)


In [10]:
# Predict a class for every row of the test split.
y_pred = model.predict(X_test)

# Translate each predicted class into a readable status:
# class 1 -> "Survived", anything else -> "Did not survive".
survival_labels = ("Did not survive", "Survived")
predicted_survival = [survival_labels[int(label == 1)] for label in y_pred]

In [11]:
# Score the test-set predictions against the true labels and report to two decimals.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"\nModel Accuracy: {accuracy:.2f}")


Model Accuracy: 0.83


In [12]:
# Show the classification report for the test-set predictions, with readable class names.
class_names = ["passengers Did not survive", "passengers Survived"]
print("\nClassification Report:")
report = classification_report(y_test, y_pred, target_names=class_names)
print(report)


Classification Report:
                            precision    recall  f1-score   support

passengers Did not survive       0.83      0.89      0.86       105
       passengers Survived       0.82      0.74      0.78        74

                  accuracy                           0.83       179
                 macro avg       0.83      0.81      0.82       179
              weighted avg       0.83      0.83      0.83       179



In [13]:
# Show the first ten test-set predictions next to the true outcomes.
sample_indices = X_test[:10]  # first ten rows of the test features (not indices); unused in this cell

# y_test comes out of train_test_split, so its index is presumably the shuffled
# original row labels; take the first ten rows by position with .iloc so the
# selection never depends on how a plain [:10] slice is resolved.
actual_survival = ["Survived" if actual == 1 else "Did not survive" for actual in y_test.iloc[:10]]

sample_results = pd.DataFrame({
    'Predicted': predicted_survival[:10],
    'Actual': actual_survival
})
print("\nSample Predictions:")
print(sample_results)



Sample Predictions:
         Predicted           Actual
0  Did not survive         Survived
1  Did not survive  Did not survive
2  Did not survive  Did not survive
3         Survived         Survived
4  Did not survive         Survived
5         Survived         Survived
6         Survived         Survived
7  Did not survive  Did not survive
8         Survived         Survived
9         Survived         Survived
