In [1]:
# 1- Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


In [2]:
# Read the Titanic train and test CSV files into DataFrames
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/titanic/train.csv',
        '/kaggle/input/titanic/test.csv',
    )
)


In [3]:
# 2- EDA
# Print three quick views of the training data, in this order:
#   1. the first few rows
#   2. summary statistics
#   3. the number of missing values per column
first_rows = train_data.head()
print(first_rows)

summary_stats = train_data.describe()
print(summary_stats)

missing_per_column = train_data.isnull().sum()
print(missing_per_column)



   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
  

In [4]:
# Handle missing values
# The filled column is assigned back rather than calling fillna(inplace=True)
# on `df[col]`. That chained in-place form works on an intermediate Series: it
# raises a FutureWarning in recent pandas 2.x releases and, with Copy-on-Write
# enabled (the pandas 3.0 default), leaves the DataFrame unchanged.
train_data['Age'] = train_data['Age'].fillna(train_data['Age'].median())
# NOTE(review): the test set is imputed with its own medians rather than the
# training medians — confirm this is intended.
test_data['Age'] = test_data['Age'].fillna(test_data['Age'].median())
train_data['Embarked'] = train_data['Embarked'].fillna(train_data['Embarked'].mode()[0])
test_data['Fare'] = test_data['Fare'].fillna(test_data['Fare'].median())

# Convert categorical variables to numerical
# Values that are not keys of `sex_codes` (including the already-encoded 0/1)
# pass through unchanged, so re-running this cell does not turn the column
# into NaN the way a plain dict-based .map() would.
sex_codes = {'male': 0, 'female': 1}
train_data['Sex'] = train_data['Sex'].map(lambda value: sex_codes.get(value, value))
test_data['Sex'] = test_data['Sex'].map(lambda value: sex_codes.get(value, value))

# Select features
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch']
X = train_data[features]  # model inputs
y = train_data['Survived']  # target labels


In [5]:
# 3- Train/validation split: hold out 20% of the rows, with a fixed seed
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = split_parts


In [6]:
# 4- Fit a random-forest classifier on the training split
model = RandomForestClassifier(
    random_state=42,  # fixed seed
)
model.fit(X=X_train, y=y_train)


In [7]:
# 5- Predict labels for the held-out validation rows
predictions = model.predict(X=X_val)


In [8]:
# 6- Evaluate the model: compare validation predictions with the true labels
accuracy = accuracy_score(y_true=y_val, y_pred=predictions)
print(f'Accuracy: {accuracy}')


Accuracy: 0.8156424581005587


In [9]:
# 7- Predict on the test data
# Select the same feature columns the model was trained on, then predict.
X_test = test_data.loc[:, features]
test_predictions = model.predict(X=X_test)


In [10]:
# 8- Build the submission table (PassengerId + predicted Survived) and write
# it to CSV without the index column
submission = test_data[['PassengerId']].assign(Survived=test_predictions)
submission.to_csv('submission.csv', index=False)
