# In [None]:
# Import libraries
import pandas as pd  # data manipulation
import seaborn as sns  # creation of attractive graphics
import matplotlib.pyplot as plt  # data visualizations and plotting graphs
from sklearn.model_selection import train_test_split  # split dataset into training and testing
from sklearn.ensemble import RandomForestClassifier  # classification tasks
from sklearn.metrics import classification_report, confusion_matrix  # evaluate the performance of the model
from sklearn.preprocessing import StandardScaler  # feature scaling

# Load the Titanic dataset
url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
titanic_data = pd.read_csv(url)

# Keep only the columns used for modelling and give them descriptive names.
# Renaming through a mapping ties each new name to its source column instead
# of relying on the positional order of a list.
column_names = {
    'Pclass': 'passenger_class',
    'Parch': 'parents_children',
    'SibSp': 'siblings_spouses',
    'Fare': 'fare',
    'Embarked': 'embarked',
    'Sex': 'sex',
    'Age': 'age',
    'Survived': 'survived',
}
# .copy() yields an independent frame, so the column assignments below do not
# trigger pandas' SettingWithCopyWarning.
titanic_data = titanic_data[list(column_names)].copy()
titanic_data = titanic_data.rename(columns=column_names)

# Encode categorical variable 'sex'
titanic_data['sex'] = titanic_data['sex'].map({'male': 0, 'female': 1})

# Fill missing ports of embarkation with the most frequent port *before*
# encoding. Otherwise get_dummies turns a missing value into an all-zero row,
# which is indistinguishable from the category removed by drop_first.
titanic_data['embarked'] = titanic_data['embarked'].fillna(titanic_data['embarked'].mode()[0])

# One-hot encode the 'embarked' column. dtype=int keeps the indicator columns
# as 0/1 integers regardless of the pandas version's default dummy dtype.
titanic_data = pd.get_dummies(titanic_data, columns=['embarked'], drop_first=True, dtype=int)

# Display the first few rows
print("Display the First Few Rows:\n")
print(titanic_data.head())

# Data Types and Missing Values
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() echoed a stray "None" line.
print("\nData Types and Missing Values:\n")
titanic_data.info()

# Summary of Missing Values (only columns that actually have gaps are shown)
missing_values = titanic_data.isnull().sum()
print("\nSummary of Missing Values:\n")
print(missing_values[missing_values > 0])

# Impute gaps in 'age' with the column average.
# NOTE: this statistic is computed over the full dataset, i.e. before the
# train/test split performed further down in this script.
mean_age = titanic_data['age'].mean()
titanic_data['age'] = titanic_data['age'].fillna(mean_age)

# Safety net for 'sex': any value left missing after the encoding step is
# replaced by the most frequent encoded value.
most_common_sex = titanic_data['sex'].mode()[0]
titanic_data['sex'] = titanic_data['sex'].fillna(most_common_sex)

# Report the per-column count of missing values after imputation
remaining_missing = titanic_data.isna().sum()
print("\nSummary of Missing Values After Handling:\n")
print(remaining_missing)

# Draw pairwise relationships between the dataset's variables, colouring the
# points by survival status, then render the figure
pair_grid = sns.pairplot(data=titanic_data, hue='survived')
plt.show()

# Separate the predictors from the target column
X = titanic_data.drop(columns='survived')  # Features
y = titanic_data['survived']  # Target

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features. The scaler is fitted on the training rows only and
# then applied unchanged to the test rows, so no test statistics reach the fit.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Create and train the Random Forest model. It is seeded like the split above
# so that repeated runs produce the same forest and therefore the same metrics.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict survival for the held-out rows
y_pred = model.predict(X_test)

# Evaluate the model against the true test labels
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))
