In [1]:
import pandas as pd

# Read the labelled training data from the working directory
train_df = pd.read_csv(filepath_or_buffer='train.csv')

# Preview the top five records to sanity-check the parse
train_df.head(n=5)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
# Count the NaN entries in every column of the training frame
missing_values = train_df.isna().sum()
missing_values


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [3]:
# Impute missing ages with the median age.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the column selection: the chained in-place form
# operates on an intermediate object, emits a FutureWarning on pandas 2.x and
# leaves train_df untouched when Copy-on-Write is enabled.
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].median())

# Drop the Cabin column (687 missing entries in the null counts).
# errors='ignore' lets this cell be re-run after Cabin is already gone.
train_df = train_df.drop(columns=['Cabin'], errors='ignore')

# Fill the missing Embarked values with the most common port
# (mode() returns a Series; [0] takes its first entry).
train_df['Embarked'] = train_df['Embarked'].fillna(train_df['Embarked'].mode()[0])

# Verify that there are no more missing values
missing_values_after = train_df.isnull().sum()
missing_values_after


PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [4]:
# Feature engineering, expressed as one chain:
#   1. encode Sex as 0 (male) / 1 (female)
#   2. one-hot encode Embarked, dropping the first category
#   3. add FamilySize = SibSp + Parch
#   4. remove the Name and Ticket columns, which are not used for prediction
sex_codes = {'male': 0, 'female': 1}
train_df = (
    pd.get_dummies(
        train_df.assign(Sex=train_df['Sex'].map(sex_codes)),
        columns=['Embarked'],
        drop_first=True,
    )
    .assign(FamilySize=lambda frame: frame['SibSp'] + frame['Parch'])
    .drop(columns=['Name', 'Ticket'])
)

# Preview the preprocessed training frame
train_df.head()


Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S,FamilySize
0,1,0,3,0,22.0,1,0,7.25,False,True,1
1,2,1,1,1,38.0,1,0,71.2833,False,False,1
2,3,1,3,1,26.0,0,0,7.925,False,True,0
3,4,1,1,1,35.0,1,0,53.1,False,True,1
4,5,0,3,0,35.0,0,0,8.05,False,True,0


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Target column and predictor matrix; PassengerId is excluded from the predictors
y = train_df['Survived']
X = train_df.drop(columns=['PassengerId', 'Survived'])

# Hold out 20% of the rows for validation (fixed seed for a repeatable split)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest; fit() returns the estimator itself
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out rows
y_val_pred = rf_model.predict(X_val)
accuracy = accuracy_score(y_val, y_val_pred)
classification_report_output = classification_report(y_val, y_val_pred)

# Show both metrics as the cell output
(accuracy, classification_report_output)


(0.8212290502793296,
 '              precision    recall  f1-score   support\n\n           0       0.83      0.87      0.85       105\n           1       0.80      0.76      0.78        74\n\n    accuracy                           0.82       179\n   macro avg       0.82      0.81      0.81       179\nweighted avg       0.82      0.82      0.82       179\n')

In [7]:
# Read the test data from the working directory
test_df = pd.read_csv(filepath_or_buffer='test.csv')

# Preview the top five records to sanity-check the parse
test_df.head(n=5)


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [9]:
# Count the NaN entries in every column of the test frame
test_missing_values = test_df.isna().sum()
test_missing_values


PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [10]:

# Columns the model was trained on (train features without PassengerId/Survived)
expected_columns = X.columns.tolist()

# No visible cell imputes the test set, so fill Age and Fare here with the
# medians of the training features. fillna is a no-op when nothing is missing,
# so this is safe even if the frame was already cleaned.
train_medians = X[['Age', 'Fare']].median()

# Encode Sex as 0 (male) / 1 (female), impute, and add FamilySize = SibSp + Parch
test_df = test_df.assign(
    Sex=test_df['Sex'].map({'male': 0, 'female': 1}),
    Age=test_df['Age'].fillna(train_medians['Age']),
    Fare=test_df['Fare'].fillna(train_medians['Fare']),
    FamilySize=test_df['SibSp'] + test_df['Parch'],
)

# One-hot encode Embarked WITHOUT drop_first: which category drop_first removes
# depends on the categories present in this frame, so it could drop a different
# port than it did for the training data. Encoding every category and letting
# the reindex below pick the training columns keeps the encoding aligned.
test_df = pd.get_dummies(test_df, columns=['Embarked'])

# Keep exactly the training columns, in training order. Columns the model never
# saw (PassengerId, Name, Ticket, any extra dummy) are discarded; a training
# column absent from the test frame is created and filled with 0.
test_df = test_df.reindex(columns=expected_columns, fill_value=0)

# Fail early if anything is still missing (e.g. this cell was run twice,
# which maps the already-encoded Sex column to NaN).
assert not test_df.isna().any().any(), "test features still contain missing values"

# Verify that the test dataset has the correct columns
test_df.head()


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S,FamilySize
0,3,0,34.5,0,0,7.8292,True,False,0
1,3,1,47.0,1,0,7.0,False,True,1
2,2,0,62.0,0,0,9.6875,True,False,0
3,3,0,27.0,0,0,8.6625,False,True,0
4,3,1,22.0,1,1,12.2875,False,True,2


In [None]:
# Make predictions on the preprocessed test dataset
test_predictions = rf_model.predict(test_df)

# Recover the real passenger ids from test.csv. The preprocessing reindex
# removed PassengerId from test_df, and rebuilding the ids as `index + 892` is
# only correct if they are consecutive and start at 892 in file order.
# Selecting by test_df.index assumes test_df still carries the row labels
# assigned when test.csv was read (the same assumption `index + 892` made).
passenger_ids = (
    pd.read_csv('test.csv', usecols=['PassengerId'])['PassengerId']
    .loc[test_df.index]
    .to_numpy()
)

# Create the submission dataframe
submission_df = pd.DataFrame({
    'PassengerId': passenger_ids,
    'Survived': test_predictions
})

# Save the submission dataframe to a CSV file
submission_file_path = 'titanic_submission.csv'
submission_df.to_csv(submission_file_path, index=False)

# Show the submission for a visual check. ace_tools is not a standard package
# and may be missing outside the environment this notebook was written in, so
# fall back to a plain-text preview instead of failing with ImportError.
try:
    import ace_tools as tools
except ImportError:
    tools = None

if tools is not None:
    tools.display_dataframe_to_user(name="Titanic Submission DataFrame", dataframe=submission_df)
else:
    print(submission_df.head())

submission_file_path
