## Test Data Set

## Working with test.csv to test the model build with train.csv

In [2]:
#import libraries
import pandas as pd
import numpy as np
import joblib


In [4]:
#loading data
dftest = pd.read_csv(r"C:\Users\Pulani\OneDrive\Desktop\Kaggle\Titanic data set\Titanic\test.csv")
print ("Data loaded successfully!")

# Display the first few rows of the dataset
dftest.head()

Data loaded successfully!


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


## Data Preprocessing as same as for the training data

In [9]:
# Check the data types and missing values
dftest.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.1+ KB


### Fill missing values in 'Age' column

In [20]:
import joblib

# Load the median age from the saved file
median_age = joblib.load('median_age.pkl')

# Fill missing values in the 'Age' column of the test dataset
dftest['Age'] = dftest['Age'].fillna(median_age)

# Verify the changes
print(dftest['Age'].describe())


count    418.000000
mean      29.805024
std       12.667969
min        0.170000
25%       23.000000
50%       28.000000
75%       35.750000
max       76.000000
Name: Age, dtype: float64


### Handling missing values in Cabin column


In [25]:
# Drop the 'Cabin' column from the test dataset
dftest.drop('Cabin', axis=1, inplace=True)

# Verify the changes
print(dftest.columns)


Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Embarked'],
      dtype='object')


### Handling missing values in Embarked column

- In Embarked column there are only 2 missing values, so I'm going to fill them with Mode

In [28]:
# Fill missing 'Embarked' values in the test dataset with the mode ('S')
dftest['Embarked'] = dftest['Embarked'].fillna('S')

# Verify there are no missing values
print(dftest['Embarked'].isnull().sum())

# Check unique values and their counts (optional, for verification)
print(dftest['Embarked'].value_counts())


0
Embarked
S    270
C    102
Q     46
Name: count, dtype: int64


### Rename the updated test DataFrame

In [31]:
# Rename the updated test DataFrame
dftest_cleaned = dftest.copy()

# Verify the cleaned test data

# Check for missing values
print(dftest_cleaned.isnull().sum())

# Check the first few rows
dftest_cleaned.head()


PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           1
Embarked       0
dtype: int64


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,S


## Feature Engineering

### Title Extraction and One-Hot Encoding in Test Dataset

In [35]:
### Title Extraction
# Extract title from the Name column
dftest_cleaned['Title'] = dftest_cleaned['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# Group rare titles into a single category 'Rare'
dftest_cleaned['Title'] = dftest_cleaned['Title'].replace(['Dr', 'Rev', 'Mlle', 'Major', 'Col', 
                                                           'Countess', 'Capt', 'Ms', 'Sir', 
                                                           'Lady', 'Mme', 'Don', 'Jonkheer'], 'Rare')

# Verify the unique titles
print(dftest_cleaned['Title'].value_counts())

### One-Hot Encoding
# Perform one-hot encoding for the Title column
dftest_cleaned = pd.get_dummies(dftest_cleaned, columns=['Title'], drop_first=True, dtype=int)

# Check the updated DataFrame
dftest_cleaned.head()


Title
Mr        240
Miss       78
Mrs        72
Master     21
Rare        6
Dona        1
Name: count, dtype: int64


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Rare
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,Q,0,0,1,0,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,S,0,0,0,1,0
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,Q,0,0,1,0,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,S,0,0,1,0,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,S,0,0,0,1,0


### Converting Sex Column in the Test Dataset

In [38]:
# Convert 'Sex' to numerical: male = 0, female = 1
dftest_cleaned['Sex'] = dftest_cleaned['Sex'].map({'male': 0, 'female': 1})

# Verify the changes
print(dftest_cleaned[['Sex']].head())


   Sex
0    0
1    1
2    0
3    0
4    1


### One-Hot Encoding the Embarked Column in Test Dataset

In [41]:
# Perform one-hot encoding for the 'Embarked' column
dftest_cleaned = pd.get_dummies(dftest_cleaned, columns=['Embarked'], drop_first=True, dtype=int)

# Verify the changes
dftest_cleaned.filter(like='Embarked_').head()


Unnamed: 0,Embarked_Q,Embarked_S
0,1,0
1,0,1
2,1,0
3,0,1
4,0,1


### Create Age Groups & One-Hot encoding for AgeGroup


In [43]:
# Define age bins and labels (same as in the training dataset)
bins = [0, 12, 19, 35, 60, 80]  # Define bin edges
labels = ['Child', 'Teenager', 'Young Adult', 'Adult', 'Senior']  # Define bin labels

# Create the AgeGroup column in the test dataset
dftest_cleaned['AgeGroup'] = pd.cut(dftest_cleaned['Age'], bins=bins, labels=labels)

# Verify the new column
print(dftest_cleaned[['Age', 'AgeGroup']].head(5))

# Perform one-hot encoding on AgeGroup
dftest_cleaned = pd.get_dummies(dftest_cleaned, columns=['AgeGroup'], drop_first=True, dtype=int)

# Verify the updated DataFrame
dftest_cleaned.head()


    Age     AgeGroup
0  34.5  Young Adult
1  47.0        Adult
2  62.0       Senior
3  27.0  Young Adult
4  22.0  Young Adult
   PassengerId  Pclass                                          Name  Sex  \
0          892       3                              Kelly, Mr. James    0   
1          893       3              Wilkes, Mrs. James (Ellen Needs)    1   
2          894       2                     Myles, Mr. Thomas Francis    0   
3          895       3                              Wirz, Mr. Albert    0   
4          896       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)    1   

    Age  SibSp  Parch   Ticket     Fare  Title_Master  Title_Miss  Title_Mr  \
0  34.5      0      0   330911   7.8292             0           0         1   
1  47.0      1      0   363272   7.0000             0           0         0   
2  62.0      0      0   240276   9.6875             0           0         1   
3  27.0      0      0   315154   8.6625             0           0         1   
4  22.0      1  

In [66]:
# Verify the updated DataFrame
dftest_cleaned.head()


Unnamed: 0,Pclass,Sex,SibSp,Parch,Fare,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Rare,Embarked_Q,Embarked_S,AgeGroup_Teenager,AgeGroup_Young Adult,AgeGroup_Adult,AgeGroup_Senior
0,3,0,0,0,7.8292,0,0,1,0,0,1,0,0,1,0,0
1,3,1,1,0,7.0,0,0,0,1,0,0,1,0,0,1,0
2,2,0,0,0,9.6875,0,0,1,0,0,1,0,0,0,0,1
3,3,0,0,0,8.6625,0,0,1,0,0,0,1,0,1,0,0
4,3,1,1,1,12.2875,0,0,0,1,0,0,1,0,1,0,0


In [45]:
#to check the information of columns
dftest_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   PassengerId           418 non-null    int64  
 1   Pclass                418 non-null    int64  
 2   Name                  418 non-null    object 
 3   Sex                   418 non-null    int64  
 4   Age                   418 non-null    float64
 5   SibSp                 418 non-null    int64  
 6   Parch                 418 non-null    int64  
 7   Ticket                418 non-null    object 
 8   Fare                  417 non-null    float64
 9   Title_Master          418 non-null    int32  
 10  Title_Miss            418 non-null    int32  
 11  Title_Mr              418 non-null    int32  
 12  Title_Mrs             418 non-null    int32  
 13  Title_Rare            418 non-null    int32  
 14  Embarked_Q            418 non-null    int32  
 15  Embarked_S            4

### Drop other irrelevant columns in Test Data Set

In [48]:
# Drop irrelevant columns in the test dataset
dftest_cleaned.drop(columns=['PassengerId', 'Name', 'Ticket', 'Age'], inplace=True)

# Verify the updated DataFrame
print(dftest_cleaned.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Pclass                418 non-null    int64  
 1   Sex                   418 non-null    int64  
 2   SibSp                 418 non-null    int64  
 3   Parch                 418 non-null    int64  
 4   Fare                  417 non-null    float64
 5   Title_Master          418 non-null    int32  
 6   Title_Miss            418 non-null    int32  
 7   Title_Mr              418 non-null    int32  
 8   Title_Mrs             418 non-null    int32  
 9   Title_Rare            418 non-null    int32  
 10  Embarked_Q            418 non-null    int32  
 11  Embarked_S            418 non-null    int32  
 12  AgeGroup_Teenager     418 non-null    int32  
 13  AgeGroup_Young Adult  418 non-null    int32  
 14  AgeGroup_Adult        418 non-null    int32  
 15  AgeGroup_Senior       4

In [52]:
dftest_cleaned.head()

Unnamed: 0,Pclass,Sex,SibSp,Parch,Fare,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Rare,Embarked_Q,Embarked_S,AgeGroup_Teenager,AgeGroup_Young Adult,AgeGroup_Adult,AgeGroup_Senior
0,3,0,0,0,7.8292,0,0,1,0,0,1,0,0,1,0,0
1,3,1,1,0,7.0,0,0,0,1,0,0,1,0,0,1,0
2,2,0,0,0,9.6875,0,0,1,0,0,1,0,0,0,0,1
3,3,0,0,0,8.6625,0,0,1,0,0,0,1,0,1,0,0
4,3,1,1,1,12.2875,0,0,0,1,0,0,1,0,1,0,0


In [62]:
import joblib

# Load the column names
training_columns = joblib.load('training_columns.pkl')
print("Training column names loaded:", training_columns)


Training column names loaded: ['Survived', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Title_Rare', 'Embarked_Q', 'Embarked_S', 'AgeGroup_Teenager', 'AgeGroup_Young Adult', 'AgeGroup_Adult', 'AgeGroup_Senior']


In [64]:
print(dftest_cleaned.columns)


Index(['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Title_Master', 'Title_Miss',
       'Title_Mr', 'Title_Mrs', 'Title_Rare', 'Embarked_Q', 'Embarked_S',
       'AgeGroup_Teenager', 'AgeGroup_Young Adult', 'AgeGroup_Adult',
       'AgeGroup_Senior'],
      dtype='object')


### Identify the Missing and Extra Columns

- Compare the one-hot encoded columns in the test dataset (dftest_cleaned) with those in the training dataset (dftraining_cleaned).

#### Load the Column Names in the Test Notebook

In [71]:
import joblib

# Load the saved training column names
training_columns = joblib.load('training_columns.pkl')
print("Training column names loaded successfully:", training_columns)


Training column names loaded successfully: ['Survived', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Title_Rare', 'Embarked_Q', 'Embarked_S', 'AgeGroup_Teenager', 'AgeGroup_Young Adult', 'AgeGroup_Adult', 'AgeGroup_Senior']


#### Identify Missing and Extra Columns

In [75]:
# Identify missing columns (present in training but not in test)
missing_cols = set(training_columns) - set(dftest_cleaned.columns)
print("Missing columns in the test dataset:", missing_cols)

# Identify extra columns (present in test but not in training)
extra_cols = set(dftest_cleaned.columns) - set(training_columns)
print("Extra columns in the test dataset:", extra_cols)


Missing columns in the test dataset: {'Survived'}
Extra columns in the test dataset: {'Title_Master'}


- Missing Column in Test Dataset: **'Survived'**
    - This column is expected in the training data as the target variable but should not exist in the test dataset. No action is needed here since Survived is not relevant for predictions.

- Extra Column in Test Dataset: **'Title_Master'**
    - This column exists in the test dataset but not in the training dataset. We need to remove it to ensure alignment.


### Handle the Extra Column (Title_Master)

In [81]:
# Remove the extra column 'Title_Master' from the test dataset
dftest_cleaned.drop(columns=['Title_Master'], inplace=True)

# Verify the columns after removal
print("Updated test dataset columns:", dftest_cleaned.columns)


Updated test dataset columns: Index(['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Title_Miss', 'Title_Mr',
       'Title_Mrs', 'Title_Rare', 'Embarked_Q', 'Embarked_S',
       'AgeGroup_Teenager', 'AgeGroup_Young Adult', 'AgeGroup_Adult',
       'AgeGroup_Senior'],
      dtype='object')


### Reorder the Columns

The columns in the test dataset (dftest_cleaned) should be in the same order as the training dataset, excluding the 'Survived' column (which is the target variable and not part of the test dataset).

In [85]:
# Reorder the columns in the test dataset to match the training dataset (excluding 'Survived')
dftest_cleaned = dftest_cleaned[[col for col in training_columns if col != 'Survived']]

# Verify the column order
print("Reordered test dataset columns:", dftest_cleaned.columns)


Reordered test dataset columns: Index(['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Title_Miss', 'Title_Mr',
       'Title_Mrs', 'Title_Rare', 'Embarked_Q', 'Embarked_S',
       'AgeGroup_Teenager', 'AgeGroup_Young Adult', 'AgeGroup_Adult',
       'AgeGroup_Senior'],
      dtype='object')


## Make Predictions

### Load the trained model

(Make sure have already saved trained model (e.g., tuned_logistic_regression_model.pkl) in the same directory as notebook.

In [89]:
import joblib

# Load the trained Logistic Regression model
loaded_model = joblib.load('tuned_logistic_regression_model.pkl')

print("Model loaded successfully!")


Model loaded successfully!


### Make Predictions on the Test Dataset

Now that the model is loaded, we’ll use it to predict the Survived column for the cleaned test dataset (dftest_cleaned).

In [96]:
# Check for missing values in the test dataset
missing_values = dftest_cleaned.isnull().sum()
print("Columns with missing values:")
print(missing_values[missing_values > 0])


Columns with missing values:
Fare    1
dtype: int64


### Handle Missing Values in Fare

    There was an error , Fare column has 1 missing value in test data.

In [103]:
# Fill the missing value in the 'Fare' column with the median of the column
dftest_cleaned['Fare'] = dftest_cleaned['Fare'].fillna(dftest_cleaned['Fare'].median())

# Verify there are no more missing values
print("Missing values in the test dataset after filling:")
print(dftest_cleaned.isnull().sum().sum())  # Should print 0



Missing values in the test dataset after filling:
0


### Re-Try Prediction

In [106]:
# Make predictions on the cleaned test dataset
predictions = loaded_model.predict(dftest_cleaned)

# Verify predictions
print("Predictions for the first 10 passengers in the test dataset:")
print(predictions[:10])  # Display the first 10 predictions


Predictions for the first 10 passengers in the test dataset:
[0 1 0 0 1 0 1 0 1 0]


## Submission File Creation

**dftest_cleaned** vs. **dftest:**

- The PassengerId column was dropped only from dftest_cleaned (preprocessed DataFrame).
- The original dftest still contains the PassengerId column because it wasn’t modified.

- No Need to Reload:
- Since dftest is still in memory, there’s no need to reload the data from the file.

In [110]:
# Create the submission DataFrame using PassengerId from the original dftest
submission = pd.DataFrame({
    'PassengerId': dftest['PassengerId'],  # Use PassengerId from the original dftest
    'Survived': predictions                # Use predictions from the model
})

# Save the submission file with the specified name
submission.to_csv('gender_submission.csv', index=False)
print("Submission file created: 'gender_submission.csv'")


Submission file created: 'gender_submission.csv'


### Locate the Current Working Directory

In [113]:
import os

# Print the current working directory
print("Current working directory:", os.getcwd())


Current working directory: C:\Users\Pulani


In [115]:
submission.to_csv(r'C:\Users\Pulani\Documents\gender_submission.csv', index=False)
