In [21]:
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [22]:
# Load the data
train_data = pd.read_csv('train (1).csv')
test_data = pd.read_csv('test.csv')


Training Data: Contains features such as Passenger ID, Survival status (0 = No, 1 = Yes), Passenger class (Pclass), Name, Sex, Age, Number of siblings/spouses aboard (SibSp), Number of parents/children aboard (Parch), Ticket number, Fare, Cabin number, and Embarkation port.
Test Data: Has the same features as the training data, except for the Survival status, which is what we aim to predict.

1 . Data Exploration:

In [23]:
# Display the first few rows of each dataset to understand their structure
train_data_head = train_data.head()
test_data_head = test_data.head()


train_data_head, test_data_head


(   PassengerId  Survived  Pclass  \
 0            1         0       3   
 1            2         1       1   
 2            3         1       3   
 3            4         1       1   
 4            5         0       3   
 
                                                 Name     Sex   Age  SibSp  \
 0                            Braund, Mr. Owen Harris    male  22.0      1   
 1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
 2                             Heikkinen, Miss. Laina  female  26.0      0   
 3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
 4                           Allen, Mr. William Henry    male  35.0      0   
 
    Parch            Ticket     Fare Cabin Embarked  
 0      0         A/5 21171   7.2500   NaN        S  
 1      0          PC 17599  71.2833   C85        C  
 2      0  STON/O2. 3101282   7.9250   NaN        S  
 3      0            113803  53.1000  C123        S  
 4      0            373450   8.0500

Data Exploration Findings:
Missing Values:

Age: 177 missing values.
Cabin: 687 missing values, which is a significant portion of the dataset.
Embarked: 2 missing values.
Numerical Feature Summary:

Age: Ranges from 0.42 to 80 years. The distribution appears reasonable without obvious outliers.
SibSp (Siblings/Spouses): Most passengers have 0 or 1 sibling/spouse aboard. The maximum is 8, which is high but plausible.
Parch (Parents/Children): Similar to SibSp, most passengers have 0 parents/children aboard. The maximum is 6.
Fare: Ranges from 0 to 512.33. The maximum fare is notably high, indicating a potential outlier or simply variation in ticket prices.

In [24]:
# Checking for missing values in the training data
missing_values_train = train_data.isnull().sum()

# Summary statistics for numerical features to identify outliers
numerical_summary_train = train_data.describe()

missing_values_train, numerical_summary_train


(PassengerId      0
 Survived         0
 Pclass           0
 Name             0
 Sex              0
 Age            177
 SibSp            0
 Parch            0
 Ticket           0
 Fare             0
 Cabin          687
 Embarked         2
 dtype: int64,
        PassengerId    Survived      Pclass         Age       SibSp  \
 count   891.000000  891.000000  891.000000  714.000000  891.000000   
 mean    446.000000    0.383838    2.308642   29.699118    0.523008   
 std     257.353842    0.486592    0.836071   14.526497    1.102743   
 min       1.000000    0.000000    1.000000    0.420000    0.000000   
 25%     223.500000    0.000000    2.000000   20.125000    0.000000   
 50%     446.000000    0.000000    3.000000   28.000000    0.000000   
 75%     668.500000    1.000000    3.000000   38.000000    1.000000   
 max     891.000000    1.000000    3.000000   80.000000    8.000000   
 
             Parch        Fare  
 count  891.000000  891.000000  
 mean     0.381594   32.204208  
 std 

In [25]:
2. Data Preprocessing:

SyntaxError: invalid syntax (1754240165.py, line 1)

In [26]:
# Handling missing values
# For Age, use median to fill missing values
# For Cabin, transform it into a binary feature indicating whether a cabin was assigned or not
# For Embarked, fill missing values with the most common port

# Filling missing Age values with median
age_imputer = SimpleImputer(strategy='median')
train_data['Age'] = age_imputer.fit_transform(train_data[['Age']])

# Transforming Cabin into a binary feature
train_data['CabinAssigned'] = train_data['Cabin'].apply(lambda x: 0 if pd.isna(x) else 1)
test_data['CabinAssigned'] = test_data['Cabin'].apply(lambda x: 0 if pd.isna(x) else 1)

# Filling missing Embarked values with the most common port
common_port = train_data['Embarked'].mode()[0]
train_data['Embarked'].fillna(common_port, inplace=True)

# Convert categorical variables into numerical format
# Using One-Hot Encoding for Sex and Embarked
categorical_features = ['Sex', 'Embarked']
one_hot_encoder = OneHotEncoder()

# Apply OneHotEncoder to training data
preprocessor = ColumnTransformer(transformers=[
    ('cat', one_hot_encoder, categorical_features)
], remainder='passthrough')

train_data_transformed = preprocessor.fit_transform(train_data)

# Converting the transformed data back into a DataFrame for better readability
# Getting the new column names from the one-hot encoder
new_columns = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)
new_columns = list(new_columns) + [col for col in train_data.columns if col not in categorical_features]

# Creating the new DataFrame
train_data_preprocessed = pd.DataFrame(train_data_transformed, columns=new_columns)

# Checking the transformed data
train_data_preprocessed.head()


Unnamed: 0,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,CabinAssigned
0,0.0,1.0,0.0,0.0,1.0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,0
1,1.0,0.0,1.0,0.0,0.0,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,1
2,1.0,0.0,0.0,0.0,1.0,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,0
3,1.0,0.0,0.0,0.0,1.0,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,1
4,0.0,1.0,0.0,0.0,1.0,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,0


The data preprocessing is complete. Here's what was done:

Missing Age values were filled with the median age.
The Cabin feature was transformed into a binary feature CabinAssigned, indicating whether a cabin was assigned (1) or not (0).
Missing Embarked values were filled with the most common port.
Categorical variables (Sex, Embarked) were converted into numerical format using one-hot encoding.

3. Model Building:

In [27]:
# Splitting the data into features and target label
X = train_data_preprocessed.drop(['Survived', 'Name', 'Ticket', 'Cabin', 'PassengerId'], axis=1)
y = train_data_preprocessed['Survived'].astype('int')

# Splitting the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Building and training the models
# Logistic Regression
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

# Decision Tree
dec_tree = DecisionTreeClassifier(random_state=42)
dec_tree.fit(X_train, y_train)

# Random Forest
rand_forest = RandomForestClassifier(random_state=42)
rand_forest.fit(X_train, y_train)

# The models are now trained. Next, we'll evaluate their performance.
(log_reg, dec_tree, rand_forest)  # Indicating that the models have been successfully created and trained.


(LogisticRegression(max_iter=1000),
 DecisionTreeClassifier(random_state=42),
 RandomForestClassifier(random_state=42))

4. Model Evaluation:


Now, let's evaluate the performance of these models. We'll use metrics like accuracy, precision, recall, and F1-score to assess their performance on the validation set. This will help us understand how well each model is performing and guide us in choosing the best model for our use case.

In [28]:
# Function to calculate and return the performance metrics
def evaluate_model(model, X_val, y_val):
    # Making predictions on the validation set
    y_pred = model.predict(X_val)

    # Calculating the metrics
    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    return accuracy, precision, recall, f1

# Evaluating each model
log_reg_metrics = evaluate_model(log_reg, X_val, y_val)
dec_tree_metrics = evaluate_model(dec_tree, X_val, y_val)
rand_forest_metrics = evaluate_model(rand_forest, X_val, y_val)

log_reg_metrics, dec_tree_metrics, rand_forest_metrics

((0.8212290502793296, 0.8, 0.7567567567567568, 0.7777777777777778),
 (0.7932960893854749,
  0.7228915662650602,
  0.8108108108108109,
  0.7643312101910827),
 (0.8156424581005587,
  0.7887323943661971,
  0.7567567567567568,
  0.7724137931034482))

Model Evaluation Results
The performance metrics for each model on the validation set are as follows:

Logistic Regression:
Accuracy: 82.12%
Precision: 80.00%
Recall: 75.68%
F1-score: 77.78%

Decision Tree:
Accuracy: 79.33%
Precision: 72.29%
Recall: 81.08%
F1-score: 76.43%

Random Forest:
Accuracy: 81.56%
Precision: 78.87%
Recall: 75.68%
F1-score: 77.24%

Observations:
Logistic Regression shows the highest accuracy and precision among the three models. It has a balanced recall and precision, indicated by its F1-score.
Decision Tree has a higher recall but lower precision and accuracy compared to the other models.
Random Forest shows a good balance between precision and recall, with accuracy slightly lower than Logistic Regression but higher than the Decision Tree.

5. Model Tuning:
- Experiment with hyperparameter tuning to improve model performance.

In [30]:
# Hyperparameter tuning for Random Forest
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}

CV_rfc = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, cv= 5)
CV_rfc.fit(X_train, y_train)

# Best parameters
print('Best Parameters:', CV_rfc.best_params_)

# Evaluating the tuned model
print('Tuned Random Forest Classification Report:')
print(classification_report(y_val, CV_rfc.predict(X_val)))

150 fits failed out of a total of 450.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
150 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\IFEANYI\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\IFEANYI\anaconda3\lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "C:\Users\IFEANYI\anaconda3\lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\IFEANYI\anaconda3\lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise Inval

Best Parameters: {'criterion': 'gini', 'max_depth': 5, 'max_features': 'sqrt', 'n_estimators': 100}
Tuned Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.88      0.84       105
           1       0.80      0.70      0.75        74

    accuracy                           0.80       179
   macro avg       0.80      0.79      0.79       179
weighted avg       0.80      0.80      0.80       179

