### Titanic Survival Prediction

In [2]:
import pandas as pd

# Load the dataset
df = pd.read_csv('titanic.csv')

# Preview the first rows and the summary statistics of the numeric columns
print(df.head())
print(df.describe())

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()

   PassengerId  Survived  Pclass  \
0          892         0       3   
1          893         1       3   
2          894         0       2   
3          895         0       3   
4          896         1       3   

                                           Name     Sex   Age  SibSp  Parch  \
0                              Kelly, Mr. James    male  34.5      0      0   
1              Wilkes, Mrs. James (Ellen Needs)  female  47.0      1      0   
2                     Myles, Mr. Thomas Francis    male  62.0      0      0   
3                              Wirz, Mr. Albert    male  27.0      0      0   
4  Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female  22.0      1      1   

    Ticket     Fare Cabin Embarked  
0   330911   7.8292   NaN        Q  
1   363272   7.0000   NaN        S  
2   240276   9.6875   NaN        Q  
3   315154   8.6625   NaN        S  
4  3101298  12.2875   NaN        S  
       PassengerId    Survived      Pclass         Age       SibSp  \
count   418.0000

In [3]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder


# Remove the identifier / free-text columns that are not used as features.
df = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

# Fill missing values by assigning the result back to the column.
# `df[col].fillna(..., inplace=True)` operates on an intermediate Series; it
# raises a FutureWarning on recent pandas and does not update `df` at all when
# Copy-on-Write is enabled.
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
df['Fare'] = df['Fare'].fillna(df['Fare'].median())

# Family size = siblings/spouses + parents/children + the passenger themself.
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

# Replace missing ages with the median age.
# NOTE(review): the fill statistics here are computed on the full frame; if a
# train/validation split is made afterwards, consider fitting them on the
# training portion only.
imputer = SimpleImputer(strategy='median')
df['Age'] = imputer.fit_transform(df[['Age']])

# Encode the categorical columns as integer codes. Each column gets its own
# encoder so that both label mappings stay available (e.g. for
# inverse_transform); refitting one shared encoder would discard the first.
sex_encoder = LabelEncoder()
df['Sex'] = sex_encoder.fit_transform(df['Sex'])
embarked_encoder = LabelEncoder()
df['Embarked'] = embarked_encoder.fit_transform(df['Embarked'])

# Keep `le` bound to the most recently fitted encoder (Embarked), as before.
le = embarked_encoder

print(df.head())


   Survived  Pclass  Sex   Age  SibSp  Parch     Fare  Embarked  FamilySize
0         0       3    1  34.5      0      0   7.8292         1           1
1         1       3    0  47.0      1      0   7.0000         2           2
2         0       2    1  62.0      0      0   9.6875         1           1
3         0       3    1  27.0      0      0   8.6625         2           1
4         1       3    0  22.0      1      1  12.2875         2           3


In [4]:
from sklearn.model_selection import train_test_split

# The target is the Survived column; every remaining column is a feature
y = df['Survived']
X = df.drop(columns=['Survived'])

# Hold out 20% of the rows for validation (fixed seed for a repeatable split)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each of the four pieces on one line
print(*(part.shape for part in (X_train, X_val, y_train, y_val)))


(334, 8) (84, 8) (334,) (84,)


In [5]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Two-step estimator: scale the features, then fit a random forest
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('clf', RandomForestClassifier(random_state=42)),
])

# Hyperparameter values to try; the `clf__` prefix routes each entry to the
# 'clf' step of the pipeline
param_grid = {
    'clf__n_estimators': [100, 200, 300],
    'clf__max_depth': [None, 10, 20, 30],
    'clf__min_samples_split': [2, 5, 10],
    'clf__min_samples_leaf': [1, 2, 4],
}

# Exhaustive search over the grid, scored by accuracy with 5-fold
# cross-validation, using every available core
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Keep the winning pipeline and show the parameter combination it used
best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)


Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Parameters: {'clf__max_depth': None, 'clf__min_samples_leaf': 1, 'clf__min_samples_split': 2, 'clf__n_estimators': 100}


In [6]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Predict on the held-out validation rows
y_pred = best_model.predict(X_val)

# Compute each metric once, then report them in order
val_accuracy = accuracy_score(y_val, y_pred)
val_confusion = confusion_matrix(y_val, y_pred)
val_report = classification_report(y_val, y_pred)

print("Accuracy:", val_accuracy)
print("Confusion Matrix:\n", val_confusion)
print("Classification Report:\n", val_report)


Accuracy: 1.0
Confusion Matrix:
 [[50  0]
 [ 0 34]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        50
           1       1.00      1.00      1.00        34

    accuracy                           1.00        84
   macro avg       1.00      1.00      1.00        84
weighted avg       1.00      1.00      1.00        84



In [8]:
# Make predictions on the entire dataset.
# NOTE: X also contains the rows the model was fitted on, so these predictions
# are not an out-of-sample estimate of performance.
predictions = best_model.predict(X)

# Fetch the passenger ids from the source file. `usecols` restricts parsing to
# the one column that is needed instead of loading the whole CSV a second time.
passenger_ids = pd.read_csv('titanic.csv', usecols=['PassengerId'])['PassengerId']

# Create a DataFrame with the results (one row per passenger)
results = pd.DataFrame({
    'PassengerId': passenger_ids,
    'Survived': predictions
})

# Save the results to a CSV file (without the DataFrame index column)
results.to_csv('submission.csv', index=False)

# Display the first few rows of the submission file
print(results.head())


   PassengerId  Survived
0          892         0
1          893         1
2          894         0
3          895         0
4          896         1
