<a href="https://www.kaggle.com/code/somewhatjustin/titanic?scriptVersionId=164404534" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'titanic:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F3136%2F26502%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240223%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240223T153438Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D99256e9948a19ca191331941e6051deceb1ec9dd65fb2cea8e3cee81e523ee920cdb2ac4cc1488089bc7e4052df904195c73e3bd0d341636c44f283f397772a3bc4e43467cc48b1afa433a191952197679c7c83442df3fd92cad85101ab313472c92426c3e47710580664899f3c900e06f052790398345fd633ba3c6b80b1fa2267c04380a9b5c823a74ae91e6d33213b604be6d2e44eb8affd3a9289405fd88f50f5633083c3730adec09f8fbe333e072c96fd54f4af2d10dade48ebf0094d32e423b7eefab6287bb48549bd4e73027a92cfa1068f5461a0c82af4bcc04af26ff26fc6f1a76a3ee16693978b482c9f151c84c639685a70e45eaab326be28009'

# Directory the downloaded data sources are extracted under (one sub-directory
# per data source).
KAGGLE_INPUT_PATH='/kaggle/input'
# Working directory created alongside the input directory and symlinked as ../working.
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced anywhere in the visible part of this notebook.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
# Start from a clean input directory: remove whatever is left from a previous
# run (errors such as "does not exist" are ignored), then recreate both roots.
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

# Expose the two roots as ../input and ../working relative to the current
# directory. An already-existing link is left in place, which makes re-running
# this cell safe.
try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every configured data source into a temporary file and unpack it
# under KAGGLE_INPUT_PATH/<directory>. A source that fails to download or
# unpack is reported and skipped so the remaining sources are still processed.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Each entry is "<directory>:<url-encoded download URL>". Split on the
    # first colon only so an entry still parses if the URL part contains one.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The Content-Length header can be absent; in that case the
            # progress bar stays empty instead of crashing on int(None).
            total_bytes = int(total_length) if total_length else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_bytes)) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk before the archive is read back;
            # the tar branch reopens the file by name.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name: reusing `tarfile` here would
                # shadow the module and break the next non-zip data source.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is a subclass of OSError, so it must be handled first.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading titanic, 34877 bytes compressed
Data source import complete.


In [2]:
'''
TODO: 
- Make age buckets
- make sex binary
- combine parch and sib
- check out GridSearchCV
'''

'\nTODO: \n- Make age buckets\n- make sex binary\n- combine parch and sib\n- check out GridSearchCV\n'

In [3]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import MinMaxScaler

In [4]:
# Load the train and test splits from the extracted Titanic data source.
train_data, test_data = (
    pd.read_csv(f"/kaggle/input/titanic/{split_name}.csv")
    for split_name in ("train", "test")
)

In [5]:
# Normalize Age
scaler = MinMaxScaler(feature_range=(0, 1))

# Impute missing ages with the TRAINING mean in both splits, so the test set is
# preprocessed only with statistics learned from the training data (the same
# rule the scaler below follows). Assignment is used instead of inplace=True.
age_fill_value = train_data['Age'].mean()
train_data['Age'] = train_data['Age'].fillna(age_fill_value)
test_data['Age'] = test_data['Age'].fillna(age_fill_value)

# Fit the scaler on the training ages, then apply that same fitted scaling to
# the test ages. The scaler returns an (n, 1) array; ravel() flattens it to the
# 1-D shape of a DataFrame column.
train_data['Age_normalized'] = scaler.fit_transform(train_data[['Age']].to_numpy()).ravel()
test_data['Age_normalized'] = scaler.transform(test_data[['Age']].to_numpy()).ravel()

# Note: Use scaler.fit_transform() on the training data to fit the scaler and transform the data.
# Use scaler.transform() on the test data to apply the same scaling based on the training data.

In [6]:
# Fare Normalization
fareScaler = MinMaxScaler(feature_range=(0,1))

# Impute missing fares with the TRAINING mean in both splits, so the test set
# is preprocessed only with statistics learned from the training data.
# Assignment is used instead of inplace=True.
fare_fill_value = train_data['Fare'].mean()
train_data['Fare'] = train_data['Fare'].fillna(fare_fill_value)
test_data['Fare'] = test_data['Fare'].fillna(fare_fill_value)

# Fit the scaler on the training fares, then apply that same fitted scaling to
# the test fares. The 'Fare' column is overwritten with the scaled values;
# ravel() flattens the scaler's (n, 1) output to a 1-D column.
train_data['Fare'] = fareScaler.fit_transform(train_data[['Fare']].to_numpy()).ravel()
test_data['Fare'] = fareScaler.transform(test_data[['Fare']].to_numpy()).ravel()

In [7]:
# Passenger class relabelling: replace the numeric class codes 1/2/3 with the
# string labels "First"/"Second"/"Third" in both splits.
# NOTE(review): presumably done so pd.get_dummies one-hot encodes the column
# when "Pclass" is added to the feature list -- confirm.
class_mapping = dict(zip((1, 2, 3), ("First", "Second", "Third")))
train_data = train_data.assign(Pclass=train_data['Pclass'].map(class_mapping))
test_data = test_data.assign(Pclass=test_data['Pclass'].map(class_mapping))

In [8]:
# Make sex binary

# Encoding applied by the mapping below:
# 0: male
# 1: female

sex_mapping = {"male": 0, "female": 1}
train_data['Sex'] = train_data['Sex'].map(sex_mapping)
test_data['Sex'] = test_data['Sex'].map(sex_mapping)

In [9]:
# Cabin Classifier

def cabin_category(cabin):
    """Reduce a raw cabin value to a single category label.

    Args:
        cabin: The cabin string for one passenger, or a missing value.

    Returns:
        The first of the letters 'A'..'F' (tested in that order) that occurs
        anywhere in ``cabin``, or 'Other' when the value is missing or
        contains none of those letters.
    """
    if pd.isnull(cabin):
        return 'Other'
    # The letters are tried alphabetically, so "F E69" is classified as 'E'.
    return next((deck for deck in 'ABCDEF' if deck in cabin), 'Other')

# Derive the 'Cabin_Category' column for both splits by running every raw
# 'Cabin' value (missing ones included) through cabin_category.
train_data['Cabin_Category'] = train_data['Cabin'].map(cabin_category)
test_data['Cabin_Category'] = test_data['Cabin'].map(cabin_category)

In [10]:
# Analysis

# Count passengers per (Survived, Cabin_Category) pair in the training data:
# one row per survival outcome, one column per cabin category.
survived_by_row = train_data['Survived']
cabin_by_column = train_data['Cabin_Category']
pivot_table = pd.crosstab(survived_by_row, cabin_by_column)

# Show the resulting counts
print(pivot_table)

Cabin_Category  A   B   C   D   E  F  Other
Survived                                   
0               8  12  24   8   8  5    484
1               7  35  35  25  25  7    208


In [11]:
# Target column for the classifier.
y = train_data["Survived"]

features = ["Sex", "Age_normalized", "Fare"]
#features = ["Pclass", "Sex", "SibSp", "Parch", "Age_normalized", "Fare"]
X = pd.get_dummies(train_data[features])
# The test split is one-hot encoded on its own, so a category missing from (or
# only present in) the test data would give it different columns than X.
# Reindexing onto X's columns keeps both matrices aligned: dummy columns absent
# from the test data become all-zero, unseen ones are dropped.
X_test = pd.get_dummies(test_data[features]).reindex(columns=X.columns, fill_value=0)

# Fit on the full training set and predict the labels that go into the submission.
model = GradientBoostingClassifier(n_estimators=50, learning_rate=.5, max_depth=3).fit(X, y)
submissionPredictions = model.predict(X_test)

In [12]:
# CHECK FOR OVERFITTING
# Score the fitted model on the very rows it was trained on; the printed value
# is therefore the training accuracy, not a held-out estimate.
from sklearn.metrics import accuracy_score

training_truth = y.to_numpy()
training_predictions = model.predict(X)

accuracy = accuracy_score(y_true=training_truth, y_pred=training_predictions)
print(f"Accuracy:{accuracy}")

Accuracy:0.9281705948372615


In [13]:
# CROSS-VALIDATION

def splitIntoGroups(data, numGroups):
    """Shuffle a DataFrame and cut it into ``numGroups`` nearly equal parts.

    Args:
        data: DataFrame to split.
        numGroups: Number of groups to produce.

    Returns:
        A list of ``numGroups`` DataFrames that together contain every row of
        ``data`` exactly once. The shuffle uses a fixed seed, so the split is
        the same on every call; when the rows do not divide evenly, the
        earlier groups get one extra row.
    """
    df_shuffled = data.sample(frac=1, random_state=21).reset_index(drop=True)
    # Split row positions rather than the DataFrame itself: np.array_split on a
    # DataFrame goes through the deprecated DataFrame.swapaxes.
    position_chunks = np.array_split(np.arange(len(df_shuffled)), numGroups)
    return [df_shuffled.iloc[positions] for positions in position_chunks]

def singleHyperTrain(train, test):
    """Grid-search GradientBoostingClassifier settings on one train/held-out split.

    Fits one model on ``train`` for every combination of the estimator counts,
    tree depths and learning rates listed below, scores each on ``test`` and
    prints the accuracy of every combination followed by the best one.

    Relies on ``accuracy_score`` having been imported by an earlier cell.

    Args:
        train: DataFrame with a "Survived" column and the feature columns.
        test: Held-out DataFrame with the same columns.

    Returns:
        A dict with keys "accuracy", "estimators", "max_depth" and
        "learning_rate" describing the best combination (the first one
        evaluated wins ties).
    """
    from itertools import product

    estimators = [10, 25, 50]
    max_depth = [2, 3, 4]
    learning_rate = [.1, .25, .5, .75, 1]

    # features = ["Pclass", "Sex", "SibSp", "Parch", "Age_normalized", "Embarked", "Fare", "hasAge", "Cabin_Category"]
    features = ["Sex", "Age_normalized", "Fare"]

    # The design matrices do not depend on the hyper-parameters, so build them
    # once instead of once per grid point.
    y = train["Survived"]
    y_true = test["Survived"].to_numpy()
    X = pd.get_dummies(train[features])
    # Align the held-out columns with the training columns in case a category
    # appears in only one of the two splits.
    X_test = pd.get_dummies(test[features]).reindex(columns=X.columns, fill_value=0)

    # Start below any reachable accuracy so the first combination is always
    # recorded and every key read by the summary print exists.
    topAccuracy = {"accuracy": -1.0, "estimators": None, "max_depth": None, "learning_rate": None}

    # product() iterates in the same order as nesting the three loops:
    # estimators outermost, learning rate innermost.
    for n_est, depth, rate in product(estimators, max_depth, learning_rate):
        testModel = GradientBoostingClassifier(n_estimators=n_est, learning_rate=rate, max_depth=depth).fit(X, y)
        accuracy = accuracy_score(y_true, testModel.predict(X_test))
        if accuracy > topAccuracy["accuracy"]:
            topAccuracy = {"accuracy": accuracy, "estimators": n_est, "max_depth": depth, "learning_rate": rate}
        print(f"{accuracy} with est: {n_est}, max depth: {depth}, learning rate: {rate}")

    print(f"top accuracy: {topAccuracy['accuracy']} with estimators = {topAccuracy['estimators']} and {topAccuracy['max_depth']} max depth with learning rate of {topAccuracy['learning_rate']}")
    return topAccuracy

def crossTrain(data):
    """Run the hyper-parameter search once per fold, holding that fold out.

    Args:
        data: Sequence of DataFrames (the folds). Each fold in turn is used as
            the held-out set while all the others are concatenated into the
            training set.
    """
    # Work on the folds that were passed in rather than on a module-level
    # variable, so the function behaves the same for any list of folds.
    for held_out_index, held_out in enumerate(data):
        trainGroup = pd.concat(
            [fold for index, fold in enumerate(data) if index != held_out_index]
        )
        singleHyperTrain(trainGroup, held_out)

# Shuffle the training data and cut it into the folds used for cross-training.
groupsOfData = splitIntoGroups(data=train_data, numGroups=5)

# Cross-training is disabled by default; uncomment the call below to run it.
#crossTrain(groupsOfData)

  return bound(*args, **kwds)


In [14]:
# CREATE PREDICTIONS
# Pair every test passenger id with its predicted label and write the result
# to submission.csv without the DataFrame index.
output = pd.DataFrame(
    {
        'PassengerId': test_data['PassengerId'],
        'Survived': submissionPredictions,
    }
)
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
