In [1]:
# TODO: Add Cross-Validation for kernels

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import MinMaxScaler

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [3]:
# Load the labelled training split and the unlabelled test split of the Titanic data.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/titanic/train.csv",
        "/kaggle/input/titanic/test.csv",
    )
)

In [4]:
# Survival labels (0/1) of the female passengers in the training split.
is_female = train_data.Sex == 'female'
women = train_data.loc[is_female]["Survived"]

# Share of women who survived. Note this is a fraction in [0, 1], not a percentage.
rate_women = sum(women) / len(women)

print("% of women who survived:", rate_women)

% of women who survived: 0.7420382165605095


In [5]:
# Flag the rows whose Age was actually recorded. The guard keeps the flag intact
# when this cell is re-run after the NaNs below have already been filled
# (otherwise every row would be re-flagged as True).
if 'hasAge' not in train_data.columns:
    train_data['hasAge'] = ~train_data.Age.isna()
if 'hasAge' not in test_data.columns:
    test_data['hasAge'] = ~test_data.Age.isna()

# Initialize the MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))

# Fill NaNs without using inplace=True.
# The fill value is computed on the training data only and reused for the test
# data, so both splits go through exactly the same preprocessing and no
# statistic of the test set is used.
train_age_mean = train_data['Age'].mean()
train_data['Age'] = train_data['Age'].fillna(train_age_mean)
test_data['Age'] = test_data['Age'].fillna(train_age_mean)

# Fit on the training data, then apply the same scaling to the test data.
# [['Age']] already yields an (n, 1) array; ravel() flattens the scaled result
# back to one value per row for the column assignment.
train_data['Age_normalized'] = scaler.fit_transform(train_data[['Age']].values).ravel()
test_data['Age_normalized'] = scaler.transform(test_data[['Age']].values).ravel()

# Note: Use scaler.fit_transform() on the training data to fit the scaler and transform the data.
# Use scaler.transform() on the test data to apply the same scaling based on the training data.

In [6]:
# Initialize Fare Scaler
fareScaler = MinMaxScaler(feature_range=(0,1))

# Fill NaNs without using inplace=True.
# The fill value comes from the training data only and is reused for the test
# data, mirroring how the scaler below is fitted on train and applied to test.
train_fare_mean = train_data['Fare'].mean()
train_data['Fare'] = train_data['Fare'].fillna(train_fare_mean)
test_data['Fare'] = test_data['Fare'].fillna(train_fare_mean)

# Fit on the training data, then apply the same scaling to the test data.
# NOTE: this overwrites the raw 'Fare' column with its scaled version.
train_data['Fare'] = fareScaler.fit_transform(train_data[['Fare']].values).ravel()
test_data['Fare'] = fareScaler.transform(test_data[['Fare']].values).ravel()

train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,hasAge,Age_normalized
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,0.014151,,S,True,0.271174
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,0.139136,C85,C,True,0.472229
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,0.015469,,S,True,0.321438
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,0.103644,C123,S,True,0.434531
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,0.015713,,S,True,0.434531


In [7]:
from sklearn import svm

# Target labels for the training split.
y = train_data["Survived"]

# Candidate features not currently used: "Embarked", "hasAge".
features = ["Pclass", "Sex", "SibSp", "Parch", "Age_normalized", "Fare"]

# One-hot encode the categorical columns. get_dummies is applied to each split
# separately, so the test matrix is re-indexed to the training columns: a
# category missing from the test split becomes an all-zero column, a category
# unseen in training is dropped, and the column order matches what the model
# was fitted on.
X = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features]).reindex(columns=X.columns, fill_value=0)

# RBF-kernel SVM fitted on the full training split; its predictions on the
# test split are what gets written to the submission file.
model = svm.SVC(C=10, kernel="rbf")
model.fit(X, y)
submissionPredictions = model.predict(X_test)

In [8]:
# CHECK FOR OVERFITTING
# Accuracy of the fitted model on the same rows it was trained on. This number
# is only meaningful when compared with a held-out / cross-validated score.
from sklearn.metrics import accuracy_score

training_truth = y.to_numpy()
training_predictions = model.predict(X)

accuracy = accuracy_score(training_truth, training_predictions)
print(f"Accuracy:{accuracy}")

Accuracy:0.8316498316498316


In [9]:
# CROSS-VALIDATION

def splitIntoGroups(data, numGroups):
    """Shuffle ``data`` and split it into ``numGroups`` nearly equal folds.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to distribute over the folds.
    numGroups : int
        Number of folds to produce.

    Returns
    -------
    list of pandas.DataFrame
        ``numGroups`` frames that together contain every row of ``data``
        exactly once. The shuffle uses a fixed seed (21), so the split is
        reproducible. Each fold keeps the index of the shuffled,
        re-indexed frame.
    """
    df_shuffled = data.sample(frac=1, random_state=21).reset_index(drop=True)
    # Split the row positions rather than the frame itself: np.array_split on a
    # DataFrame goes through the deprecated DataFrame.swapaxes code path.
    position_chunks = np.array_split(np.arange(len(df_shuffled)), numGroups)
    return [df_shuffled.iloc[chunk] for chunk in position_chunks]

def singleHyperTrain(train, test):
    """Grid-search SVC hyperparameters on a single train/validation split.

    Fits one ``svm.SVC`` per (C, kernel) combination on ``train``, scores it
    on ``test`` with ``accuracy_score``, prints every score and finally the
    best-scoring combination.

    Relies on ``svm`` and ``accuracy_score`` having been imported by earlier
    cells of the notebook.

    Parameters
    ----------
    train : pandas.DataFrame
        Rows used for fitting; must contain "Survived" and the feature columns.
    test : pandas.DataFrame
        Rows used for scoring; same columns as ``train``.

    Returns
    -------
    dict
        ``{"accuracy": ..., "C": ..., "kernel": ...}`` for the best-scoring
        combination. On ties the combination tried first is kept.
    """
    # CValues = [.1, 1, 10, 100, 1000]
    CValues = [.1, 1, 10, 100, 200]
    # kernels = ["linear", "poly", "rbf", "sigmoid"]
    kernels = ["poly", "rbf"]
    # features = ["Pclass", "Sex", "SibSp", "Parch", "Age_normalized", "Embarked", "Fare", "hasAge"]
    features = ["Pclass", "Sex", "SibSp", "Parch", "Age_normalized", "Fare"]

    # The design matrices do not depend on the hyperparameters, so build them once.
    y = train["Survived"]
    y_test = test["Survived"].to_numpy()
    X = pd.get_dummies(train[features])
    # Align the validation columns with the training columns; get_dummies is
    # applied to each split separately and could otherwise disagree.
    X_test = pd.get_dummies(test[features]).reindex(columns=X.columns, fill_value=0)

    # None (instead of accuracy 0) so that a best entry with "C" and "kernel"
    # always exists for the summary print, even if every score is 0.
    topAccuracy = None
    for c_value in CValues:
        for kernel in kernels:
            testModel = svm.SVC(C=c_value, kernel=kernel)
            testModel.fit(X, y)
            # Score the candidate that was just fitted, not the notebook-level
            # `model`; otherwise every combination reports the same accuracy.
            predictions = testModel.predict(X_test)
            accuracy = accuracy_score(y_test, predictions)
            if topAccuracy is None or accuracy > topAccuracy["accuracy"]:
                topAccuracy = {"accuracy": accuracy, "C": c_value, "kernel": kernel}
            print(f"{accuracy} with C: {c_value}, kernel: {kernel}")
    print(f"top accuracy: {topAccuracy['accuracy']} with C = {topAccuracy['C']} and {topAccuracy['kernel']} kernel")
    return topAccuracy
        
def crossTrain(data):
    """Run ``singleHyperTrain`` once per fold, holding that fold out for scoring.

    Parameters
    ----------
    data : sequence of pandas.DataFrame
        The folds, e.g. the list returned by ``splitIntoGroups``. For each
        position ``i`` the remaining folds are concatenated into the training
        set and ``data[i]`` is used as the validation set.
    """
    for held_out in range(len(data)):
        # Build the training set from the folds passed in as `data`, not from
        # a notebook-level global, so the function works for any fold list.
        trainGroup = pd.concat(
            [fold for position, fold in enumerate(data) if position != held_out]
        )
        singleHyperTrain(trainGroup, data[held_out])
    
        
# groupsOfData = splitIntoGroups(train_data, 5)
# crossTrain(groupsOfData)

In [10]:
# CREATE PREDICTIONS
# Pair every test passenger id with the model's prediction and write the
# two-column submission file to the working directory.
output = pd.DataFrame(
    {
        'PassengerId': test_data["PassengerId"],
        'Survived': submissionPredictions,
    }
)
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
