In [75]:
import pandas as pd

# Define the path to your CSV file
file_path = 'train.csv'

# Read the CSV file using pandas
df = pd.read_csv(file_path)
test_df = pd.read_csv("test.csv")
df.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [76]:
#Preprocess the data

def preprocess_csv(df):
    df = df.copy()
    
    def normalize_name(x):
        return " ".join([v.strip(",()[].\"'") for v in x.split(" ")])
    
    def ticket_number(x):
        return float(x.split(" ")[-1]) if x.split(" ")[-1] != "LINE" else 0.0
        
    def ticket_item(x):
        items = x.split(" ")
        if len(items) == 1:
            return "NONE"
        return "_".join(items[0:-1])
    
    df["Name"] = df["Name"].apply(normalize_name)
    df["Ticket_number"] = df["Ticket"].apply(ticket_number)
    df["Ticket_item"] = df["Ticket"].apply(ticket_item)                     
    return df


preprocessed_train_df = preprocess_csv(df)
preprocessed_train_df['Age'].fillna(preprocessed_train_df['Age'].mean(), inplace=True)
preprocessed_train_df['Fare'].fillna(preprocessed_train_df['Fare'].mean(), inplace=True)
preprocessed_test_df = preprocess_csv(test_df)
preprocessed_test_df['Age'].fillna(preprocessed_test_df['Age'].mean(), inplace=True)
preprocessed_test_df['Fare'].fillna(preprocessed_test_df['Fare'].mean(), inplace=True)
preprocessed_train_df.head(10)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  preprocessed_train_df['Age'].fillna(preprocessed_train_df['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  preprocessed_train_df['Fare'].fillna(preprocessed_train_df['Fare'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace met

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Ticket_number,Ticket_item
0,1,0,3,Braund Mr Owen Harris,male,22.0,1,0,A/5 21171,7.25,,S,21171.0,A/5
1,2,1,1,Cumings Mrs John Bradley Florence Briggs Thayer,female,38.0,1,0,PC 17599,71.2833,C85,C,17599.0,PC
2,3,1,3,Heikkinen Miss Laina,female,26.0,0,0,STON/O2. 3101282,7.925,,S,3101282.0,STON/O2.
3,4,1,1,Futrelle Mrs Jacques Heath Lily May Peel,female,35.0,1,0,113803,53.1,C123,S,113803.0,NONE
4,5,0,3,Allen Mr William Henry,male,35.0,0,0,373450,8.05,,S,373450.0,NONE
5,6,0,3,Moran Mr James,male,29.699118,0,0,330877,8.4583,,Q,330877.0,NONE
6,7,0,1,McCarthy Mr Timothy J,male,54.0,0,0,17463,51.8625,E46,S,17463.0,NONE
7,8,0,3,Palsson Master Gosta Leonard,male,2.0,3,1,349909,21.075,,S,349909.0,NONE
8,9,1,3,Johnson Mrs Oscar W Elisabeth Vilhelmina Berg,female,27.0,0,2,347742,11.1333,,S,347742.0,NONE
9,10,1,2,Nasser Mrs Nicholas Adele Achem,female,14.0,1,0,237736,30.0708,,C,237736.0,NONE


In [77]:
input_features = list(preprocessed_train_df.columns)
input_features.remove("Ticket")
input_features.remove("PassengerId")
input_features.remove("Survived")
input_features.remove("Cabin")

print(f"Input features: {input_features}")

Input features: ['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Ticket_number', 'Ticket_item']


In [79]:
import torch
from collections import defaultdict
features = defaultdict(list)
label = []

for feature in input_features:
    data_values = preprocessed_train_df[feature]
    dataset = []
    for data in data_values:
        if feature in ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Ticket_number']:
            dataset.append(data)
        
        if feature == 'Sex':
            sex = 1.0 if data == 'male' else 0.0
            dataset.append(sex)
        
        if feature == 'Embarked':
            embarked = {'S': 0.0, 'C': 1.0, 'Q': 2.0}.get(data, 0.0)
            dataset.append(embarked)
    if len(dataset) > 0:
        preprocessed_train_df[feature] = torch.tensor(dataset, dtype=torch.float)
    else:
        if feature not in ['Name', 'Ticket_item']:
            print(feature)
            preprocessed_train_df[feature] = torch.tensor(data_values, dtype=torch.float)

features['Survived'] = torch.tensor(preprocessed_train_df['Survived'], dtype=torch.int)


labels_data = preprocessed_train_df['Survived']


feature_data = preprocessed_train_df.drop(columns=['Survived', 'Name', 'Ticket', 'Ticket_item', 'Cabin'])
labels = labels_data.unique()
feature_names = feature_data.columns

Index(['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
       'Embarked', 'Ticket_number'],
      dtype='object')

In [95]:
tests = defaultdict(list)
for feature in input_features:
    data_values = preprocessed_test_df[feature]
    dataset = []
    for data in data_values:
        if feature in ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Ticket_number']:
            dataset.append(data)
        
        if feature == 'Sex':
            sex = 1.0 if data == 'male' else 0.0
            dataset.append(sex)
        
        if feature == 'Embarked':
            embarked = {'S': 0.0, 'C': 1.0, 'Q': 2.0}.get(data, 0.0)
            dataset.append(embarked)
    if len(dataset) > 0:
        preprocessed_test_df[feature] = torch.tensor(dataset, dtype=torch.float)
    else:
        if feature not in ['Name', 'Ticket_item']:
            print(feature)
            preprocessed_test_df[feature] = torch.tensor(data_values, dtype=torch.float)

tests['Survived'] = torch.tensor(preprocessed_train_df['Survived'], dtype=torch.int)
test_data = preprocessed_test_df.drop(columns=['Name', 'Ticket', 'Ticket_item', 'Cabin'])
test_data.head(10)

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Ticket_number
0,892,3.0,0.0,34.5,0.0,0.0,7.8292,0.0,330911.0
1,893,3.0,0.0,47.0,1.0,0.0,7.0,0.0,363272.0
2,894,2.0,0.0,62.0,0.0,0.0,9.6875,0.0,240276.0
3,895,3.0,0.0,27.0,0.0,0.0,8.6625,0.0,315154.0
4,896,3.0,0.0,22.0,1.0,1.0,12.2875,0.0,3101298.0
5,897,3.0,0.0,14.0,0.0,0.0,9.225,0.0,7538.0
6,898,3.0,0.0,30.0,0.0,0.0,7.6292,0.0,330972.0
7,899,2.0,0.0,26.0,1.0,1.0,29.0,0.0,248738.0
8,900,3.0,0.0,18.0,0.0,0.0,7.2292,0.0,2657.0
9,901,3.0,0.0,21.0,2.0,0.0,24.15,0.0,48871.0


In [100]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(max_depth =5, random_state = 1)

clf.fit(feature_data, labels_data)

In [106]:
test_pred_decision_tree = clf.predict(test_data)
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': test_pred_decision_tree})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
