In [1]:
from fastai.tabular.all import *
df = pd.read_csv('/kaggle/input/titanic/train.csv')
df.drop('PassengerId', axis=1, inplace=True)
df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
dls = TabularDataLoaders.from_csv('/kaggle/input/titanic/train.csv', y_names="Survived", y_block = CategoryBlock,
    cat_names = ['PassengerId', 'Pclass', 'Name', 'Sex', 'Cabin', 'Embarked', 'Ticket'],
    cont_names = ['Age', 'Fare', 'SibSp', 'Parch'],
    procs = [Categorify, FillMissing, Normalize])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  to[n].fillna(self.na_dict[n], inplace=True)


In [3]:
dls.show_batch()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Cabin,Embarked,Ticket,Age_na,Age,Fare,SibSp,Parch,Survived
0,593,3,"Elsbury, Mr. William James",male,#na#,S,A/5 3902,False,47.0,7.250001,-1.072517e-09,-9.935691e-09,0
1,362,2,"del Carlo, Mr. Sebastiano",male,#na#,C,SC/PARIS 2167,False,29.0,27.7208,1.0,-9.935691e-09,0
2,334,3,"Vander Planke, Mr. Leo Edmondus",male,#na#,S,345764,False,16.000001,18.0,2.0,-9.935691e-09,0
3,231,1,"Harris, Mrs. Henry Birkhardt (Irene Wallach)",female,C83,S,36973,False,35.0,83.475,1.0,-9.935691e-09,1
4,508,1,"Bradley, Mr. George (""George Arthur Brayton"")",male,#na#,S,111427,True,28.5,26.549999,-1.072517e-09,-9.935691e-09,1
5,585,3,"Paulner, Mr. Uscher",male,#na#,C,3411,True,28.5,8.7125,-1.072517e-09,-9.935691e-09,0
6,455,3,"Peduzzi, Mr. Joseph",male,#na#,S,A/5 2817,True,28.5,8.05,-1.072517e-09,-9.935691e-09,0
7,258,1,"Cherry, Miss. Gladys",female,B77,S,110152,False,30.0,86.499999,-1.072517e-09,-9.935691e-09,1
8,706,2,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,#na#,S,250655,False,39.0,26.0,-1.072517e-09,-9.935691e-09,0
9,761,3,"Garfirth, Mr. John",male,#na#,S,358585,True,28.5,14.499999,-1.072517e-09,-9.935691e-09,0


In [4]:
learn = tabular_learner(dls, metrics=accuracy)

In [5]:
learn.fit_one_cycle(3)

epoch,train_loss,valid_loss,accuracy,time
0,0.675051,0.702463,0.393258,00:00
1,0.58224,0.691855,0.438202,00:00
2,0.480987,0.678066,0.466292,00:00


In [6]:
learn.show_results()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Cabin,Embarked,Ticket,Age_na,Age,Fare,SibSp,Parch,Survived,Survived_pred
0,0.0,3.0,51.0,1.0,0.0,1.0,209.0,1.0,-1.274893,-0.504297,-0.462629,-0.462981,1.0,1.0
1,0.0,1.0,191.0,1.0,82.0,1.0,597.0,1.0,0.624764,0.819542,0.482513,-0.462981,1.0,1.0
2,0.0,3.0,68.0,2.0,0.0,3.0,275.0,1.0,-0.591017,-0.487333,-0.462629,-0.462981,0.0,1.0
3,0.0,3.0,534.0,2.0,0.0,2.0,453.0,2.0,-0.097106,-0.493533,-0.462629,-0.462981,0.0,1.0
4,0.0,1.0,851.0,2.0,0.0,3.0,49.0,1.0,2.296463,-0.104984,-0.462629,-0.462981,0.0,0.0
5,0.0,3.0,23.0,2.0,0.0,3.0,334.0,1.0,-1.95877,-0.00733,3.317936,2.086098,0.0,0.0
6,0.0,3.0,561.0,2.0,0.0,1.0,190.0,2.0,-0.097106,-0.338614,0.482513,0.811558,1.0,1.0
7,0.0,3.0,294.0,2.0,0.0,3.0,424.0,1.0,-1.578839,-0.229506,-0.462629,2.086098,1.0,1.0
8,0.0,3.0,194.0,1.0,0.0,3.0,509.0,1.0,-0.591017,-0.436352,-0.462629,-0.462981,0.0,1.0


Lets try now by using pandas

In [7]:
splits = RandomSplitter(valid_pct=0.2)(range_of(df))

In [8]:
to = TabularPandas(df, procs=[Categorify, FillMissing,Normalize],
                   cat_names = ['Pclass', 'Name', 'Sex', 'Cabin', 'Embarked', 'Ticket'],
                   cont_names = ['Age', 'Fare', 'SibSp', 'Parch'],
                   y_names='Survived',
                   splits=splits)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  to[n].fillna(self.na_dict[n], inplace=True)


In [9]:
to.xs.iloc[:2]

Unnamed: 0,Pclass,Name,Sex,Cabin,Embarked,Ticket,Age_na,Age,Fare,SibSp,Parch
416,2,227,1,0,3,225,1,0.366,0.015361,0.403622,0.703406
723,2,369,2,0,3,162,1,1.579349,-0.406391,-0.479147,-0.487873


In [10]:
dls2 = to.dataloaders(bs=64)

In [11]:
dls2.show_batch()

Unnamed: 0,Pclass,Name,Sex,Cabin,Embarked,Ticket,Age_na,Age,Fare,SibSp,Parch,Survived
0,2,"Butler, Mr. Reginald Fenton",male,#na#,S,234686,False,25.0,12.999999,-4.410101e-09,5.139476e-09,0.0
1,3,"Jalsevac, Mr. Ivan",male,#na#,C,349240,False,29.0,7.8958,-4.410101e-09,5.139476e-09,1.0
2,3,"Stranden, Mr. Juho",male,#na#,S,STON/O 2. 3101288,False,31.0,7.925001,-4.410101e-09,5.139476e-09,1.0
3,3,"Goodwin, Master. Sidney Leonard",male,#na#,S,CA 2144,False,1.0,46.900001,5.0,2.0,0.0
4,1,"Lines, Miss. Mary Conover",female,D28,S,PC 17592,False,16.0,39.400001,-4.410101e-09,1.0,1.0
5,3,"Ford, Miss. Robina Maggie ""Ruby""",female,#na#,S,W./C. 6608,False,9.0,34.375,2.0,2.0,0.0
6,3,"Kraeff, Mr. Theodor",male,#na#,C,349253,True,28.0,7.8958,-4.410101e-09,5.139476e-09,0.0
7,2,"Kelly, Mrs. Florence ""Fannie""",female,#na#,S,223596,False,45.0,13.5,-4.410101e-09,5.139476e-09,1.0
8,3,"Backstrom, Mr. Karl Alfred",male,#na#,S,3101278,False,32.0,15.85,1.0,5.139476e-09,0.0
9,3,"Zabour, Miss. Hileni",female,#na#,C,2665,False,14.499999,14.4542,1.0,5.139476e-09,0.0


In [12]:
learn2 = tabular_learner(dls2, metrics=accuracy)

In [13]:
learn2.fit_one_cycle(3)

epoch,train_loss,valid_loss,accuracy,time
0,0.542845,0.433542,0.606742,00:00
1,0.397512,0.461941,0.606742,00:00
2,0.313034,0.459517,0.606742,00:00


Now lets tree random forests (better at tabular data in general see: https://medium.com/geekculture/why-tree-based-models-beat-deep-learning-on-tabular-data-fcad692b1456)

The following code is from a tutorial on the titanic submissions https://www.kaggle.com/code/alexisbcook/titanic-tutorial

In [14]:
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [15]:
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [16]:
women = train_data.loc[train_data.Sex == 'female']["Survived"]
rate_women = sum(women)/len(women)

print("% of women who survived:", rate_women)

% of women who survived: 0.7420382165605095


In [17]:
men = train_data.loc[train_data.Sex == 'male']["Survived"]
rate_men = sum(men)/len(men)

print("% of men who survived:", rate_men)

% of men who survived: 0.18890814558058924


In [18]:
from sklearn.ensemble import RandomForestClassifier

y = train_data["Survived"]

features = ["Pclass", "Sex", "SibSp", "Parch"] # sqrt(12) == 3.5
X = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])

model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X, y)
predictions = model.predict(X_test)

output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})

output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


let check against trained data since test does not have survived

In [19]:
X_test_train = pd.get_dummies(train_data[features])
predictions = model.predict(X_test_train)
wrong = 0
for index, prediction in enumerate(predictions):
    if prediction != train_data.at[index, 'Survived']:
        wrong = wrong + 1
print("accuracy", 1 - (wrong / len(predictions)))

accuracy 0.8159371492704826
