In our EDA and Data Cleaning, we found some possible outliers. We haven't removed them since they seemed important and could be possible, but let's analyze them further using IForest

Import the required libraries

In [41]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

Print library versions to avoid conflicts

In [42]:
print(f'Pandas Version: {pd.__version__}') 
print(f'Numpy Version: {np.__version__}') 
print(f'Matplotlib version: {matplotlib.__version__}')
print(f'Seaborn version: {sns.__version__}')

Pandas Version: 1.5.3
Numpy Version: 1.23.5
Matplotlib version: 3.6.3
Seaborn version: 0.12.2


Add some configurations

In [43]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.precision', 3)
pd.set_option('plotting.backend', 'matplotlib') 
pd.options.mode.chained_assignment = None
np.set_printoptions(suppress=True)
pd.options.display.float_format = '{:.4f}'.format

%matplotlib inline
plt.rcParams["figure.figsize"] = (15,7)

Load train and test datasets

In [44]:
dtypes = {'PassengerId': 'int16', 'Survived': 'int8', 'SibSp': 'int8', 'Parch': 'int8', 'Sex': 'category', 'Embarked': 'category', 'Fare': 'float16', 'Age': 'float16'}

ds_titanic_train = pd.read_csv(r"C:\Users\Administrador\Documents\IA\Proyectos\Titanic\Datasets\ds_train_processed.csv", dtype = dtypes, encoding = 'unicode_escape')
ds_work = ds_titanic_train.copy()

ds_titanic_test = pd.read_csv(r"C:\Users\Administrador\Documents\IA\Proyectos\Titanic\Datasets\ds_test_processed.csv", dtype = dtypes, encoding = 'unicode_escape')
ds_test = ds_titanic_test.copy()

Define and fit the model

In [70]:
from sklearn.ensemble import IsolationForest

random_state = np.random.RandomState(42)
model = IsolationForest(n_estimators=100,max_samples='auto',contamination=float(0.3),random_state=random_state, max_features = 2)

model.fit(ds_work[["Age", "Fare"]])

print(model.get_params())

{'bootstrap': False, 'contamination': 0.3, 'max_features': 2, 'max_samples': 'auto', 'n_estimators': 100, 'n_jobs': None, 'random_state': RandomState(MT19937) at 0x260A5A0FD40, 'verbose': 0, 'warm_start': False}


Score the data to obtain anomaly scores

In [71]:
ds_work['score'] = model.decision_function(ds_work[["Age", "Fare"]])

ds_work['anomaly_score'] = model.predict(ds_work[["Age", "Fare"]])

ds_work[ds_work['anomaly_score']==-1]

Unnamed: 0,PassengerId,Survived,Name,Sex,Age,SibSp,Parch,Fare,Embarked,score,anomaly_score
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,71.3125,C,-0.0126,-1
6,7,0,"McCarthy, Mr. Timothy J",male,54.0,0,0,51.875,S,-0.0535,-1
7,8,0,"Palsson, Master. Gosta Leonard",male,2.0,3,1,21.0781,S,-0.0073,-1
10,11,1,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,16.7031,S,-0.024,-1
11,12,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,26.5469,S,-0.0372,-1
15,16,1,"Hewlett, Mrs. (Mary D Kingcome)",female,55.0,0,0,16.0,S,-0.0341,-1
16,17,0,"Rice, Master. Eugene",male,2.0,4,1,29.125,Q,-0.0056,-1
24,25,0,"Palsson, Miss. Torborg Danira",female,8.0,3,1,21.0781,S,-0.019,-1
27,28,0,"Fortune, Mr. Charles Alexander",male,19.0,3,2,263.0,S,-0.2015,-1
31,32,1,"Spencer, Mrs. William Augustus (Marie Eugenie)",female,0.0,1,0,146.5,C,-0.1597,-1


Model Evaluation

In [72]:
outliers_counter = len(ds_work[(ds_work["Age"] > 64.81) | (ds_work["Fare"] > 63.63) | (ds_work["SibSp"] >= 3) | (ds_work["Parch"] > 0)])
#outliers_counter = len(ds_work[ds_work["Fare"] > 50])
print("Accuracy percentage:", 100*list(ds_work['anomaly_score']).count(-1)/(outliers_counter))

#accuracy = 100*list(ds_work['anomaly_score']).count(-1)/(anomaly_count)
#print("Accuracy of the model:", accuracy)

Accuracy percentage: 91.75257731958763


In [69]:
#len(ds_work[(ds_work["Age"] > 64.81) | (ds_work["Fare"] > 63.63) | (ds_work["SibSp"] >= 3) | (ds_work["Parch"] > 0)]) / len(ds_work)

0.32733408323959506

In [86]:
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score
import numpy as np

In [87]:
X = ds_work[["Age", "Fare", "SibSp", "Parch"]]
y = ds_work["Survived"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [88]:
clf = IsolationForest(random_state=0, contamination = 0.3)
clf.fit(X_train)
y_pred = clf.predict(X_test)

In [89]:
pred = pd.DataFrame({'pred': y_pred})
pred['y_pred'] = np.where(pred['pred'] == -1, 1, 0)
y_pred = pred['y_pred'] 
print("Precision:", precision_score(y_test, y_pred))

Precision: 0.5903614457831325
