In [172]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from utilities import set_multiple_columns_datatype
from sklearn.svm import SVC


In [173]:
#Import data
test = pd.read_csv('data/test.csv')
train = pd.read_csv('data/train.csv')

In [174]:
columns = {"Pclass":'category', 'Embarked':'category', "Sex":'category'}
train = set_multiple_columns_datatype(train, columns)

In [175]:
#Inspect
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   PassengerId  891 non-null    int64   
 1   Survived     891 non-null    int64   
 2   Pclass       891 non-null    category
 3   Name         891 non-null    object  
 4   Sex          891 non-null    category
 5   Age          714 non-null    float64 
 6   SibSp        891 non-null    int64   
 7   Parch        891 non-null    int64   
 8   Ticket       891 non-null    object  
 9   Fare         891 non-null    float64 
 10  Cabin        204 non-null    object  
 11  Embarked     889 non-null    category
dtypes: category(3), float64(2), int64(4), object(3)
memory usage: 65.8+ KB


In [176]:
train.describe()

Unnamed: 0,PassengerId,Survived,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,38.0,1.0,0.0,31.0
max,891.0,1.0,80.0,8.0,6.0,512.3292


In [177]:
train_dummies = pd.get_dummies(train.drop(['Cabin', 'Name', 'Ticket'], axis=1))

In [178]:
X = train_dummies.drop(['Survived'], axis=1)
y = train['Survived']

X_train_dummies, X_test_dummies, y_train_dummies, y_test_dummies = train_test_split(X, y, test_size=0.2, random_state=42)

In [179]:
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'SVM': SVC()
}

pipelines = {}
for name, model in models.items():
    pipelines[name] = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
        ('classifier', model)
    ])

In [180]:
results = {}
for name, pipeline in pipelines.items():
    scores = cross_val_score(pipeline, X_train_dummies, y_train_dummies, cv=5, scoring='accuracy')
    results[name] = scores
    print(f'{name}: {scores.mean():.2f} ± {scores.std():.2f}')


Logistic Regression: 0.80 ± 0.03
Random Forest: 0.80 ± 0.01
Gradient Boosting: 0.82 ± 0.02
SVM: 0.82 ± 0.03


In [181]:
for name, pipeline in pipelines.items():
    pipeline.fit(X_train_dummies, y_train_dummies)
    y_pred = pipeline.predict(X_test_dummies)
    accuracy = accuracy_score(y_test_dummies, y_pred)
    precision = precision_score(y_test_dummies, y_pred)
    recall = recall_score(y_test_dummies, y_pred)
    f1 = f1_score(y_test_dummies, y_pred)
    print(f'\n{name} Performance on Test Set:')
    print(f'Accuracy: {accuracy:.2f}')
    print(f'Precision: {precision:.2f}')
    print(f'Recall: {recall:.2f}')
    print(f'F1 Score: {f1:.2f}')



Logistic Regression Performance on Test Set:
Accuracy: 0.79
Precision: 0.77
Recall: 0.72
F1 Score: 0.74

Random Forest Performance on Test Set:
Accuracy: 0.84
Precision: 0.84
Recall: 0.76
F1 Score: 0.79

Gradient Boosting Performance on Test Set:
Accuracy: 0.82
Precision: 0.82
Recall: 0.72
F1 Score: 0.76

SVM Performance on Test Set:
Accuracy: 0.81
Precision: 0.84
Recall: 0.66
F1 Score: 0.74


In [182]:
print(X.columns)


Index(['PassengerId', 'Age', 'SibSp', 'Parch', 'Fare', 'Pclass_1', 'Pclass_2',
       'Pclass_3', 'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q',
       'Embarked_S'],
      dtype='object')


In [184]:
from model import TitanicModelPipeline
X = train.drop(['Survived'], axis=1)
y = train['Survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model_pipeline = TitanicModelPipeline()
# Train new model
model_pipeline.fit(X_train, y_train)

# Evaluation
evaluation_results = model_pipeline.evaluate(X_test, y_test)
best_params = model_pipeline.best_params()

print(f'Best Parameters: {best_params}')
print(f'\nEvaluation Results on Test Set:')
for metric, value in evaluation_results.items():
    print(f'{metric.capitalize()}: {value:.2f}')

ValueError: 
All the 90 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
90 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    raise
  File "C:\Python312\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
  File "C:\Python312\Lib\site-packages\sklearn\pipeline.py", line 471, in fit
    if self._final_estimator != "passthrough":
     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Python312\Lib\site-packages\sklearn\pipeline.py", line 408, in _fit
    X,
       
  File "C:\Python312\Lib\site-packages\joblib\memory.py", line 353, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Python312\Lib\site-packages\sklearn\pipeline.py", line 1303, in _fit_transform_one
    be multiplied by ``weight``.
                  ^^^^^^^^^^^^^^^
  File "C:\Python312\Lib\site-packages\sklearn\utils\_set_output.py", line 295, in wrapped
    "The transformer outputs a scipy sparse matrix. "
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Python312\Lib\site-packages\sklearn\base.py", line 1101, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Python312\Lib\site-packages\sklearn\utils\_set_output.py", line 295, in wrapped
    "The transformer outputs a scipy sparse matrix. "
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Ari Castillo\Documents\Programas\projectos\Titanic\model.py", line 20, in transform
    def fit(self, X, y):
                         
  File "C:\Python312\Lib\site-packages\pandas\core\ops\common.py", line 76, in new_method
    return method(self, other)
           ^^^^^^^^^^^^^^^^^^^
  File "C:\Python312\Lib\site-packages\pandas\core\arraylike.py", line 202, in __mul__
    return self._arith_method(other, operator.mul)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Python312\Lib\site-packages\pandas\core\series.py", line 6126, in _arith_method
    return base.IndexOpsMixin._arith_method(self, other, op)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Python312\Lib\site-packages\pandas\core\base.py", line 1382, in _arith_method
    result = ops.arithmetic_op(lvalues, rvalues, op)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Python312\Lib\site-packages\pandas\core\ops\array_ops.py", line 273, in arithmetic_op
    res_values = op(left, right)
                 ^^^^^^^^^^^^^^^
  File "C:\Python312\Lib\site-packages\pandas\core\arrays\categorical.py", line 1692, in __array_ufunc__
    raise TypeError(
TypeError: Object with dtype category cannot perform the numpy op multiply
