In [242]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

In [243]:
#load the data
data = pd.read_csv(".\\Titanic-Dataset.csv")
data.columns = data.columns.str.lower()
data

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [244]:
# Encode the values in the 'sex' column
data['sex'] = LabelEncoder().fit_transform(data['sex'])

In [245]:
# Fill missing values in 'embarked' column with the mode (most frequent value) and encode the values
data['embarked'] = data['embarked'].fillna(data['embarked'].mode()[0])
data['embarked'] = LabelEncoder().fit_transform(data['embarked'])


In [246]:
# Fill missing values in the 'age' column with the mean age
data['age'] = data['age'].fillna(data['age'].mean())

In [247]:
# Fill in missing values in the 'cabin' column with 'Missing'
data['cabin'] = data['cabin'].fillna('Missing')
data['cabin'] = LabelEncoder().fit_transform(data['cabin'])


In [248]:
# Fill in missing values in the 'fare' column with the median fare
data['fare'].fillna(data['fare'].median())


0       7.2500
1      71.2833
2       7.9250
3      53.1000
4       8.0500
        ...   
886    13.0000
887    30.0000
888    23.4500
889    30.0000
890     7.7500
Name: fare, Length: 891, dtype: float64

In [249]:
# Drop unnecessary columns
data = data.drop(columns=['passengerid', 'name', 'ticket', 'cabin'])


In [250]:
# Split data in test and train data
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)


In [251]:
data.dtypes
titanic_features = ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']

In [None]:
# Train the model with pycaret
from pycaret.classification import *
# Define target and setup experiment
experiment = setup(
    data=train_data,
    target='survived',
    categorical_features=titanic_features,
)
best = compare_models(sort = 'MAE', n_select=5)
print(best)
evaluate_model(best)
save_model(best, 'pycaret_model')
print("Model saved as pycaret_model.pkl!")



Unnamed: 0,Description,Value
0,Session id,7876
1,Target,survived
2,Target type,Binary
3,Original data shape,"(712, 8)"
4,Transformed data shape,"(712, 23)"
5,Transformed train set shape,"(498, 23)"
6,Transformed test set shape,"(214, 23)"
7,Categorical features,7
8,Preprocess,True
9,Imputation type,simple


ValueError: Estimator xgboost not available. Please see docstring for list of available estimators.

In [None]:
X_test = test_data[titanic_features]
y_test = test_data['survived']
X_test.head()
# Make predictions on the test data using the PyCaret model
pycaret_predictions = predict_model(tuned_xgb, data=X_test)
# Add a column to indicate if the person survived or drowned
pycaret_predictions['outcome'] = pycaret_predictions['prediction_label'].apply(lambda x: 1 if x == 1 else 0)
pycaret_comparison = pd.DataFrame({'Actual': y_test, 'Predicted': pycaret_predictions['outcome']})
print(pycaret_comparison)

# Calculate accuracy for the PyCaret model
pycaret_test_accuracy = accuracy_score(y_test, pycaret_predictions['outcome'])
print("PyCaret Model Test Accuracy:", pycaret_test_accuracy)

     Actual  Predicted
709       1          0
439       0          0
840       0          0
720       1          1
39        1          1
..      ...        ...
433       0          0
773       0          0
25        1          0
84        1          1
10        1          1

[179 rows x 2 columns]
PyCaret Model Test Accuracy: 0.8100558659217877


In [254]:
# Train the model using a Random Forest
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(train_data[titanic_features], train_data['survived'])
y_pred_rf = rf_model.predict(X_test)

# Convert predictions to binary outcomes
y_pred_rf_binary = [1 if pred >= 0.5 else 0 for pred in y_pred_rf]

# Calculate Mean Squared Error
rf_mse = mean_squared_error(y_test, y_pred_rf)
print("Random Forest Model MSE:", rf_mse)

# Calculate Accuracy
rf_accuracy = accuracy_score(y_test, y_pred_rf_binary)
print("Random Forest Model Accuracy:", rf_accuracy)

Random Forest Model MSE: 0.14089399772192648
Random Forest Model Accuracy: 0.8100558659217877
