In [139]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import re

In [140]:
train = pd.read_csv("./train.csv") #Datensatz angeben


In [141]:
# dropping irrelevant columns
train.drop('Cabin', axis=1, inplace=True)
train.drop('PassengerId', axis=1, inplace=True)
train.drop('Ticket', axis=1, inplace=True)
#train = train.dropna(subset=['Embarked'])
train.fillna(train['Embarked'].mode()[0])

# Replace Sex Values male=1, female=2
train['Sex'] = train['Sex'].str.lower().map({'male': 1, 'female': 2}).fillna(train['Sex'])# Count rows with NaN values in any column
train['Embarked'] = train['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})

train['Age'] = train['Age'].fillna(train['Age'].median())
#train = train.dropna(subset=['Age']) # Had no effect on the result 0,07 -> 0,08
train


Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,7.2500,2.0
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",2,38.0,1,0,71.2833,0.0
2,1,3,"Heikkinen, Miss. Laina",2,26.0,0,0,7.9250,2.0
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",2,35.0,1,0,53.1000,2.0
4,0,3,"Allen, Mr. William Henry",1,35.0,0,0,8.0500,2.0
...,...,...,...,...,...,...,...,...,...
886,0,2,"Montvila, Rev. Juozas",1,27.0,0,0,13.0000,2.0
887,1,1,"Graham, Miss. Margaret Edith",2,19.0,0,0,30.0000,2.0
888,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",2,28.0,1,2,23.4500,2.0
889,1,1,"Behr, Mr. Karl Howell",1,26.0,0,0,30.0000,0.0


In [142]:
# Modify Dataset and Exploration

nan_count = train.isna().any(axis=1).sum()
# To identify which columns have NaN values and count them
nan_columns = train.isna().sum()
print(nan_columns)

Survived    0
Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    2
dtype: int64


In [143]:
train['FamilySize'] = train['SibSp'] + train['Parch'] + 1
train['IsAlone'] = np.where(train['FamilySize'] == 1, 1, 0)

# Extract Title from Name
def get_title(name):
    title_search = re.search(r' ([A-Za-z]+)\.', name)  # Verwenden eines rohen String-Literals
    if title_search:
        return title_search.group(1)
    return ""

train['Title'] = train['Name'].apply(get_title)
train['Title'] = train['Title'].replace(['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
train['Title'] = train['Title'].replace('Mlle', 'Miss')
train['Title'] = train['Title'].replace('Ms', 'Miss')
train['Title'] = train['Title'].replace('Mme', 'Mrs')
train['Title'] = train['Title'].map({"Mr": 0, "Miss": 1, "Mrs": 2, "Master": 3, "Rare": 4})
train['Title'] = train['Title'].fillna(0)
train

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone,Title
0,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,7.2500,2.0,2,0,0
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",2,38.0,1,0,71.2833,0.0,2,0,2
2,1,3,"Heikkinen, Miss. Laina",2,26.0,0,0,7.9250,2.0,1,1,1
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",2,35.0,1,0,53.1000,2.0,2,0,2
4,0,3,"Allen, Mr. William Henry",1,35.0,0,0,8.0500,2.0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,"Montvila, Rev. Juozas",1,27.0,0,0,13.0000,2.0,1,1,4
887,1,1,"Graham, Miss. Margaret Edith",2,19.0,0,0,30.0000,2.0,1,1,1
888,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",2,28.0,1,2,23.4500,2.0,4,0,1
889,1,1,"Behr, Mr. Karl Howell",1,26.0,0,0,30.0000,0.0,1,1,0


In [144]:
features = ["Sex", "Pclass", "Fare", "Age", "FamilySize", "IsAlone", "Title"]
x = train[features]
y = train["Survived"]
# Splitting the dataset into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


# Checking the sizes of the datasets
print("x_train size:", x_train.shape)
print("x_test size:", x_test.shape)
print("y_train size:", y_train.shape)
print("y_test size:", y_test.shape)


x_train size: (712, 7)
x_test size: (179, 7)
y_train size: (712,)
y_test size: (179,)


In [145]:
def evaluate_model(y_test, y_pred):
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))

In [146]:
# Applying Logistic Regression
model = LogisticRegression(max_iter=1000, solver='saga', random_state=42)
model.fit(x_train, y_train)



In [147]:
'''
best = 1
best_acc = 0
for i in range(1, 800):
    rf_model = RandomForestClassifier(n_estimators=i, random_state=42)
    rf_model.fit(x_train, y_train)
    rf_y_pred = rf_model.predict(x_test)
    if accuracy_score(y_test, rf_y_pred) > best_acc:
        best = i
        best_acc = accuracy_score(y_test, rf_y_pred)
        
rf_model = RandomForestClassifier(n_estimators=best, random_state=42)
rf_model.fit(x_train, y_train)
rf_y_pred = rf_model.predict(x_test)
print(best)
print("Random Forest Evaluation:")
evaluate_model(y_test, rf_y_pred)
'''
rf_model = RandomForestClassifier(n_estimators=111, random_state=42)
rf_model.fit(x_train, y_train)
rf_y_pred = rf_model.predict(x_test)


In [148]:
y_pred = model.predict(x_test)

# Evaluation
print("Logistic Regression:")
evaluate_model(y_test, y_pred)

print("Random Forest Evaluation:")
evaluate_model(y_test, rf_y_pred)
# Cross-validation
cv_scores = cross_val_score(model, x, y, cv=10)
print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", cv_scores.mean())


Logistic Regression:
Accuracy: 0.7653631284916201
Confusion Matrix:
 [[95 10]
 [32 42]]
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.90      0.82       105
           1       0.81      0.57      0.67        74

    accuracy                           0.77       179
   macro avg       0.78      0.74      0.74       179
weighted avg       0.77      0.77      0.76       179

Random Forest Evaluation:
Accuracy: 0.7988826815642458
Confusion Matrix:
 [[87 18]
 [18 56]]
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.83      0.83       105
           1       0.76      0.76      0.76        74

    accuracy                           0.80       179
   macro avg       0.79      0.79      0.79       179
weighted avg       0.80      0.80      0.80       179





Cross-validation scores: [0.66666667 0.74157303 0.69662921 0.85393258 0.78651685 0.7752809
 0.79775281 0.79775281 0.86516854 0.83146067]
Mean cross-validation score: 0.7812734082397003




In [149]:
# Visual Explorative Data Analysis (EDA)
plt.figure(figsize=(10, 6))
sns.heatmap(train.corr(), annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix')
plt.show()

# Plotting the distribution of 'Age'
plt.figure(figsize=(10, 6))
sns.histplot(train['Age'], kde=True)
plt.title('Distribution of Age')
plt.show()

# Visualizing the survival rate by sex
plt.figure(figsize=(10, 6))
sns.barplot(x='Sex', y='Survived', data=train)
plt.title('Survival Rate by Sex')
plt.show()

# Visualizing the survival rate by class
plt.figure(figsize=(10, 6))
sns.barplot(x='Pclass', y='Survived', data=train)
plt.title('Survival Rate by Class')
plt.show()

ValueError: could not convert string to float: 'Braund, Mr. Owen Harris'

<Figure size 1000x600 with 0 Axes>