### Imports

In [11]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
import matplotlib.pyplot as plt

### Handling the Test Data

In [12]:
test = pd.read_csv('test.csv')

#Replace Gender with 1 and 0
test ['Sex'] = test['Sex'].map({'male': 1, 'female': 0})

#Drop unnecessary columns
test = test.drop('Name', axis=1)
test = test.drop('Ticket', axis=1)

#Ersetze die Cabin Nummer nur mit dem zugehörigen Deck 
test['Cabin'] = test['Cabin'].str[0]

# Identify non-numeric columns
non_numeric_cols = test.select_dtypes(include=['object']).columns

# Apply one-hot encoding to non-numeric columns
test = pd.get_dummies(test, columns=non_numeric_cols, drop_first=True)

test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Sex          418 non-null    int64  
 3   Age          332 non-null    float64
 4   SibSp        418 non-null    int64  
 5   Parch        418 non-null    int64  
 6   Fare         417 non-null    float64
 7   Cabin_B      418 non-null    bool   
 8   Cabin_C      418 non-null    bool   
 9   Cabin_D      418 non-null    bool   
 10  Cabin_E      418 non-null    bool   
 11  Cabin_F      418 non-null    bool   
 12  Cabin_G      418 non-null    bool   
 13  Embarked_Q   418 non-null    bool   
 14  Embarked_S   418 non-null    bool   
dtypes: bool(8), float64(2), int64(5)
memory usage: 26.3 KB


### Handling the Training Data

In [13]:
#read Training Data set
training = pd.read_csv('train.csv')

#Drop NAs for Age and Embarked because those are only few values
training = training.dropna(subset=["Age", 'Embarked'])

#Fill NAs for cabin because otherwise too much data would be lost
training['Cabin'] = training['Cabin'].fillna('NoInfo')

#Replace Gender with 1 and 0
training ['Sex'] = training['Sex'].map({'male': 1, 'female': 0})

#Transform Objects into categories
training['Survived'] = training['Survived'].astype('category')

#Ersetze die Cabin Nummer nur mit dem zugehörigen Deck 
training['Cabin'] = training['Cabin'].str[0]

#Drop unnecessary columns
training = training.drop('Name', axis=1)
training = training.drop('Ticket', axis=1)

#Info about training set
training.info()

<class 'pandas.core.frame.DataFrame'>
Index: 712 entries, 0 to 890
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   PassengerId  712 non-null    int64   
 1   Survived     712 non-null    category
 2   Pclass       712 non-null    int64   
 3   Sex          712 non-null    int64   
 4   Age          712 non-null    float64 
 5   SibSp        712 non-null    int64   
 6   Parch        712 non-null    int64   
 7   Fare         712 non-null    float64 
 8   Cabin        712 non-null    object  
 9   Embarked     712 non-null    object  
dtypes: category(1), float64(2), int64(5), object(2)
memory usage: 56.4+ KB


### One Hot Encoding für das Trainings Daten Set

In [14]:
# Identify non-numeric columns
non_numeric_cols = training.select_dtypes(include=['object']).columns

# Apply one-hot encoding to non-numeric columns
training = pd.get_dummies(training, columns=non_numeric_cols, drop_first=True)

training = training.drop('Cabin_T', axis=1)
training = training.drop('Cabin_N', axis=1)
training.info()

<class 'pandas.core.frame.DataFrame'>
Index: 712 entries, 0 to 890
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   PassengerId  712 non-null    int64   
 1   Survived     712 non-null    category
 2   Pclass       712 non-null    int64   
 3   Sex          712 non-null    int64   
 4   Age          712 non-null    float64 
 5   SibSp        712 non-null    int64   
 6   Parch        712 non-null    int64   
 7   Fare         712 non-null    float64 
 8   Cabin_B      712 non-null    bool    
 9   Cabin_C      712 non-null    bool    
 10  Cabin_D      712 non-null    bool    
 11  Cabin_E      712 non-null    bool    
 12  Cabin_F      712 non-null    bool    
 13  Cabin_G      712 non-null    bool    
 14  Embarked_Q   712 non-null    bool    
 15  Embarked_S   712 non-null    bool    
dtypes: bool(8), category(1), float64(2), int64(5)
memory usage: 50.9 KB


### Check for imbalances in the Data set

In [15]:
#Check for imbalance
anzahl = training['Survived'].value_counts(normalize=False)
print(anzahl)


Survived
0    424
1    288
Name: count, dtype: int64


### Definiere das Random Forest Modell und trainiere es auf die Variable 'Survived'

In [16]:
# Define Model
train_model = RandomForestClassifier(n_estimators=500, max_features=3, random_state=0)

# Train a random forest model
X = training.drop(columns=['Survived'])
y = training['Survived']

### Cross Validation mit einem 4k fold

In [17]:
# Cross-validation
cv_fits_accuracy = cross_val_score(train_model, X, y, cv=4, scoring='accuracy')
cv_fits_precision = cross_val_score(train_model, X, y, cv=4, scoring='precision')
cv_fits_recall = cross_val_score(train_model, X, y, cv=4, scoring='recall')

print("\nCV-Accuracy:", np.mean(cv_fits_accuracy))
print("CV-Precision:", np.mean(cv_fits_precision))
print("CV-Recall:", np.mean(cv_fits_recall))


CV-Accuracy: 0.8075842696629214
CV-Precision: 0.8495578701362906
CV-Recall: 0.6631944444444444


### Train final Model

In [18]:
# Train the final model
train_model.fit(training.drop(columns=['Survived']), training['Survived'])

### Teste das Modell auf unserem Test Datensatz

In [19]:
# Apply on test set
test_predictions = train_model.predict(test)
print(test_predictions)

[0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 1 0 0 0 0 1 1 1 1 1 0 1 0 0 0 0 0 0 0 1 0 0
 0 0 1 0 0 0 1 1 0 1 0 1 1 0 0 1 1 0 0 0 0 0 1 0 0 0 0 1 1 0 0 1 1 0 0 0 1
 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 1 0 1 0 0 1 1 1 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0
 0 0 1 0 0 1 0 0 1 0 0 1 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 1 1 1 1 1 0 1 1 0 1
 0 1 0 0 0 0 0 1 0 1 0 1 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0
 1 0 1 1 0 0 0 0 0 1 0 0 1 0 1 0 1 1 1 1 1 0 0 0 1 0 1 1 1 0 0 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 1 1 0
 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 1 0 0 0 1 1 1 0 0 0 0 1 1 0 1 0 0 0 1 0
 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 1 0 1 0 0
 0 0 1 0 1 0 0 1 0 0 0]


### Schreibe die Vorhersagen in eine Variable und von dort in die benötigte Prediction.csv Datei

In [20]:
# Erstelle ein DataFrame mit den Vorhersagen und Wahrscheinlichkeiten
test_predictions_df = pd.DataFrame({
    'PassengerId': test['PassengerId'],  # Identifiziere Passagiere  
    'Survived': test_predictions,  # Modellvorhersagen
})
test_predictions_df.to_csv('prediction_file.csv', index=False)