### Imports

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, balanced_accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.preprocessing import StandardScaler

### Data Preparation

In [None]:
# Load the raw data from a semicolon-separated CSV file
df = pd.read_csv("data.csv", sep=";")

# Rename columns (mapping: old name -> new name)
column_renames = {'x': 'y', 'x2': 'y2'}
df = df.rename(columns=column_renames)

# Turn the object column into a categorical column
df['Survived'] = df['Survived'].astype('category')

# Drop the columns that are not needed, both in a single call
df = df.drop(columns=['Name', 'Ticket'])

# Unify the differently spelled labels into True/False
# ('bool' would also be possible as the target dtype)
premium_labels = {"1": True, "0": False, "Yes": True, "No": False}
df['premium'] = df['premium'].replace(premium_labels).astype('category')

# Map the two categories onto 1 and 0
sex_codes = {'male': 1, 'female': 0}
df['Sex'] = df['Sex'].map(sex_codes)

# Replace the cabin number with just its deck (the first character)
df['Cabin'] = df['Cabin'].str.get(0)

# TODO: make upper/lower case spelling consistent!

### One-hot encoding for categorical values

In [None]:
# Inspect how often each value occurs before one-hot encoding the column
value_count = df['xyz'].value_counts()

# Collect the columns to encode. Only 'object' dtype columns are selected, so
# columns converted with astype('category') are left untouched here.
non_numeric_cols = df.select_dtypes(include='object').columns

# One-hot encode the selected columns; drop_first=True omits the first level
# of every encoded column.
df = pd.get_dummies(data=df, columns=non_numeric_cols, drop_first=True)

### Split the data set into labeled training data and unlabeled private data

In [None]:
# Split the data set along the NAs in the classification column:
# rows with a known label become the training data, rows without a label
# become the private data.
# .copy() makes both parts independent DataFrames, so assigning columns to
# them later does not raise SettingWithCopyWarning.
training_data = df.dropna(subset=['Classif.Col']).copy()
private_data = df[df['Classif.Col'].isna()].copy()

### Treat NAs in the training data set (and fill NAs in the private data using statistics from the training data)

In [None]:
# Show, per column, whether it holds at least one missing value
# (None, NaN, NaT, and similar).
print(df.isna().any())

# Drop the rows with NAs in columns where only a few values are missing
few_na_cols = ["Age", 'Embarked']
training_data = training_data.dropna(subset=few_na_cols)

# Fill NAs with the column mean
# NOTE(review): rows with a missing 'Age' were already dropped in the step
# above, so nothing is left to fill here; the two steps look like alternative
# strategies. Confirm which one is wanted.
age_mean = training_data['Age'].mean()
training_data['Age'] = training_data['Age'].fillna(age_mean)

# Fill NAs in the following columns with 0.0 (stated to be their median value)
zero_fill_cols = ['RoomService', 'FoodCourt']
training_data[zero_fill_cols] = training_data[zero_fill_cols].fillna(0.0)

### Check for imbalances

In [None]:
# Absolute number of rows per class of the target, to spot an imbalance
anzahl = training_data['Survived'].value_counts()
print(anzahl)

### Train test split

In [None]:
# Train-test split, stratified on the target column
train_df, test_df = train_test_split(
    training_data,
    test_size=0.20,
    stratify=training_data['Churn'],
    random_state=2023+2024,
)

# Training set: features and target
X_train = train_df.drop(columns=['Churn'])
# Alternative: X_train = train_df[features] with features = ['feature1', 'feature2', etc.]
y_train = train_df['Churn']

# Test set: features and target
X_test = test_df.drop(columns=['Churn'])
# Alternative: X_test = test_df[features] with features = ['feature1', 'feature2', etc.]
y_test = test_df['Churn']

# Optional: scaling
scaler = StandardScaler()
scaler.fit(X_train)  # Fit the scaler *only* on the train data

# transform() returns a bare NumPy array by default. Wrap the result back into
# a DataFrame so that the column names and the row index stay available to
# later cells (e.g. through X_train.columns).
X_train = pd.DataFrame(
    scaler.transform(X_train), columns=X_train.columns, index=X_train.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test), columns=X_test.columns, index=X_test.index
)

### Define the model

In [None]:
# Random forest with 1000 trees, 3 candidate features per split and a fixed
# seed so that repeated runs build the same forest
forest_params = {'n_estimators': 1000, 'max_features': 3, 'random_state': 42}
train_model = RandomForestClassifier(**forest_params)

### Apply cross validation

In [None]:
# Cross-validation: one 4-fold run that evaluates all four metrics on the
# same fitted models. Calling cross_val_score once per metric would refit the
# whole forest for every metric (16 fits instead of 4) and yield the same scores.
cv_results = cross_validate(
    train_model,
    X_train,
    y_train,
    cv=4,
    scoring=['accuracy', 'precision', 'recall', 'balanced_accuracy'],
)

# Per-fold scores, kept under the same names as before
cv_fits_accuracy = cv_results['test_accuracy']
cv_fits_precision = cv_results['test_precision']
cv_fits_recall = cv_results['test_recall']
cv_fits_BAC = cv_results['test_balanced_accuracy']

# Report the mean over the folds for every metric
print("\nCV-Accuracy:", np.mean(cv_fits_accuracy))
print("CV-Precision:", np.mean(cv_fits_precision))
print("CV-Recall:", np.mean(cv_fits_recall))
print("CV-BAC:", np.mean(cv_fits_BAC))

### Train the Random forest model

In [None]:
# Train the final model on the training part of the split.
# The features are all columns of train_df except the target 'Churn'.
# Alternatively the X_train / y_train variables can be used; note that X_train
# has been standard-scaled, whereas the columns of train_df used here are not.
train_model.fit(
    X=train_df.drop(columns=['Churn']),
    y=train_df['Churn'],
)

### Worst case: check the variable importances

In [None]:
# Variable importance plot
importance_values = train_model.feature_importances_

# Take the feature names from the frame the model was fitted on (train_df
# without the target). X_train may be a plain NumPy array after scaling and
# then has no .columns attribute.
feature_names = train_df.drop(columns=['Churn']).columns

importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importance_values})

# One bar per feature; DataFrame.plot already draws onto the returned axes,
# so no further plot call is needed before showing the figure.
imp_plot = importance_df.plot(kind='bar', x='Feature', y='Importance', legend=False)
plt.show()

### Apply model on the private data

In [None]:
# Apply the model to the private (unlabeled) data.
# The predictions are later written next to private_data['ID'], so they must
# contain exactly one entry per private row. The features are selected the
# same way as for training: every column except the target 'Churn'.
# NOTE(review): assumes private_data carries the same 'Churn' target column as
# the train/test frames and that NAs in its feature columns have been
# treated; confirm against the real column names.
private_features = private_data.drop(columns=['Churn'])
test_predictions = train_model.predict(private_features)
print(test_predictions)

# Not important: class probabilities for the same rows
test_probabilities = train_model.predict_proba(private_features)

### Write the predictions into a CSV file

In [None]:
# Store the predictions in the private data frame as column 'mag'
private_data['mag'] = test_predictions

# Reduce the frame to the 'ID' and 'mag' columns
result = private_data.loc[:, ['ID', 'mag']]

# Write the result to a CSV file, leaving out the row index
result.to_csv('predictions.csv', index=False)


In [None]:
# Alternative: build the output frame directly with its final column names
test_predictions_df = (
    private_data[['ID']]
    .rename(columns={'ID': 'PassengerId'})  # identifies the passengers
    .assign(Survived=test_predictions)      # model predictions
)

# Write the result to a CSV file, leaving out the row index
test_predictions_df.to_csv('predictions.csv', index=False)

### Optional 

In [None]:
# Evaluate the trained model on the held-out test part of the split.
# predict() returns a plain array with one label per row, so it is handed to
# the metric functions as a whole; it cannot be indexed by a column name.
# The true labels and the features both come from test_df, with 'Churn' as
# the target column throughout.
holdout_true = test_df['Churn']
holdout_predictions = train_model.predict(test_df.drop(columns=['Churn']))

# Confusion Matrix
conf_matrix = confusion_matrix(holdout_true, holdout_predictions)
print("Confusion Matrix:")
print(conf_matrix)

# BAC, Accuracy, Precision, Recall on Test Data
print("Test-BAC:", balanced_accuracy_score(holdout_true, holdout_predictions))
print("Test-Accuracy:", accuracy_score(holdout_true, holdout_predictions))
print("Test-Precision:", precision_score(holdout_true, holdout_predictions))
print("Test-Recall:", recall_score(holdout_true, holdout_predictions))