In [17]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Load the train.csv data
train_data = pd.read_csv(r"train.csv")
test_data = pd.read_csv(r"test.csv")

train_data = train_data.drop(['Ticket', 'Cabin'], axis=1)
test_data = test_data.drop(['Ticket', 'Cabin'], axis=1)
combine = [train_data, test_data]
for dataset in combine:
    dataset['Title'] = dataset.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
for dataset in combine:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col',\
 	'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
for dataset in combine:
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)
for dataset in combine:
    dataset['Sex'] = dataset['Sex'].map( {'female': 1, 'male': 0} ).astype(int)

train_data.head()
train_data = train_data.drop(['Name', 'PassengerId'], axis=1)
test_data = test_data.drop(['Name'], axis=1)
combine = [train_data, test_data]
train_data.shape, test_data.shape
guess_ages = np.zeros((2,3))
guess_ages
train_data['AgeBand'] = pd.cut(train_data['Age'], 5)
for dataset in combine:    
    dataset.loc[ dataset['Age'] <= 16, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    dataset.loc[ dataset['Age'] > 64, 'Age']
train_data.head()
for dataset in combine:
    for i in range(0, 2):
        for j in range(0, 3):
            guess_df = dataset[(dataset['Sex'] == i) & \
                                  (dataset['Pclass'] == j+1)]['Age'].dropna()

            # age_mean = guess_df.mean()
            # age_std = guess_df.std()
            # age_guess = rnd.uniform(age_mean - age_std, age_mean + age_std)

            age_guess = guess_df.median()

            # Convert random age float to nearest .5 age
            guess_ages[i,j] = int( age_guess/0.5 + 0.5 ) * 0.5
            
    for i in range(0, 2):
        for j in range(0, 3):
            dataset.loc[ (dataset.Age.isnull()) & (dataset.Sex == i) & (dataset.Pclass == j+1),\
                    'Age'] = guess_ages[i,j]

    dataset['Age'] = dataset['Age'].astype(int)

train_data.head()
train_data['AgeBand'] = pd.cut(train_data['Age'], 5)
for dataset in combine:    
    dataset.loc[ dataset['Age'] <= 16, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    dataset.loc[ dataset['Age'] > 64, 'Age']
train_data.head()
train_data = train_data.drop(['AgeBand'], axis=1)
combine = [train_data, test_data]
train_data.head()
for dataset in combine:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

train_data[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=False).mean().sort_values(by='Survived', ascending=False)
for dataset in combine:
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1

train_data[['IsAlone', 'Survived']].groupby(['IsAlone'], as_index=False).mean()
train_data = train_data.drop(['Parch', 'SibSp', 'FamilySize'], axis=1)
test_data = test_data.drop(['Parch', 'SibSp', 'FamilySize'], axis=1)
combine = [train_data, test_data]

train_data.head()
for dataset in combine:
    dataset['Age*Class'] = dataset.Age * dataset.Pclass

train_data.loc[:, ['Age*Class', 'Age', 'Pclass']].head(10)
freq_port = train_data.Embarked.dropna().mode()[0]
freq_port
for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].fillna(freq_port)
for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)
test_data['Fare'].fillna(test_data['Fare'].dropna().median(), inplace=True)
test_data.head()
train_data['FareBand'] = pd.qcut(train_data['Fare'], 4)
for dataset in combine:
    dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare']   = 2
    dataset.loc[ dataset['Fare'] > 31, 'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)

train_data = train_data.drop(['FareBand'], axis=1)
combine = [train_data, test_data]
    
train_data.head(10)

# Separate features and target variable
X_train = train_data.drop("Survived", axis=1)
Y_train = train_data["Survived"]
X_test  = test_data.drop("PassengerId", axis=1).copy()
X_train.head(), Y_train.shape, X_test.shape
# Logistic Regression


(   Pclass  Sex  Age  Fare  Embarked  Title  IsAlone  Age*Class
 0       3    0    0     0         0      1        0          0
 1       1    1    0     3         1      3        0          0
 2       3    1    0     1         0      2        1          0
 3       1    1    0     3         0      3        0          0
 4       3    0    0     1         0      1        1          0,
 (891,),
 (418, 8))

In [18]:

logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)
acc_log = round(logreg.score(X_train, Y_train) * 100, 2)
acc_log
#Corr 
coeff_df = pd.DataFrame(train_data.columns.delete(0))
coeff_df.columns = ['Feature']
coeff_df["Correlation"] = pd.Series(logreg.coef_[0])

coeff_df.sort_values(by='Correlation', ascending=False)
# Initialize classifiers for each paradigm
classifiers = {
    'Statistik - Logistic Regression': LogisticRegression(),
    'Statistik - Naive Bayes': GaussianNB(),
    'Geometri - KNN': KNeighborsClassifier(),
    'Geometri - Linear SVM': SVC(kernel='linear'),
    'Geometri - SVM (RBF kernel)': SVC(kernel='rbf'),
    'Neurosains - MLP (1 hidden layer)': MLPClassifier(hidden_layer_sizes=(100,), max_iter=500),
    'Neurosains - MLP (2 hidden layers)': MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500),
    'Logika - Decision Tree': DecisionTreeClassifier(),
    'Sosial - Random Forest': RandomForestClassifier(),
    'Sosial - Gradient Boosting': GradientBoostingClassifier()
}

# Define evaluation metrics
metrics = {
    'Accuracy': make_scorer(accuracy_score),
    'Precision': make_scorer(precision_score),
    'Recall': make_scorer(recall_score),
    'F1': make_scorer(f1_score)
}

# Perform 5-fold cross validation for each classifier and calculate performance metrics
results = []
for name, clf in classifiers.items():
    fold_results = {'Classifier': name}
    for metric_name, scorer in metrics.items():
        scores = cross_val_score(clf, X_train, Y_train, cv=KFold(n_splits=5, shuffle=True, random_state=42), scoring=scorer)
        fold_results[metric_name] = scores.mean()
    results.append(fold_results)

# Create a DataFrame to display results
results_df = pd.DataFrame(results)

# Display results
print(results_df)




                           Classifier  Accuracy  Precision    Recall        F1
0     Statistik - Logistic Regression  0.781150   0.722434  0.694545  0.708138
1             Statistik - Naive Bayes  0.660925   0.557393  0.909082  0.683285
2                      Geometri - KNN  0.805831   0.789194  0.667554  0.721211
3               Geometri - Linear SVM  0.786762   0.739372  0.679792  0.707965
4         Geometri - SVM (RBF kernel)  0.781144   0.704104  0.743309  0.721682
5   Neurosains - MLP (1 hidden layer)  0.809240   0.790978  0.668335  0.736514
6  Neurosains - MLP (2 hidden layers)  0.818178   0.795474  0.685746  0.743797
7              Logika - Decision Tree  0.801356   0.792650  0.652664  0.710121
8              Sosial - Random Forest  0.809227   0.779800  0.699807  0.720072
9          Sosial - Gradient Boosting  0.814826   0.822474  0.661437  0.729095


In [19]:
results_df

Unnamed: 0,Classifier,Accuracy,Precision,Recall,F1
0,Statistik - Logistic Regression,0.78115,0.722434,0.694545,0.708138
1,Statistik - Naive Bayes,0.660925,0.557393,0.909082,0.683285
2,Geometri - KNN,0.805831,0.789194,0.667554,0.721211
3,Geometri - Linear SVM,0.786762,0.739372,0.679792,0.707965
4,Geometri - SVM (RBF kernel),0.781144,0.704104,0.743309,0.721682
5,Neurosains - MLP (1 hidden layer),0.80924,0.790978,0.668335,0.736514
6,Neurosains - MLP (2 hidden layers),0.818178,0.795474,0.685746,0.743797
7,Logika - Decision Tree,0.801356,0.79265,0.652664,0.710121
8,Sosial - Random Forest,0.809227,0.7798,0.699807,0.720072
9,Sosial - Gradient Boosting,0.814826,0.822474,0.661437,0.729095
