# Import Library

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, root_mean_squared_error, f1_score, roc_auc_score, accuracy_score, precision_score, recall_score, r2_score, confusion_matrix, classification_report
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.naive_bayes import GaussianNB

# Read Data

In [None]:
# Load the CSV; the path is relative to the notebook's working directory.
data = pd.read_csv("Data/train.csv")
# Bare expression so the notebook renders the DataFrame as the cell output.
data

# Simple Preprocessing

In [None]:
# Move the "Survived" column to the last position so that later cells can
# address the features as data.columns[:-1] and the target as data.columns[-1].
data_out = data.pop("Survived")
data.insert(loc=len(data.columns), column="Survived", value=data_out)
data

In [None]:
data = data.drop(columns=["PassengerId", "Name", "Ticket"])

In [None]:
data

# EDA

In [None]:
# Column labels of the working DataFrame.
data.columns

In [None]:
# First rows of the DataFrame.
data.head()

In [None]:
# Summary statistics of the columns.
data.describe()

In [None]:
# Per-column dtype and non-null count.
data.info()

In [None]:
# (rows, columns) of the DataFrame.
data.shape

In [None]:
# dtype of every column; later cells branch on dtype == "object".
data.dtypes

In [None]:
# Histogram of every non-object feature; the trailing target column is skipped.
numeric_features = [name for name in data.columns[:-1] if data[name].dtype != "object"]
for name in numeric_features:
    sns.histplot(data=data, x=name)
    plt.show()

In [None]:
# Box plot of every non-object feature; the trailing target column is skipped.
numeric_features = [name for name in data.columns[:-1] if data[name].dtype != "object"]
for name in numeric_features:
    sns.boxplot(data=data, x=name)
    plt.show()

In [None]:
# Scatter plot of every non-object feature against the "Survived" column.
numeric_features = [name for name in data.columns[:-1] if data[name].dtype != "object"]
for name in numeric_features:
    sns.scatterplot(data=data, x=name, y="Survived")
    plt.show()

In [None]:
# Print skewness and kurtosis of every non-object feature (target excluded).
for name in data.columns[:-1]:
    feature = data[name]
    if feature.dtype == "object":
        continue
    print(f"Column {name}, Skewness: {feature.skew()}, Kurtosis: {feature.kurt()}")

# Data Preprocessing

In [None]:
data.isna().sum()

In [None]:
missing_value_percentage = data.isna().sum() / len(data)
missing_value_percentage

In [None]:
for key, value in missing_value_percentage.items():
    if (value > 0.5):
        data = data.drop(columns=key)

In [None]:
data

In [None]:
# Impute missing feature values: most frequent value for object columns,
# median for everything else. The last column (target) is left untouched.
for name in data.columns[:-1]:
    feature = data[name]
    if not feature.isna().any():
        continue
    fill_value = feature.mode()[0] if feature.dtypes == "object" else feature.median()
    data[name] = feature.fillna(fill_value)

In [None]:
# First rows of the DataFrame at this point of the notebook.
data.head()

In [None]:
# Remaining missing values per column.
data.isna().sum()

In [None]:
# Column dtypes; object columns are the ones encoded in the next cell.
data.dtypes

In [None]:
encoder = LabelEncoder()
encoded = {}

for column in data.columns[:-1]:
    if (data[column].dtype == "object"):
        data[column] = encoder.fit_transform(data[column])
        encoded[column] = {i:class_name for i, class_name in enumerate(encoder.classes_)}

In [None]:
encoded

In [None]:
for key, value in encoded.items():
    print (f"Key: {key}, Value: {value}")

# Modelling

### Pisahkan data independent dan data dependent

In [None]:
x = data[data.columns[:-1]]
y = data[data.columns[-1]]

In [None]:
x.head()

In [None]:
y.head()

### Splitting Data

In [None]:
# Hold out 20% of the rows for testing. random_state pins the shuffle so the
# metrics printed by the following cells are reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

### Buat Model

In [None]:
# Fit a linear regression on the training split.
# NOTE(review): the target is the "Survived" column, presumably binary, so the
# predictions below are continuous scores rather than class labels - confirm
# this is intended.
model = LinearRegression()
model.fit(X=x_train, y=y_train)

In [None]:
# Predictions for the held-out rows.
y_pred = model.predict(x_test)
y_pred

In [None]:
# Report the fitted intercept and the coefficient vector of the linear model.
intercept, koefisien = model.intercept_, model.coef_

print(f"Linear Regression Model: {intercept} + {koefisien}x")

# Evaluation

In [None]:
# Test-set error metrics of the linear model.
mse = mean_squared_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

for label, value in (("MSE", mse), ("RMSE", rmse), ("R2 Score", r2)):
    print(f"{label}: {value}")

In [None]:
y_train_pred = model.predict(x_train)

In [None]:
train_mse = mean_squared_error(y_train_pred, y_train)
print (f"MSE: {train_mse}")

In [None]:
# Check whether the model overfits: it is flagged whenever the training MSE
# is lower than the test MSE.
overfit_message = (
    "\nModel mengalami overfitting, karena kesalahan pada data training lebih rendah dibandingkan pada data testing."
    if train_mse < mse
    else "\nModel tidak mengalami overfitting."
)
print(overfit_message)

# Regularization

Intinya digunakan untuk mengurangi overfitting

In [None]:
from sklearn.linear_model import Lasso, Ridge

In [None]:
# Regularised linear models, both created with the same penalty strength
# (alpha=0.2).
# NOTE(review): `rigde_model` looks like a typo for `ridge_model`; the name is
# kept because the following cells reference it.
rigde_model = Ridge(alpha=0.2)
lasso_model = Lasso(alpha=0.2)

In [None]:
# Intentionally no new train_test_split here: Ridge and Lasso reuse the
# x_train / x_test / y_train / y_test split created for the LinearRegression
# model. Drawing a fresh random split would make the comparisons below (against
# `train_mse` and against the unregularised test MSE) compare different rows.

In [None]:
# Fit both regularised models on the current training split.
rigde_model.fit(X=x_train, y=y_train)
lasso_model.fit(X=x_train, y=y_train)

In [None]:
# Test-set predictions of each regularised model.
ridgePred = rigde_model.predict(x_test)
lassoPred = lasso_model.predict(x_test)

In [None]:
# Ridge: test-set metrics.
# Every metric receives (y_true, y_pred). R2 is not symmetric, so passing the
# predictions first reports a different (wrong) score.

mse = mean_squared_error(y_test, ridgePred)
print (f"MSE: {mse}")

r2 = r2_score(y_test, ridgePred)
print (f"R2 Score: {r2}")

a = root_mean_squared_error(y_test, ridgePred)
print (f"RMSE: {a}")

In [None]:
# Check whether the Ridge model overfits, using Ridge's OWN training and test
# errors. The `train_mse` computed earlier belongs to the LinearRegression fit,
# so it must not be compared against Ridge's test error.
ridge_train_mse = mean_squared_error(y_train, rigde_model.predict(x_train))
ridge_test_mse = mean_squared_error(y_test, ridgePred)
if ridge_train_mse < ridge_test_mse:
    print("\nModel mengalami overfitting, karena kesalahan pada data training lebih rendah dibandingkan pada data testing.")
else:
    print("\nModel tidak mengalami overfitting.")

In [None]:
# Lasso: test-set metrics.
# Every metric receives (y_true, y_pred). R2 is not symmetric, so passing the
# predictions first reports a different (wrong) score.

mse = mean_squared_error(y_test, lassoPred)
print (f"MSE: {mse}")

r2 = r2_score(y_test, lassoPred)
print (f"R2 Score: {r2}")

a = root_mean_squared_error(y_test, lassoPred)
print (f"RMSE: {a}")

In [None]:
# Check whether the Lasso model overfits, using Lasso's OWN training and test
# errors. The `train_mse` computed earlier belongs to the LinearRegression fit,
# so it must not be compared against Lasso's test error.
lasso_train_mse = mean_squared_error(y_train, lasso_model.predict(x_train))
lasso_test_mse = mean_squared_error(y_test, lassoPred)
if lasso_train_mse < lasso_test_mse:
    print("\nModel mengalami overfitting, karena kesalahan pada data training lebih rendah dibandingkan pada data testing.")
else:
    print("\nModel tidak mengalami overfitting.")

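Terlihat bahwa hasilnya sekarang justru cenderung underfitting, karena MSE pada data test tanpa regularization lebih kecil dibandingkan dengan regularization. Catatan: perbandingan ini hanya valid jika semua model dilatih dan dievaluasi pada pembagian train/test yang sama; jika pembagiannya berbeda, selisih MSE bisa saja berasal dari perbedaan data, bukan dari regularization.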

# Classification

## Logistic Regression

In [None]:
model2 = LogisticRegression()
model2.fit(X=x_train, y=y_train)

In [None]:
y_pred2 = model2.predict(x_test)
y_pred2

In [None]:
# Regression-style errors computed on the hard class predictions, kept so the
# numbers can be compared with the regression models above.
mse = mean_squared_error(y_test, y_pred2)
r2 = r2_score(y_test, y_pred2)
rmse = root_mean_squared_error(y_test, y_pred2)

# Classification metrics on the hard class predictions.
precission = precision_score(y_test, y_pred2)
recall = recall_score(y_test, y_pred2)
accuracy = accuracy_score(y_test, y_pred2)
# ROC AUC ranks samples by score, so it is fed the positive-class probability
# (column 1 of predict_proba) instead of the thresholded labels, matching the
# scaled-model cells later in the notebook.
roc = roc_auc_score(y_test, model2.predict_proba(x_test)[:, 1])

confusion = confusion_matrix(y_test, y_pred2)
classification = classification_report(y_test, y_pred2)


print (f"MSE: {mse}")
print (f"RMSE: {rmse}")
print (f"R2 Score: {r2}")

print (f"Precission: {precission}")
print (f"Recall: {recall}")
print (f"Accuracy Score: {accuracy}")
print (f"ROC Score: {roc}")

print(f"Confusion Matrix: {confusion}")
print(f"Classification Report: {classification}")

## Naive Bayes

In [None]:
# Gaussian Naive Bayes classifier fitted on the same training split that the
# logistic regression above used.
model3 = GaussianNB()
model3.fit(X=x_train, y=y_train)

In [None]:
# Hard class predictions for the held-out rows.
y_pred3 = model3.predict(x_test)
y_pred3

In [None]:
# Regression-style errors computed on the hard class predictions, kept so the
# numbers can be compared with the other models.
mse = mean_squared_error(y_test, y_pred3)
r2 = r2_score(y_test, y_pred3)
rmse = root_mean_squared_error(y_test, y_pred3)

# Classification metrics on the hard class predictions.
precission = precision_score(y_test, y_pred3)
recall = recall_score(y_test, y_pred3)
accuracy = accuracy_score(y_test, y_pred3)
# ROC AUC ranks samples by score, so it is fed the positive-class probability
# (column 1 of predict_proba) instead of the thresholded labels, matching the
# scaled-model cells later in the notebook.
roc = roc_auc_score(y_test, model3.predict_proba(x_test)[:, 1])

confusion = confusion_matrix(y_test, y_pred3)
classification = classification_report(y_test, y_pred3)


print (f"MSE: {mse}")
print (f"RMSE: {rmse}")
print (f"R2 Score: {r2}")

print (f"Precission: {precission}")
print (f"Recall: {recall}")
print (f"Accuracy Score: {accuracy}")
print (f"ROC Score: {roc}")

print(f"Confusion Matrix: {confusion}")
print(f"Classification Report: {classification}")

Karena nilai metrik evaluasi Logistic Regression lebih tinggi daripada Naive Bayes, kesimpulan yang dapat diambil adalah:
1. Dataset tersebut mengandung berbagai data yang saling berkaitan atau memiliki korelasi satu dengan yang lainnya (karena, Naive Bayes akan tinggi nilainya jika dataset independent, akibat asumsi teori Naive tersebut. Akan tetapi, karena Logistic Regression nilai evaluasinya lebih tinggi, kita bisa mengambil kesimpulan bahwa dataset tersebut memiliki data-data yang cukup saling berkorelasi satu dengan yang lainnya)

2. Data tidak berdistribusi normal (karena nilai evaluasi dari Logistic Regression lebih tinggi)

3. Pemisahan kelas / fitur bersifat linear

4. Data memiliki noise yang bisa dikatakan rendah (karena jika noise tinggi, maka seharusnya akurasi Naive Bayes bisa lebih tinggi dibandingkan dengan Logistic Regression).

5. Dataset memiliki ukuran/jumlah baris yang cukup untuk melakukan prediksi

# Additional

## Correlation Feature - Preprocessing

In [None]:
def correlation(data_df, threshold):
    """Return the names of columns that are strongly correlated with an earlier column.

    The pairwise correlation matrix of ``data_df`` is computed and printed.
    For every pair of columns (first, second) with ``first`` positioned before
    ``second``, the name of ``second`` is collected when the absolute
    correlation of the pair is strictly greater than ``threshold``.

    Args:
        data_df: DataFrame whose columns are compared.
        threshold: Absolute-correlation cut-off (exclusive).

    Returns:
        A set of column names.
    """
    corr_matrix = data_df.corr()
    print(corr_matrix)
    names = corr_matrix.columns
    strength = corr_matrix.abs().to_numpy()
    # Only the upper triangle is visited, so each pair is examined once and a
    # column is never compared with itself.
    return {
        names[later]
        for earlier in range(len(names))
        for later in range(earlier + 1, len(names))
        if strength[earlier, later] > threshold
    }

In [None]:
corr_col = correlation(x_train, threshold = 0.9)
print(corr_col)
print(len(corr_col))

In [None]:
x_train = x_train.drop(columns = corr_col, axis = 1)
x_test = x_test.drop(columns = corr_col, axis = 1)

In [None]:
x_train

## Scaling

In [None]:
# One unfitted scaler per strategy; fitting happens inside scaling().
scaler_x_minmax = MinMaxScaler()
scaler_x_standard = StandardScaler()

In [None]:
def scaling(x1, x2, scaler_x):
    """Fit ``scaler_x`` on ``x1`` and apply the fitted transform to ``x2`` as well.

    The scaler only learns from ``x1``; ``x2`` is transformed with the
    parameters learned from ``x1``.

    Args:
        x1: Data used to fit the scaler (and transformed).
        x2: Data transformed with the already fitted scaler.
        scaler_x: Object exposing ``fit_transform`` and ``transform``.

    Returns:
        Tuple ``(scaled_x1, scaled_x2, scaler_x)``.
    """
    fitted_part = scaler_x.fit_transform(x1)
    return fitted_part, scaler_x.transform(x2), scaler_x

In [None]:
# MinMax-scaled train/test features; the scaler is fitted on x_train only.
x_train1, x_test1, scaler_x_minmax = scaling(x_train, x_test, scaler_x_minmax)

In [None]:
# Standard-scaled train/test features; the scaler is fitted on x_train only.
x_train2, x_test2, scaler_x_standard = scaling(x_train, x_test, scaler_x_standard)

In [None]:
# LogisticRegression trained on the MinMaxScaler() features.
# fit() returns the estimator itself, so both names refer to the fitted model.
history = model = LogisticRegression().fit(x_train1, y_train)
y_pred = model.predict(x_test1)
y_pred_prob = model.predict_proba(x_test1)[:, 1]
print(
    f"accuracy_score : {accuracy_score(y_test, y_pred)}, "
    f"recall_score : {recall_score(y_test, y_pred)}, "
    f"precision_score : {precision_score(y_test, y_pred)}, "
    f"roc_auc_score : {roc_auc_score(y_test, y_pred_prob)}"
)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# LogisticRegression trained on the StandardScaler() features.
# fit() returns the estimator itself, so both names refer to the fitted model.
history = model = LogisticRegression().fit(x_train2, y_train)
y_pred = model.predict(x_test2)
y_pred_prob = model.predict_proba(x_test2)[:, 1]
print(
    f"accuracy_score : {accuracy_score(y_test, y_pred)}, "
    f"recall_score : {recall_score(y_test, y_pred)}, "
    f"precision_score : {precision_score(y_test, y_pred)}, "
    f"roc_auc_score : {roc_auc_score(y_test, y_pred_prob)}"
)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# GaussianNB trained on the MinMaxScaler() features.
# fit() returns the estimator itself, so both names refer to the fitted model.
history = model = GaussianNB().fit(x_train1, y_train)
y_pred = model.predict(x_test1)
y_pred_prob = model.predict_proba(x_test1)[:, 1]
print(
    f"accuracy_score : {accuracy_score(y_test, y_pred)}, "
    f"recall_score : {recall_score(y_test, y_pred)}, "
    f"precision_score : {precision_score(y_test, y_pred)}, "
    f"roc_auc_score : {roc_auc_score(y_test, y_pred_prob)}"
)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# GaussianNB trained on the StandardScaler() features.
# fit() returns the estimator itself, so both names refer to the fitted model.
history = model = GaussianNB().fit(x_train2, y_train)
y_pred = model.predict(x_test2)
y_pred_prob = model.predict_proba(x_test2)[:, 1]
print(
    f"accuracy_score : {accuracy_score(y_test, y_pred)}, "
    f"recall_score : {recall_score(y_test, y_pred)}, "
    f"precision_score : {precision_score(y_test, y_pred)}, "
    f"roc_auc_score : {roc_auc_score(y_test, y_pred_prob)}"
)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Scaling adalah proses yang digunakan untuk mengubah rentang data atau skala fitur (variabel independen) agar berada dalam rentang yang seragam.

Sekarang justru evaluasi menggunakan Naive Bayes lebih tinggi :)

# Predicted Values vs Actual Values Comparison

In [None]:
# 1. Compare predicted values with actual values.
fig, ax = plt.subplots(figsize=(8, 6))

# Scatter of actual (x axis) against predicted (y axis) values.
ax.scatter(y_test, y_pred2, color='blue', label='Data Points', alpha=0.6)
# Reference line: a perfect prediction lies on y = x.
ax.plot([0, 1], [0, 1], color='red', lw=2, label="Ideal Line (y = x)")
ax.set_xlabel('Actual Values')
ax.set_ylabel('Predicted Values')
ax.set_title('Predicted Values vs Actual Values')
ax.legend()
plt.show()

In [None]:
# Number of test samples whose predicted label differs from the actual label.
mismatches = y_test != y_pred2
errors = mismatches.sum()
print(f'Jumlah kesalahan prediksi: {errors} dari {len(y_test)} sampel')

In [None]:
# Confusion matrix of the logistic-regression predictions, drawn as an
# annotated heat map (rows: actual class, columns: predicted class).
conf_matrix = confusion_matrix(y_test, y_pred2)
heatmap_options = {
    "annot": True,
    "fmt": 'd',
    "cmap": 'Blues',
    "xticklabels": ['Predicted 0', 'Predicted 1'],
    "yticklabels": ['Actual 0', 'Actual 1'],
}
sns.heatmap(conf_matrix, **heatmap_options)
plt.title('Confusion Matrix')
plt.show()

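Berarti Accuracy = (TN + TP) / jumlah seluruh sampel uji = (93 + 53) / 179 ≈ 0,816
<br>
Predicted Values yang benar: 146 dari 179 sampel (angka ini berasal dari satu kali run; hasil dapat berbeda jika pembagian data berubah)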

## Scaling Linear Regression

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

In [None]:
x_train = x_train.to_numpy()
x_test = x_test.to_numpy()

In [None]:
y_train = np.reshape(y_train, (-1, 1))
y_test = np.reshape(y_test, (-1, 1))

In [None]:
# Separate, unfitted scaler instances for the features (X) and the target (Y),
# one pair per scaling strategy.
minScalingX, minScalingY = MinMaxScaler(), MinMaxScaler()
standardScalingX, standardScalingY = StandardScaler(), StandardScaler()

In [None]:
def scaling(x1, x2, y1, y2, scaler_x, scaler_y):
    """Scale features and targets, fitting each scaler on its first input only.

    ``scaler_x`` is fitted on ``x1`` and then applied to ``x2``; ``scaler_y``
    is fitted on ``y1`` and then applied to ``y2``.

    NOTE: this definition replaces the three-argument ``scaling`` defined
    earlier in the notebook once this cell has run.

    Returns:
        Tuple ``(scaled_x1, scaled_x2, scaled_y1, scaled_y2, scaler_x, scaler_y)``.
    """
    scaled = []
    for scaler, fit_part, apply_part in ((scaler_x, x1, x2), (scaler_y, y1, y2)):
        scaled.append(scaler.fit_transform(fit_part))
        scaled.append(scaler.transform(apply_part))
    return (*scaled, scaler_x, scaler_y)

In [None]:
# MinMax-scaled features and targets. This rebinds scaler_x_minmax, a name
# that the classification scaling cells also assign.
x_train3, x_test3, y_train3, y_test3, scaler_x_minmax, scaler_y_minmax = scaling (x_train, x_test, y_train, y_test, minScalingX, minScalingY)

In [None]:
# Standard-scaled features and targets. This rebinds scaler_x_standard as well.
x_train4, x_test4, y_train4, y_test4, scaler_x_standard, scaler_y_standard = scaling (x_train, x_test, y_train, y_test, standardScalingX, standardScalingY)

In [None]:
# LinearRegression trained on the MinMaxScaler() features and targets.
# fit() returns the estimator itself, so both names refer to the fitted model.
history = model = LinearRegression().fit(x_train3, y_train3)
y_pred = model.predict(x_test3)
print(
    f"RMSE : {root_mean_squared_error(y_test3, y_pred)}, "
    f"MSE : {mean_squared_error(y_test3, y_pred)}, "
    f"R2_SCORE : {r2_score(y_test3, y_pred)}"
)

In [None]:
# LinearRegression trained on the StandardScaler features and targets.
# fit() returns the estimator itself, so both names refer to the fitted model.
history = model = LinearRegression().fit(x_train4, y_train4)
y_pred = model.predict(x_test4)
print(
    f"RMSE : {root_mean_squared_error(y_test4, y_pred)}, "
    f"MSE : {mean_squared_error(y_test4, y_pred)}, "
    f"R2_SCORE : {r2_score(y_test4, y_pred)}"
)