# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import recall_score, f1_score, accuracy_score, mean_squared_error, confusion_matrix, precision_score, roc_auc_score, root_mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression

# Read Data

In [None]:
data_train = pd.read_csv("Data/train.csv")
data_test = pd.read_csv("Data/test.csv")

In [None]:
data_train


In [None]:
data_out = data_train.pop("Survived")
data_train["Survived"] = data_out
data_train

In [None]:
data_train = data_train.drop(columns=["Name"])

In [None]:
data_train

# EDA

In [None]:
# Column dtypes of the training frame.
data_train.dtypes

In [None]:
# Summary statistics as produced by DataFrame.describe().
data_train.describe()

In [None]:
# (rows, columns) of the training frame.
data_train.shape

In [None]:
for column in data_train.columns[:-1]:
    sns.histplot(data=data_train, x=column)
    plt.show()

In [None]:
data_train.columns

In [None]:
# Box plots summarise numeric distributions, so object-typed columns are
# skipped (same dtype filter as the scatter-plot and skewness cells).
# The last column (the target) is excluded.
for column in data_train.columns[:-1]:
    if data_train[column].dtype != "object":
        sns.boxplot(data=data_train, x=column)
        plt.show()

In [None]:
# Scatter each non-object feature against the target column.
numeric_features = [
    name for name in data_train.columns[:-1]
    if data_train[name].dtype != "object"
]
for name in numeric_features:
    sns.scatterplot(x=name, y="Survived", data=data_train)
    plt.show()

In [None]:
# Report skewness and kurtosis for every non-object feature column.
for name in data_train.columns[:-1]:
    series = data_train[name]
    if series.dtype == "object":
        continue
    print(f"Column {name}, Skewness: {series.skew()}, Kurtosis: {series.kurt()}")

# Data Preprocessing

In [None]:
data_train.isna()

In [None]:
data_train.isna().sum()

In [None]:
missing_value_percentage = data_train.isna().sum() / len(data_train) * 100
print (missing_value_percentage)

In [None]:
for key, value in missing_value_percentage.items():
    if (value > 50):
        data_train = data_train.drop(columns=key, axis=1)

In [None]:
data_train

In [None]:
data_train = data_train.drop(columns="Ticket")

In [None]:
data_train

In [None]:
data_train.isna().sum()

In [None]:
for column in data_train.columns:
    if (data_train[column].isna().any()):
        if (data_train[column].dtype == "object"):
            data_train[column] = data_train[column].fillna(data_train[column].mode()[0])
        else:
            data_train[column] = data_train[column].fillna(data_train[column].median())

In [None]:
data_train.isna().sum()

In [None]:
data_train

# Modelling

In [None]:
data_train.dtypes

In [None]:
encoder = LabelEncoder()
encoded = {}

for column in data_train.columns:
    if (data_train[column].dtype == "object"):
        data_train[column] = encoder.fit_transform(data_train[column])
        encoded[column] = {i:class_name for i, class_name in enumerate(encoder.classes_)}

In [None]:
print (encoded)

In [None]:
for key, value in encoded.items():
    print (f"Key: {key} => Value {value}")

In [None]:
data_train.dtypes

In [None]:
sns.heatmap(data_train[:-1].corr(), annot=True, cmap='coolwarm')
plt.show

In [None]:
sns.pairplot(data=data_train)

In [None]:
x = data_train[data_train.columns[:-1]]
x

In [None]:
y = data_train[data_train.columns[-1]]
y

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [None]:
x_train

In [None]:
y_train

In [None]:
model = LinearRegression()
model2 = LogisticRegression()
model.fit(X=x_train, y=y_train)
model2.fit(X=x_train, y=y_train)

In [None]:
y_pred = model.predict(x_test)
y_pred2 = model2.predict(x_test)

# Evaluation

In [None]:
# Ground-truth labels of the held-out split.
y_test

In [None]:
# Class predictions from model2 (the LogisticRegression) on x_test.
y_pred2

In [None]:
# Evaluate the logistic-regression predictions.
# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy and MSE
# are symmetric, but precision and recall swap places if the order is reversed.
mse = mean_squared_error(y_test, y_pred2)
a = accuracy_score(y_test, y_pred2)
p = precision_score(y_test, y_pred2)
r = recall_score(y_test, y_pred2)

# ROC AUC ranks samples by score, so use the positive-class probability
# rather than the thresholded 0/1 labels.
y_score2 = model2.predict_proba(x_test)[:, 1]
roc = roc_auc_score(y_test, y_score2)

print(f"MSE: {mse}")
print(f"Accuracy: {a}")
print(f"Precision: {p}")
print(f"Recall: {r}")
print(f"ROC AUC Score: {roc}")


In [None]:
# Evaluate the linear-regression outputs.
# Arguments are (y_true, y_pred): R^2 is not symmetric, so the order matters.
# RMSE gets its own name so it does not overwrite the accuracy stored in `a`.
mse = mean_squared_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R2 Score: {r2}")

In [None]:
# Predicted vs. actual values, with the y = x reference line spanning the
# range of the full target column.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred, color='green', label='Data Points')
lo, hi = y.min(), y.max()
ax.plot([lo, hi], [lo, hi], color='red', lw=2, label="Ideal Line (y = x)")
ax.set_xlabel('Actual Values')
ax.set_ylabel('Predicted Values')
ax.set_title('Predicted Values vs Actual Values')
ax.legend()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Uses y_test and y_pred produced by the prediction cell.
plt.figure(figsize=(8, 6))

# Actual values in blue (plotted against themselves, so they lie on y = x).
plt.scatter(y_test, y_test, color='blue', label='Actual Values', alpha=0.8)

# Predicted values in green, so they stay distinguishable from the red
# ideal line drawn below.
plt.scatter(y_test, y_pred, color='green', label='Predicted Values', alpha=0.6)

# Reference line y = x (perfect prediction).
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', lw=2, label="Ideal Line (y = x)")

# Axis labels, title and legend.
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Predicted vs Actual Values with Different Colors')
plt.legend()
plt.show()


In [None]:
# Actual and predicted values drawn as two line series over the sample position.
fig, ax = plt.subplots(figsize=(8, 6))

# Actual values.
ax.plot(y_test.values, label='Actual Values', color='blue', marker='o')

# Predicted values.
ax.plot(y_pred, label='Predicted Values', color='green', marker='x')

# Axis labels, title and legend.
ax.set_xlabel('Index')
ax.set_ylabel('Values')
ax.set_title('Actual vs Predicted Values (Line Plot)')
ax.legend()
plt.show()


In [None]:
# Residuals: difference between the actual and the predicted values.
residuals = y_test - y_pred

fig, ax = plt.subplots(figsize=(8, 6))

# Residuals against the actual values.
ax.scatter(y_test, residuals, color='orange', alpha=0.6)

# Horizontal reference at zero (no error).
ax.axhline(y=0, color='red', linestyle='--')

# Axis labels and title.
ax.set_xlabel('Actual Values')
ax.set_ylabel('Residuals (Actual - Predicted)')
ax.set_title('Residual Plot')
plt.show()


In [None]:
# Distribution of the residuals computed in the residual-plot cell.
fig, ax = plt.subplots(figsize=(8, 6))

# 30-bin histogram of the residuals.
ax.hist(residuals, bins=30, color='purple', edgecolor='black', alpha=0.7)

# Axis labels and title.
ax.set_xlabel('Residuals (Actual - Predicted)')
ax.set_ylabel('Frequency')
ax.set_title('Histogram of Residuals')
plt.show()
