In [23]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Load the raw CSV without treating any row as a header: every row, including
# the file's own header line, is read as data under 13 positional labels (0..12).
df = pd.read_csv("data/train_data_cl.csv", header=None, names=list(range(13))) # uses a relative path
# Bare expression: Jupyter only renders it when it is the last statement of a cell.
df

# Row 0 holds the real header. Take its first 12 values and insert "FirstName"
# at position 4 so that there are 13 labels, one per column that was read.
# NOTE(review): presumably the extra column exists because the name field
# contains a comma and is split in two by the parser -- confirm against the CSV.
df.columns = np.insert(df.iloc[0, :12].values, 4, "FirstName")
# Remove the header row from the data and renumber the index from 0.
df = df.drop(0).reset_index(drop=True)

df

# Join the two name parts back into a single "Name" column, then drop the helper column.
df["Name"] = df["FirstName"] + " " + df["Name"]
df = df.drop("FirstName", axis=1)
df

# Show the raw categories so that invalid entries are visible before cleaning.
print(df['Sex'].unique().tolist())

# Only these two labels are accepted; every other value is treated as invalid.
true_sex_values = ['male', 'female']
mask = ~df['Sex'].isin(true_sex_values)

# Invalid entries are replaced by a label drawn at random, one draw per row.
# A dedicated seeded generator keeps the imputed values identical on every
# run (Restart & Run All) and leaves NumPy's global random state untouched.
sex_rng = np.random.default_rng(42)
df.loc[mask, 'Sex'] = sex_rng.choice(true_sex_values, size=mask.sum())

# After cleaning only the two accepted labels should remain.
print(df['Sex'].unique().tolist())

# Fare: values that cannot be parsed as numbers become NaN and are then set to 0.
df['Fare'] = (
    pd.to_numeric(df['Fare'], errors='coerce')
    .fillna(0)
)
# Parch: keep a value only if its string form is at most one character long;
# anything longer is replaced by 0.
df['Parch'] = df['Parch'].map(lambda value: value if len(str(value)) <= 1 else 0)
df.info()

# Cast every column to its final dtype, grouped by target type.
df = df.astype({
    **dict.fromkeys(['PassengerId', 'Survived', 'Pclass', 'SibSp', 'Parch'], 'int64'),
    **dict.fromkeys(['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], 'string'),
    **dict.fromkeys(['Age', 'Fare'], 'float64'),
})

df.info()

# Remove the free-text columns. Rebinding `df` instead of using inplace=True
# keeps the statement chainable and avoids mutating the frame in place.
df = df.drop(columns=['Name', 'Ticket', 'Cabin'])


# Per-column count and percentage of missing values.
missing = pd.DataFrame({
    'total': df.isnull().sum(),
    'percent': df.isnull().mean() * 100
})

missing

# Fill missing 'Embarked' entries with one observed category drawn at random.
# A single value is drawn and used for all missing rows. The draw comes from a
# seeded generator so the result is the same on every run.
embarked_rng = np.random.default_rng(42)
embarked_choices = list(df['Embarked'].dropna().unique())
df['Embarked'] = df['Embarked'].fillna(value=str(embarked_rng.choice(embarked_choices)))

# 'Fare' was already filled with 0 when it was converted to numeric above;
# this call is kept as a harmless safeguard.
df['Fare'] = df['Fare'].fillna(value=0)

df.isna().sum()


# One encoder instance is refitted for each 'string' column in turn, so after
# the loop it only remembers the classes of the last column it processed.
lbenc = LabelEncoder()

for column in df.columns:
    # Guard clause: columns of any other dtype are left untouched.
    if not (df[column].dtype == 'string'):
        continue
    df[column] = lbenc.fit_transform(df[column])

df.head()

# Work on a copy for the feature matrix. The target is column 1 and the
# features are every column from position 2 onwards (column 0 is not used).
train = df.copy()
y_train = df.iloc[:, 1].to_numpy()
x_train = train.iloc[:, 2:].to_numpy()

# Scale a copy of the features so that `x_train` keeps the unscaled values.
# NOTE(review): the scaler is fitted on all rows here; if the data is split
# into train/test afterwards, test rows influence the scaling range --
# consider fitting on the training portion only.
mnmx = MinMaxScaler()
feature = mnmx.fit_transform(x_train.copy())

print(feature)

['male', 'female', '15', '27', '9', '36.5', '16', '40', '45', '24', '18', '20.5', '8', '26', '3', '1', '19', '21', '36', '22', '48', '49', '35', '39', '6']
['male', 'female']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 712 entries, 0 to 711
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  712 non-null    object 
 1   Survived     712 non-null    object 
 2   Pclass       712 non-null    object 
 3   Name         712 non-null    object 
 4   Sex          712 non-null    object 
 5   Age          712 non-null    object 
 6   SibSp        712 non-null    object 
 7   Parch        712 non-null    object 
 8   Ticket       712 non-null    object 
 9   Fare         712 non-null    float64
 10  Cabin        210 non-null    object 
 11  Embarked     677 non-null    object 
dtypes: float64(1), object(11)
memory usage: 66.9+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 712 entries, 0 to 711
Data c

In [24]:
from sklearn.model_selection import train_test_split

# 'Survived' is the target variable.
X = feature  # Features
# Read the target directly from the cleaned frame instead of from `y_train`.
# The split below rebinds `y_train` to the training subset, so `y = y_train`
# would pick up the already-split labels if this cell were run a second time,
# and the lengths of X and y would no longer match.
y = df['Survived'].values   # Target variable

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shapes of the resulting sets
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (569, 7)
X_test shape: (143, 7)
y_train shape: (569,)
y_test shape: (143,)


In [25]:
from random import randrange

def accuracy_metric(actual, predicted):
    """Return the percentage of positions where ``predicted`` equals ``actual``.

    Args:
        actual: Indexable sequence of true labels; its length is the denominator.
        predicted: Indexable sequence of predicted labels, compared position by
            position with ``actual``. Elements beyond ``len(actual)`` are ignored.

    Returns:
        A float between 0.0 and 100.0. An empty ``actual`` yields 0.0.

    Raises:
        IndexError: If ``predicted`` is shorter than ``actual``.
    """
    total = len(actual)
    if total == 0:
        # Nothing to score: report 0.0 rather than dividing by zero.
        return 0.0
    correct = sum(1 for i in range(total) if actual[i] == predicted[i])
    return correct / float(total) * 100.0

# Evaluate an algorithm on a fixed train/test split (no cross-validation).
# TODO: decide whether cross-validation should be used instead.
def evaluate_algorithm(train_set, test_set, algorithm, *args):
    """Score ``algorithm`` on the training rows and on the test rows.

    Args:
        train_set: Rows whose last element is the label; always passed to
            ``algorithm`` as its training data.
        test_set: Rows whose last element is the label.
        algorithm: Callable invoked as ``algorithm(train_set, rows, *args)``
            that returns one prediction per row of ``rows``.
        *args: Extra positional arguments forwarded to ``algorithm``.

    Returns:
        Tuple ``(accuracy_train, accuracy_test)`` of accuracy percentages.
        ``algorithm`` is called twice: first to predict the training rows,
        then to predict the test rows.
    """
    def score_on(rows):
        # The true label is the last element of every row.
        labels = [sample[-1] for sample in rows]
        return accuracy_metric(labels, algorithm(train_set, rows, *args))

    # Tuple elements are evaluated left to right: training rows first, then test rows.
    return score_on(train_set), score_on(test_set)

def predict(row, weights):
    """Classify one row with a linear threshold unit.

    Args:
        row: Sequence whose last element is skipped (the label slot); all
            preceding elements are used as feature values.
        weights: Sequence where ``weights[0]`` is the bias and
            ``weights[k + 1]`` multiplies feature ``k``.

    Returns:
        ``1.0`` if the weighted sum is greater than or equal to zero, else ``0.0``.
    """
    activation = weights[0]
    # row[:-1] drops the trailing label so that only features contribute.
    for position, value in enumerate(row[:-1]):
        activation += weights[position + 1] * value
    if activation >= 0.0:
        return 1.0
    return 0.0

def train_weights(train, l_rate, n_epoch):
    """Estimate perceptron weights with per-row (stochastic) updates.

    Args:
        train: Sequence of rows; the last element of each row is the label and
            the preceding elements are the features.
        l_rate: Step size applied to every correction.
        n_epoch: Number of full passes over ``train``.

    Returns:
        List of weights with one entry per row element: index 0 is the bias,
        index ``j + 1`` belongs to feature ``j``.

    Side effects:
        Prints the training accuracy after every epoch.
    """
    weights = [0.0] * len(train[0])
    for epoch_idx in range(n_epoch):
        for sample in train:
            # Correction = learning rate times (label - current prediction).
            step = l_rate * (sample[-1] - predict(sample, weights))
            weights[0] = weights[0] + step
            for j in range(len(sample) - 1):
                weights[j + 1] = weights[j + 1] + step * sample[j]

        # Report accuracy on the training rows with the weights reached so far.
        epoch_labels = [sample[-1] for sample in train]
        epoch_predictions = [predict(sample, weights) for sample in train]
        epoch_accuracy = accuracy_metric(epoch_labels, epoch_predictions)
        print(f'Epoch {epoch_idx + 1}/{n_epoch}: Training Accuracy = {epoch_accuracy:.2f}%')

    return weights

def perceptron(train, test, l_rate, n_epoch):
    """Train a perceptron on ``train`` and predict every row of ``test``.

    Args:
        train: Rows used to fit the weights (last element of each row is the label).
        test: Rows to classify.
        l_rate: Learning rate forwarded to ``train_weights``.
        n_epoch: Number of epochs forwarded to ``train_weights``.

    Returns:
        List with one prediction per row of ``test``, in order.
    """
    fitted_weights = train_weights(train, l_rate, n_epoch)
    return [predict(sample, fitted_weights) for sample in test]


In [26]:
# Build the row format the perceptron functions expect: each row is the
# feature vector with its label appended as the last element.
train_dataset = [list(features) + [y_train[idx]] for idx, features in enumerate(X_train)]
test_dataset = [list(features) + [y_test[idx]] for idx, features in enumerate(X_test)]

# Hyperparameters: learning rate and number of epochs.
learning_rate = 0.01
epochs = 100

# Train on the training rows and score both the training and the test rows
# (single fixed split, no cross-validation).
accuracy_train, accuracy_test = evaluate_algorithm(
    train_dataset,
    test_dataset,
    perceptron,
    learning_rate,
    epochs,
)

# Report both accuracy scores.
print(f'Training Accuracy: {accuracy_train:.2f}%')
print(f'Testing Accuracy: {accuracy_test:.2f}%')


Epoch 1/100: Training Accuracy = 66.08%
Epoch 2/100: Training Accuracy = 71.70%
Epoch 3/100: Training Accuracy = 81.20%
Epoch 4/100: Training Accuracy = 79.26%
Epoch 5/100: Training Accuracy = 79.44%
Epoch 6/100: Training Accuracy = 62.39%
Epoch 7/100: Training Accuracy = 81.02%
Epoch 8/100: Training Accuracy = 79.26%
Epoch 9/100: Training Accuracy = 81.20%
Epoch 10/100: Training Accuracy = 72.06%
Epoch 11/100: Training Accuracy = 81.02%
Epoch 12/100: Training Accuracy = 60.98%
Epoch 13/100: Training Accuracy = 80.67%
Epoch 14/100: Training Accuracy = 81.20%
Epoch 15/100: Training Accuracy = 82.60%
Epoch 16/100: Training Accuracy = 80.14%
Epoch 17/100: Training Accuracy = 81.20%
Epoch 18/100: Training Accuracy = 81.90%
Epoch 19/100: Training Accuracy = 80.67%
Epoch 20/100: Training Accuracy = 70.12%
Epoch 21/100: Training Accuracy = 78.56%
Epoch 22/100: Training Accuracy = 80.32%
Epoch 23/100: Training Accuracy = 80.14%
Epoch 24/100: Training Accuracy = 81.37%
Epoch 25/100: Training Ac