a. Adult Census Income dataset: https://www.kaggle.com/datasets/uciml/adult-census-income/ - Predict whether the annual income of the person is >=50K or <50K

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
columns = ["age", "workclass", "fnlwgt", "education", "education_num", "marital_status", "occupation", 
           "relationship", "race", "sex", "capital_gain", "capital_loss", "hours_per_week", "native_country", "income"]
data = pd.read_csv(url, names=columns, na_values=" ?", skipinitialspace=True)

data = data.dropna()

categorical_columns = data.select_dtypes(include=['object']).columns.tolist()
label_encoders = {}
for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

X = data.drop('income', axis=1)
y = data['income']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print(X)
print(y)

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def softmax(x):
    exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))  
    return exp_x / np.sum(exp_x, axis=1, keepdims=True)

input_size = X_train.shape[1]
hidden1_size = 3
hidden2_size = 2
output_size = 2 

np.random.seed(42)
W1 = np.random.randn(input_size, hidden1_size)
b1 = np.zeros((1, hidden1_size))
W2 = np.random.randn(hidden1_size, hidden2_size)
b2 = np.zeros((1, hidden2_size))
W3 = np.random.randn(hidden2_size, output_size)  
b3 = np.zeros((1, output_size))

epochs = 50
learning_rate = 0.01

for epoch in range(epochs):
    z1 = np.dot(X_train, W1) + b1
    a1 = sigmoid(z1)
    z2 = np.dot(a1, W2) + b2
    a2 = sigmoid(z2)
    z3 = np.dot(a2, W3) + b3
    a3 = softmax(z3)  

    error = a3 - np.eye(output_size)[y_train.values]  
    delta3 = error
    delta2 = np.dot(delta3, W3.T) * a2 * (1 - a2)
    delta1 = np.dot(delta2, W2.T) * a1 * (1 - a1)

    W3 -= learning_rate * np.dot(a2.T, delta3)
    b3 -= learning_rate * np.sum(delta3, axis=0, keepdims=True)
    W2 -= learning_rate * np.dot(a1.T, delta2)
    b2 -= learning_rate * np.sum(delta2, axis=0, keepdims=True)
    W1 -= learning_rate * np.dot(X_train.T, delta1)
    b1 -= learning_rate * np.sum(delta1, axis=0, keepdims=True)

    loss = -np.sum(np.log(a3[np.arange(len(a3)), y_train.values])) / len(a3)
    predictions = np.argmax(a3, axis=1)
    accuracy = np.mean(predictions == y_train.values)
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss}, Accuracy: {accuracy}")

z1_test = np.dot(X_test, W1) + b1
a1_test = sigmoid(z1_test)
z2_test = np.dot(a1_test, W2) + b2
a2_test = sigmoid(z2_test)
z3_test = np.dot(a2_test, W3) + b3
a3_test = softmax(z3_test)  

predictions_test = np.argmax(a3_test, axis=1)
count_high_income = np.sum(predictions_test == 1)
count_low_income = np.sum(predictions_test == 0)

print("Number of people predicted to have income >= 50K:", count_high_income)
print("Number of people predicted to have income < 50K:", count_low_income)

accuracy_test = np.mean(predictions_test == y_test.values)
print(f"Test Accuracy: {accuracy_test}")

       age  workclass  fnlwgt  education  education_num  marital_status  \
0       39          7   77516          9             13               4   
1       50          6   83311          9             13               2   
2       38          4  215646         11              9               0   
3       53          4  234721          1              7               2   
4       28          4  338409          9             13               2   
...    ...        ...     ...        ...            ...             ...   
32556   27          4  257302          7             12               2   
32557   40          4  154374         11              9               2   
32558   58          4  151910         11              9               6   
32559   22          4  201490         11              9               4   
32560   52          5  287927         11              9               2   

       occupation  relationship  race  sex  capital_gain  capital_loss  \
0               1        

b. Indian Liver Patient Records dataset: https://www.kaggle.com/datasets/uciml/indian-liver-patient-records - Predict whether person needs to be diagnosed or not?

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00225/Indian%20Liver%20Patient%20Dataset%20(ILPD).csv"
columns = ["age", "gender", "total_bilirubin", "direct_bilirubin", "alkaline_phosphotase",
           "alamine_aminotransferase", "aspartate_aminotransferase", "total_proteins",
           "albumin", "albumin_and_globulin_ratio", "class"]
data = pd.read_csv(url, names=columns)

data = data.dropna()

data['gender'] = data['gender'].map({'Female': 0, 'Male': 1})

data['class'] = data['class'].map({1: 1, 2: 0})  

X = data.drop('class', axis=1)
y = data['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def softmax(x):
    exp_x = np.exp(x - np.max(x, axis=1, keepdims=True)) 
    return exp_x / np.sum(exp_x, axis=1, keepdims=True)

input_size = X_train.shape[1]
hidden1_size = 4
hidden2_size = 3
output_size = 2  

np.random.seed(42)
W1 = np.random.randn(input_size, hidden1_size)
b1 = np.zeros((1, hidden1_size))
W2 = np.random.randn(hidden1_size, hidden2_size)
b2 = np.zeros((1, hidden2_size))
W3 = np.random.randn(hidden2_size, output_size) 
b3 = np.zeros((1, output_size))

epochs = 50
learning_rate = 0.01

for epoch in range(epochs):
    z1 = np.dot(X_train, W1) + b1
    a1 = sigmoid(z1)
    z2 = np.dot(a1, W2) + b2
    a2 = sigmoid(z2)
    z3 = np.dot(a2, W3) + b3
    a3 = softmax(z3)  

    error = a3 - np.eye(output_size)[y_train.values] 
    delta3 = error
    delta2 = np.dot(delta3, W3.T) * a2 * (1 - a2)
    delta1 = np.dot(delta2, W2.T) * a1 * (1 - a1)

    W3 -= learning_rate * np.dot(a2.T, delta3)
    b3 -= learning_rate * np.sum(delta3, axis=0, keepdims=True)
    W2 -= learning_rate * np.dot(a1.T, delta2)
    b2 -= learning_rate * np.sum(delta2, axis=0, keepdims=True)
    W1 -= learning_rate * np.dot(X_train.T, delta1)
    b1 -= learning_rate * np.sum(delta1, axis=0, keepdims=True)

    loss = -np.sum(np.log(a3[np.arange(len(a3)), y_train.values])) / len(a3)
    predictions = np.argmax(a3, axis=1)
    accuracy = np.mean(predictions == y_train.values)
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss}, Accuracy: {accuracy}")

z1_test = np.dot(X_test, W1) + b1
a1_test = sigmoid(z1_test)
z2_test = np.dot(a1_test, W2) + b2
a2_test = sigmoid(z2_test)
z3_test = np.dot(a2_test, W3) + b3
a3_test = softmax(z3_test) 

predictions_test = np.argmax(a3_test, axis=1)

count_liver_patient = np.sum(predictions_test == 1)
count_non_patient = np.sum(predictions_test == 0)

print("Number of liver patients:", count_liver_patient)
print("Number of non-patients:", count_non_patient)

accuracy_test = np.mean(predictions_test == y_test.values)
print(f"Test Accuracy: {accuracy_test}")

Epoch 1/50, Loss: 0.5960136037984728, Accuracy: 0.7432098765432099
Epoch 2/50, Loss: 0.6360378866752362, Accuracy: 0.7432098765432099
Epoch 3/50, Loss: 0.6510690554019846, Accuracy: 0.5876543209876544
Epoch 4/50, Loss: 0.7429335066573597, Accuracy: 0.7432098765432099
Epoch 5/50, Loss: 0.6361733249478516, Accuracy: 0.7308641975308642
Epoch 6/50, Loss: 0.6793555037836688, Accuracy: 0.7432098765432099
Epoch 7/50, Loss: 0.6033245878984416, Accuracy: 0.7432098765432099
Epoch 8/50, Loss: 0.6170804153048269, Accuracy: 0.7432098765432099
Epoch 9/50, Loss: 0.5847097620510424, Accuracy: 0.7432098765432099
Epoch 10/50, Loss: 0.5893521245765484, Accuracy: 0.7432098765432099
Epoch 11/50, Loss: 0.5725645254336691, Accuracy: 0.7432098765432099
Epoch 12/50, Loss: 0.5737357711502369, Accuracy: 0.7432098765432099
Epoch 13/50, Loss: 0.5639292020561114, Accuracy: 0.7432098765432099
Epoch 14/50, Loss: 0.5636589526927909, Accuracy: 0.7432098765432099
Epoch 15/50, Loss: 0.5573034504284666, Accuracy: 0.743209

c. Titanic: Machine Learning from Disaster dataset: https://www.kaggle.com/c/titanic/data - Predict survival on the Titanic

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
data = pd.read_csv(url)

data = data.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

data['Age'].fillna(data['Age'].median(), inplace=True)
data['Embarked'].fillna(data['Embarked'].mode()[0], inplace=True)

le = LabelEncoder()
data['Sex'] = le.fit_transform(data['Sex'])
data['Embarked'] = le.fit_transform(data['Embarked'])

data['Survived'] = data['Survived'].astype(int)

X = data.drop('Survived', axis=1)
y = data['Survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def softmax(x):
    exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))  
    return exp_x / np.sum(exp_x, axis=1, keepdims=True)

input_size = X_train.shape[1]
hidden1_size = 5
hidden2_size = 4
output_size = 2  

np.random.seed(42)
W1 = np.random.randn(input_size, hidden1_size)
b1 = np.zeros((1, hidden1_size))
W2 = np.random.randn(hidden1_size, hidden2_size)
b2 = np.zeros((1, hidden2_size))
W3 = np.random.randn(hidden2_size, output_size)  
b3 = np.zeros((1, output_size))

epochs = 50
learning_rate = 0.01

for epoch in range(epochs):
    z1 = np.dot(X_train, W1) + b1
    a1 = sigmoid(z1)
    z2 = np.dot(a1, W2) + b2
    a2 = sigmoid(z2)
    z3 = np.dot(a2, W3) + b3
    a3 = softmax(z3) 

    error = a3 - np.eye(output_size)[y_train.values] 
    delta3 = error
    delta2 = np.dot(delta3, W3.T) * a2 * (1 - a2)
    delta1 = np.dot(delta2, W2.T) * a1 * (1 - a1)

    W3 -= learning_rate * np.dot(a2.T, delta3)
    b3 -= learning_rate * np.sum(delta3, axis=0, keepdims=True)
    W2 -= learning_rate * np.dot(a1.T, delta2)
    b2 -= learning_rate * np.sum(delta2, axis=0, keepdims=True)
    W1 -= learning_rate * np.dot(X_train.T, delta1)
    b1 -= learning_rate * np.sum(delta1, axis=0, keepdims=True)

    loss = -np.sum(np.log(a3[np.arange(len(a3)), y_train.values])) / len(a3)
    predictions = np.argmax(a3, axis=1)
    accuracy = np.mean(predictions == y_train.values)
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss}, Accuracy: {accuracy}")

z1_test = np.dot(X_test, W1) + b1
a1_test = sigmoid(z1_test)
z2_test = np.dot(a1_test, W2) + b2
a2_test = sigmoid(z2_test)
z3_test = np.dot(a2_test, W3) + b3
a3_test = softmax(z3_test) 

predictions_test = np.argmax(a3_test, axis=1)

count_survived = np.sum(predictions_test == 1)
count_not_survived = np.sum(predictions_test == 0)

print("Number of survivors:", count_survived)
print("Number of non-survivors:", count_not_survived)

accuracy_test = np.mean(predictions_test == y_test.values)
print(f"Test Accuracy: {accuracy_test}")

Epoch 1/50, Loss: 0.6905960872141939, Accuracy: 0.6292134831460674
Epoch 2/50, Loss: 1.2607610094879909, Accuracy: 0.3707865168539326
Epoch 3/50, Loss: 2.9005940072864376, Accuracy: 0.6292134831460674
Epoch 4/50, Loss: 0.6918134282502071, Accuracy: 0.5008025682182986
Epoch 5/50, Loss: 0.8017897053477375, Accuracy: 0.6292134831460674
Epoch 6/50, Loss: 0.9664147778519571, Accuracy: 0.3707865168539326
Epoch 7/50, Loss: 1.4154513842085896, Accuracy: 0.6292134831460674
Epoch 8/50, Loss: 0.813580227176612, Accuracy: 0.3707865168539326
Epoch 9/50, Loss: 1.175945820421995, Accuracy: 0.6292134831460674
Epoch 10/50, Loss: 0.9515572345915776, Accuracy: 0.3707865168539326
Epoch 11/50, Loss: 1.4310529514286678, Accuracy: 0.6292134831460674
Epoch 12/50, Loss: 0.7974459680848754, Accuracy: 0.3707865168539326
Epoch 13/50, Loss: 1.1910192672904818, Accuracy: 0.6292134831460674
Epoch 14/50, Loss: 0.940526214101921, Accuracy: 0.3707865168539326
Epoch 15/50, Loss: 1.4727091354414206, Accuracy: 0.629213483

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Age'].fillna(data['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Embarked'].fillna(data['Embarked'].mode()[0], inplace=True)
