In [71]:
import random
import pandas as pd
random.seed(69)
dataset = pd.DataFrame(columns=[0, 1])
for i in range(30):
    new_value = ([random.randint(0, 10), random.randint(100, 200)])
    new_row = pd.DataFrame([new_value], columns=dataset.columns)
    dataset = pd.concat([dataset, new_row], ignore_index=True)

In [72]:
print(dataset[:5])
print(f"number of rows: {dataset.shape[0]}")
print(f"number of columns: {dataset.shape[1]}")
print(f"range of each features value: {dataset[0].min()} - {dataset[0].max()}")

    0    1
0  10  104
1   1  121
2   1  177
3   5  141
4   8  153
number of rows: 30
number of columns: 2
range of each features value: 0 - 10


In [73]:
def split_dataset(data, train_ratio):
    train_size = int(len(data) * train_ratio)
    train_data = data[:train_size]
    test_data = data[train_size:]
    return train_data, test_data

ratios = [0.7, 0.8, 0.9]
for ratio in ratios:
    train_data, test_data = split_dataset(dataset, ratio)
    print(f"train size: {train_data.shape[0]}, test size: {test_data.shape[0]}")
    print(f"train data: {train_data[:5]}")
    print(f"test data: {test_data[:5]}")
    print("")

train size: 21, test size: 9
train data:     0    1
0  10  104
1   1  121
2   1  177
3   5  141
4   8  153
test data:     0    1
21  0  108
22  1  127
23  0  118
24  3  131
25  4  119

train size: 24, test size: 6
train data:     0    1
0  10  104
1   1  121
2   1  177
3   5  141
4   8  153
test data:     0    1
24  3  131
25  4  119
26  7  180
27  1  189
28  5  181

train size: 27, test size: 3
train data:     0    1
0  10  104
1   1  121
2   1  177
3   5  141
4   8  153
test data:     0    1
27  1  189
28  5  181
29  4  168



In [78]:
import pandas as pd

data = dataset

X = data[0].tolist()
y = data[1].tolist()

def train_test_split(X, y, test_size):
    split_index = int(len(X) * (1 - test_size))
    return X[:split_index], X[split_index:], y[:split_index], y[split_index:]

def calculate_coefficients(X, y):
    n = len(X)
    mean_x, mean_y = sum(X) / n, sum(y) / n
    SS_xy = sum((y[i] * X[i]) - n * mean_y * mean_x for i in range(n))
    SS_xx = sum((X[i] * X[i])  - n * mean_y * mean_x for i in range(n))
    w1 = SS_xy / SS_xx
    w0 = mean_y - w1 * mean_x
    return w1, w0

def predict(X, w1, w0):
    return [w1 * x + w0 for x in X]

def mean_squared_error(y_true, y_pred):
    return sum((y_true[i] - y_pred[i]) ** 2 for i in range(len(y_true))) / len(y_true)

splits = [0.7, 0.8, 0.9]
best_split = None
best_mse = float('inf')

for split in splits:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split)
    w1, w0 = calculate_coefficients(X_train, y_train)
    y_pred = predict(X_test, w1, w0)
    mse = mean_squared_error(y_test, y_pred)
    print(f"Test size: {split}, MSE: {mse}")
    if mse < best_mse:
        best_mse = mse
        best_split = split

print(f"Best split: {best_split}, Best MSE: {best_mse}")

Test size: 0.7, MSE: 716.8159248495953
Test size: 0.8, MSE: 665.6828932181766
Test size: 0.9, MSE: 1777.790793404864
Best split: 0.8, Best MSE: 665.6828932181766
