In [None]:
import sklearn

# Environment check: show which sklearn installation the kernel picked up.
print(sklearn.__file__)

# Read the package's __init__ once, inside a context manager, so the file
# handle is closed deterministically instead of being left to the GC.
with open(sklearn.__file__) as sklearn_init:
    sklearn_init_lines = sklearn_init.readlines()

print(sklearn_init_lines[:2])
# The marker must appear within the first 100 characters of the file.
assert "yifrach was here" in "".join(sklearn_init_lines)[:100], """This is not part of the exercise,
so if this assertion fails, then please send Yifrach the print ASAP"""

# Generating the data

In [None]:
import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt
# Noisy samples of sin(x) on [0, 30); the 2e-6 step yields 15 million rows.
x = np.arange(0, 30, 0.000002)

# Seeded generator so the noise, and every score derived from it, is the
# same on each Restart & Run All (the global RNG was previously unseeded).
rng = np.random.default_rng(0)
y = np.sin(x) + rng.normal(size=x.shape[0], scale=0.4)

data = pd.DataFrame({'x': x, 'y': y})

# Very low alpha so the dense point cloud reads as a density band; the red
# curve is the noise-free sin(x), drawn on the same axes.
data.plot.scatter('x', 'y', alpha=0.005)
plt.plot(x, np.sin(x), color='red')

# Feature engineering

In [None]:

import math

k = 6  # exclusive upper bound: the features are powers 1 .. k-1
features = [f'x{i}' for i in range(1, k)]

# Fold x by 2*PI once and reuse the result for every power; it does not
# depend on the exponent, so recomputing it per column was wasted work.
x_mod_period = data['x'] % (math.pi * 2)
for power, feature in enumerate(features, start=1):
    data[feature] = x_mod_period ** power
del x_mod_period  # full-length temporary, no longer needed


# Construct a linear model

In [None]:
from sklearn import linear_model

# Linear regression of y on the engineered feature columns. The target is
# handed over as a one-column DataFrame (double brackets), not as a Series.
reg = linear_model.LinearRegression()
reg.fit(X=data[features], y=data[['y']])

# Sanity check the model

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Overlay the fitted curve on the noise-free target. Labels and a legend are
# needed because the two lines are otherwise indistinguishable.
fig, ax = plt.subplots()
ax.plot(data['x'], reg.predict(data[features]), label='model prediction')
ax.plot(data['x'], np.sin(data['x']), label='sin(x)')
ax.set_xlabel('x')
ax.set_ylabel('y')
# Fixed location: the default loc='best' search is slow with this many points.
ax.legend(loc='upper right');

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import linear_model

# Assert that the score improves as we increase the train set size from 20 to 65 (in steps of 15)

In [None]:
# Fixed-seed subsample of 100,000 rows of `data`, identical on every run.
data_sample = data.sample(
    random_state=123,
    n=100_000,
)

In [None]:
def get_mean_score(train_size: int, n_repeats: int = 100) -> float:
    """Mean test-set score of a linear model over repeated random splits.

    Each repetition splits the module-level ``data_sample`` with
    ``train_test_split``, fits a fresh ``LinearRegression`` on the
    ``features`` columns of the training part and scores it on the rest.

    Args:
        train_size: Forwarded to ``train_test_split`` as ``train_size``.
        n_repeats: Number of random splits to average over. Defaults to 100,
            the count that used to be hard-coded.

    Returns:
        The mean of the ``n_repeats`` test-set scores.

    Raises:
        ValueError: If ``n_repeats`` is smaller than 1 (the mean of zero
            scores would be NaN).
    """
    if n_repeats < 1:
        raise ValueError(f"n_repeats must be at least 1, got {n_repeats}")

    scores = []
    for repeat in range(n_repeats):
        X_train, X_test, y_train, y_test = train_test_split(
            data_sample,
            data_sample['y'],
            train_size=train_size,
            # Reproducible seed per repetition. Seeds of different train
            # sizes stay disjoint only while n_repeats <= 100.
            random_state=train_size * 100 + repeat,
        )
        model = linear_model.LinearRegression()
        model.fit(X_train[features], y_train)
        scores.append(model.score(X_test[features], y_test))
    return np.mean(scores)

In [None]:
import joblib

In [None]:
# One job per training-set size (20, 35, 50, 65), spread over 4 loky workers.
train_sizes = range(20, 80, 15)
score_jobs = (joblib.delayed(get_mean_score)(size) for size in train_sizes)
mean_scores = joblib.Parallel(n_jobs=4, backend='loky')(score_jobs)

In [None]:
# On average, a larger training set should give better scores on the test
# data: after rounding to 3 decimals the mean scores must never decrease.
rounded_scores = pd.Series(mean_scores).round(3)
assert rounded_scores.is_monotonic_increasing

# Try learning from the range [0, PI] to the range [PI, 2 * PI]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data, data['y'], train_size=0.8, random_state=123)

# Let's see if we can learn from the range [0, PI] to the range [PI, 2*PI]:
# rows whose x modulo 2*PI lies in [PI, 2*PI] are removed from the training
# split, and the test split is restricted to exactly those rows.
rows_to_drop_from_train = data.index[(data["x"] % (2 * math.pi)).between(math.pi, 2 * math.pi)]

# errors='ignore' because many of these labels belong to the test split and
# are therefore absent from the training frames.
X_train = X_train.drop(labels=rows_to_drop_from_train, errors='ignore')
y_train = y_train.drop(labels=rows_to_drop_from_train, errors='ignore')

# X_test and y_test come from the same split and share one index, so a
# single intersection serves both.
test_rows = X_test.index.intersection(rows_to_drop_from_train)
X_test = X_test.loc[test_rows]
y_test = y_test.loc[test_rows]

In [None]:
# Fresh estimator, fitted on the current training split only.
reg = linear_model.LinearRegression()
reg.fit(X=X_train[features], y=y_train)

In [None]:
# A model that only saw values in the range [0, PI] during training cannot be
# expected to generalize to the range [PI, 2 * PI], so the score on the test
# set should be really bad, i.e. negative.
held_out_score = reg.score(X=X_test[features], y=y_test)
assert held_out_score < 0