In [1]:
!pip install river



In [2]:
%load_ext autoreload
%autoreload 2


In [5]:
from river import compose
from river import linear_model
from river import metrics
from river import evaluate
from river import preprocessing
from river import optim

from pprint import pprint
from river import datasets

dataset = datasets.Bikes()


model = compose.Select('clouds', 'humidity', 'pressure', 'temperature', 'wind')
model |= preprocessing.StandardScaler()
model |= linear_model.LinearRegression(optimizer=optim.SGD(0.001))

metric = metrics.MAE()

evaluate.progressive_val_score(dataset, model, metric, print_every=20_000)


[20,000] MAE: 4.912727
[40,000] MAE: 5.333554
[60,000] MAE: 5.330948
[80,000] MAE: 5.392313
[100,000] MAE: 5.423059
[120,000] MAE: 5.541223
[140,000] MAE: 5.613023
[160,000] MAE: 5.622428
[180,000] MAE: 5.567824


MAE: 5.563893

What I learnt and concepts I don't understand:
- compose.Select: The parameters are put into a set and the columns of the features included in the set will be extracted
- preprocessing.StandardScaler: Each object in the class will take 4 parameters (A boolean to check if it is neccessary to get std equal 1, a counter to keep track of the number of datapoints learnt, a mean and a variance). Then learn_one will update the mean and the variance (if the boolean is True) while learn_many will update the mean and variance of each feature using itertool_zip
- linear reg: There are severals things I don't understand
    + Intercept_init and the _super() part in the class?
    + self.loss.mean_func(self._raw_dot_one(x)). Parameters that the loss function takes? Mean_func?
    + No learn_one function?
    
- MAE : Mean Absolute Error
- Progressive_val_score:
    + Metric in accordance with model?
    + utils.inspect?
    + next_checkpoint?
    
- Simualte_qa: Show datapoints in order of arrival




In [8]:
next(iter(dataset))

({'moment': datetime.datetime(2016, 4, 1, 0, 0, 7),
  'station': 'metro-canal-du-midi',
  'clouds': 75,
  'description': 'light rain',
  'humidity': 81,
  'pressure': 1017.0,
  'temperature': 6.54,
  'wind': 9.3},
 1)

In [6]:
from river import feature_extraction
from river import stats

def get_hour(x):
    x['hour'] = x['moment'].hour
    return x

model = compose.Select('clouds', 'humidity', 'pressure', 'temperature', 'wind')
model += (
    get_hour |
    feature_extraction.TargetAgg(by=['station', 'hour'], how=stats.Mean())
)
model |= preprocessing.StandardScaler()
model |= linear_model.LinearRegression(optimizer=optim.SGD(0.001))

metric = metrics.MAE()

evaluate.progressive_val_score(dataset, model, metric, print_every=20_000)


[20,000] MAE: 3.721246
[40,000] MAE: 3.829972
[60,000] MAE: 3.845068
[80,000] MAE: 3.910259
[100,000] MAE: 3.888652
[120,000] MAE: 3.923727
[140,000] MAE: 3.980953
[160,000] MAE: 3.950034
[180,000] MAE: 3.934545


MAE: 3.933498

In [9]:
from pprint import pprint
from river import datasets

dataset = datasets.Bikes()

def get_hour(x):
    x['hour'] = x['moment'].hour
    return x


print(type(dataset))
cache_weather = set()
for x, y in dataset:
    if x['description'] not in cache_weather:
        cache_weather.add(x['description'])
print(cache_weather)
print(len(cache_weather))
weather_list = list(cache_weather)
print(weather_list)


def get_weather(x):
    if x['description'] in {'light intensity drizzle rain', 'very heavy rain', 'moderate rain', 'drizzle', 'thunderstorm with heavy rain', 'heavy intensity rain', 'mist', 'overcast clouds', 'light intensity drizzle', 'thunderstorm with light rain', 'thunderstorm', 'thunderstorm with rain', 'light rain'}:
        x['rain'] = 2
    else:
        x['rain'] = 1
    return x

cache_station = set()
for x, y in dataset:
    if x['station'] not in cache_station:
        cache_station.add(x['station'])
# print(cache_station)
# print(len(cache_station))
station_list = list(cache_station)
# print(station_list)

def get_station(x):
    for station in station_list:
        if x['station'] == station:
            x[station] = 1
        else:
            x[station] = 0
    return x

def add_feature(x):
    get_weather(x)
    return x

model = add_feature
model |= compose.Discard('station', 'description', 'moment')
model += (
    get_hour |
    feature_extraction.TargetAgg(by=['station', 'hour'], how=stats.Mean())
)

model |= preprocessing.StandardScaler()
model |= linear_model.LinearRegression(optimizer=optim.SGD(0.001))

metric = metrics.MAE()

evaluate.progressive_val_score(dataset, model, metric, print_every=20_000)

<class 'river.datasets.bikes.Bikes'>
{'light intensity drizzle rain', 'scattered clouds', 'broken clouds', 'overcast clouds', 'mist', 'moderate rain', 'clear sky', 'light rain', 'light intensity drizzle', 'thunderstorm with heavy rain', 'very heavy rain', 'drizzle', 'Sky is Clear', 'heavy intensity rain', 'few clouds', 'thunderstorm with rain', 'thunderstorm with light rain', 'thunderstorm'}
18
['light intensity drizzle rain', 'scattered clouds', 'broken clouds', 'overcast clouds', 'mist', 'moderate rain', 'clear sky', 'light rain', 'light intensity drizzle', 'thunderstorm with heavy rain', 'very heavy rain', 'drizzle', 'Sky is Clear', 'heavy intensity rain', 'few clouds', 'thunderstorm with rain', 'thunderstorm with light rain', 'thunderstorm']
[20,000] MAE: 3.723647
[40,000] MAE: 3.831113
[60,000] MAE: 3.847153
[80,000] MAE: 3.911021
[100,000] MAE: 3.888753
[120,000] MAE: 3.924125
[140,000] MAE: 3.981258
[160,000] MAE: 3.949619
[180,000] MAE: 3.93391


MAE: 3.932767

In [None]:
next(iter(dataset))

In [None]:
print(model.debug_one(x))

We want to make use of the weather feature so a naive method to convert classified values into numbers is to do seperate the classes into 2 and assign 1 to a group and 0 to the other. Here, there are 18 'description' classes and I classified them by rain and not rain.
By adding the feature weather, we see that the error decrease a little.

In [10]:
from pprint import pprint
from river import datasets

dataset = datasets.Bikes()

def get_hour(x):
    x['hour'] = x['moment'].hour
    return x


print(type(dataset))
cache_weather = set()
for x, y in dataset:
    if x['description'] not in cache_weather:
        cache_weather.add(x['description'])
# print(cache_weather)
# print(len(cache_weather))
weather_list = list(cache_weather)
print(weather_list)


def get_weather(x):
    if x['description'] in {'light intensity drizzle rain', 'very heavy rain', 'moderate rain', 'drizzle', 'thunderstorm with heavy rain', 'heavy intensity rain', 'mist', 'overcast clouds', 'light intensity drizzle', 'thunderstorm with light rain', 'thunderstorm', 'thunderstorm with rain', 'light rain'}:
        x['rain'] = 2
    else:
        x['rain'] = 1
    return x

cache_station = set()
for x, y in dataset:
    if x['station'] not in cache_station:
        cache_station.add(x['station'])
# print(cache_station)
# print(len(cache_station))
station_list = list(cache_station)
# print(station_list)

def get_station(x):
    for station in station_list:
        if x['station'] == station:
            x[station] = 1
        else:
            x[station] = 0
    return x

def add_feature(x):
    get_weather(x)
    get_station(x)
    return x

model = add_feature
model |= compose.Discard('station', 'description', 'moment')
model += (
    get_hour |
    feature_extraction.TargetAgg(by=['station', 'hour'], how=stats.Mean())
)

model |= preprocessing.StandardScaler()
model |= linear_model.LinearRegression(optimizer=optim.SGD(0.001))

metric = metrics.MAE() 

evaluate.progressive_val_score(dataset, model, metric, print_every=20_000)

<class 'river.datasets.bikes.Bikes'>
['light intensity drizzle rain', 'scattered clouds', 'broken clouds', 'overcast clouds', 'mist', 'moderate rain', 'clear sky', 'light rain', 'light intensity drizzle', 'thunderstorm with heavy rain', 'very heavy rain', 'drizzle', 'Sky is Clear', 'heavy intensity rain', 'few clouds', 'thunderstorm with rain', 'thunderstorm with light rain', 'thunderstorm']
[20,000] MAE: 2.828958
[40,000] MAE: 2.920016
[60,000] MAE: 3.001646
[80,000] MAE: 3.097058
[100,000] MAE: 3.12772
[120,000] MAE: 3.126179
[140,000] MAE: 3.127185
[160,000] MAE: 3.126707
[180,000] MAE: 3.138914


MAE: 3.139136

In [11]:
model

Since there are only 5 stations and they share no common features, I set 5 new features and assign 1 if the location it is in x
and the other 4 get 0.
By adding the location feature, we see that the error get much lower.

In [None]:
import itertools

model = compose.Select('clouds', 'humidity', 'pressure', 'temperature', 'wind')
model += (
    get_hour |
    feature_extraction.TargetAgg(by=['station', 'hour'], how=stats.Mean())
)
model |= preprocessing.StandardScaler()
model |= linear_model.LinearRegression()

for x, y in itertools.islice(dataset, 10000):
    y_pred = model.predict_one(x)
    model.learn_one(x, y)

x, y = next(iter(dataset))
print(model.debug_one(x))


In [None]:
import datetime as dt

evaluate.progressive_val_score(
    dataset=dataset,
    model=model.clone(),
    metric=metrics.MAE(),
    moment='moment',
    delay=dt.timedelta(minutes=30),
    print_every=20_000
)


R2 score:
- Coefficient of determination: Basic idea is to see the ratio between square error of the line and square error of the labels. The smaller the square error of the line is, the better the model.
- Update by first updating the variance, then the square error of the line
    + sample_weight?
    




In [13]:
model = compose.Select('clouds', 'humidity', 'pressure', 'temperature', 'wind')
model |= preprocessing.StandardScaler()
model |= linear_model.LinearRegression(optimizer=optim.SGD(0.001))

metric = metrics.R2() + metrics.RMSE() + metrics.MAE()

evaluate.progressive_val_score(dataset, model, metric, print_every=20_000)


[20,000] R2: 0.389708, RMSE: 6.578071, MAE: 4.912727
[40,000] R2: 0.368931, RMSE: 6.953025, MAE: 5.333554
[60,000] R2: 0.364829, RMSE: 6.940436, MAE: 5.330948
[80,000] R2: 0.356374, RMSE: 7.047605, MAE: 5.392313
[100,000] R2: 0.352245, RMSE: 7.072087, MAE: 5.423059
[120,000] R2: 0.344607, RMSE: 7.176361, MAE: 5.541223
[140,000] R2: 0.336937, RMSE: 7.248915, MAE: 5.613023
[160,000] R2: 0.336293, RMSE: 7.28009, MAE: 5.622428
[180,000] R2: 0.341127, RMSE: 7.251344, MAE: 5.567824


R2: 0.341716, RMSE: 7.247396, MAE: 5.563893

With out the existence of time feature, the metric score is quite low (far from 1).

In [None]:
model = compose.Select('clouds', 'humidity', 'pressure', 'temperature', 'wind')
model |= preprocessing.StandardScaler()
model |= linear_model.LinearRegression(optimizer=optim.SGD(0.001))

metric = metrics.MSE()

evaluate.progressive_val_score(dataset, model, metric, print_every=20_000)

The RMSE increases as we add datapoints which means we should add features or extract polynomial degree features. 

In [None]:
from river import feature_extraction
model = compose.Select('clouds', 'humidity', 'pressure', 'temperature', 'wind')
model |= feature_extraction.PolynomialExtender(interaction_only=True)
model |= preprocessing.StandardScaler()
model |= linear_model.LinearRegression(optimizer=optim.SGD(0.001))

metric1 = metrics.MAE()
metric2 = metrics.RMSE()

evaluate.progressive_val_score(dataset, model, metric1, print_every=20_000)
evaluate.progressive_val_score(dataset, model, metric2, print_every=20_000)

Adding the polynomial features help decrease both RMSE and R2.

In [14]:
from pprint import pprint
from river import datasets

dataset = datasets.Bikes()

def get_hour(x):
    x['hour'] = x['moment'].hour
    return x


print(type(dataset))
cache_weather = set()
for x, y in dataset:
    if x['description'] not in cache_weather:
        cache_weather.add(x['description'])
# print(cache_weather)
# print(len(cache_weather))
weather_list = list(cache_weather)
print(weather_list)


def get_weather(x):
    if x['description'] in {'light intensity drizzle rain', 'very heavy rain', 'moderate rain', 'drizzle', 'thunderstorm with heavy rain', 'heavy intensity rain', 'mist', 'overcast clouds', 'light intensity drizzle', 'thunderstorm with light rain', 'thunderstorm', 'thunderstorm with rain', 'light rain'}:
        x['rain'] = 2
    else:
        x['rain'] = 1
    return x

cache_station = set()
for x, y in dataset:
    if x['station'] not in cache_station:
        cache_station.add(x['station'])
# print(cache_station)
# print(len(cache_station))
station_list = list(cache_station)
# print(station_list)

def get_station(x):
    for station in station_list:
        if x['station'] == station:
            x[station] = 1
        else:
            x[station] = 0
    return x

def add_feature(x):
    get_weather(x)
    get_station(x)
    return x

model = add_feature
model |= compose.Discard('station', 'description', 'moment')
model += (
    get_hour |
    feature_extraction.TargetAgg(by=['station', 'hour'], how=stats.Mean())
)
model |= feature_extraction.PolynomialExtender(interaction_only=True)
model |= preprocessing.StandardScaler()
model |= linear_model.LinearRegression(optimizer=optim.SGD(0.001))

metric = metrics.MAE() + metrics.R2()

evaluate.progressive_val_score(dataset, model, metric, print_every=20_000)

<class 'river.datasets.bikes.Bikes'>
['light intensity drizzle rain', 'scattered clouds', 'broken clouds', 'overcast clouds', 'mist', 'moderate rain', 'clear sky', 'light rain', 'light intensity drizzle', 'thunderstorm with heavy rain', 'very heavy rain', 'drizzle', 'Sky is Clear', 'heavy intensity rain', 'few clouds', 'thunderstorm with rain', 'thunderstorm with light rain', 'thunderstorm']
[20,000] MAE: 1.645678, R2: 0.908382
[40,000] MAE: 1.657413, R2: 0.924473
[60,000] MAE: 1.68982, R2: 0.923312
[80,000] MAE: 1.766984, R2: 0.919163
[100,000] MAE: 1.824618, R2: 0.914432
[120,000] MAE: 1.843591, R2: 0.915107
[140,000] MAE: 1.870782, R2: 0.91404
[160,000] MAE: 1.896705, R2: 0.912455
[180,000] MAE: 1.913381, R2: 0.910911


MAE: 1.915846, R2: 0.910813

As we combine both feature addition and polynomial extension, the error is approximately 2, which is much better than the original model.

In [15]:
from pprint import pprint
from river import datasets
from river import model_selection
from river import linear_model

dataset = datasets.Bikes()

def get_hour(x):
    x['hour'] = x['moment'].hour
    return x


print(type(dataset))
cache_weather = set()
for x, y in dataset:
    if x['description'] not in cache_weather:
        cache_weather.add(x['description'])
# print(cache_weather)
# print(len(cache_weather))
weather_list = list(cache_weather)
print(weather_list)


def get_weather(x):
    if x['description'] in {'light intensity drizzle rain', 'very heavy rain', 'moderate rain', 'drizzle', 'thunderstorm with heavy rain', 'heavy intensity rain', 'mist', 'overcast clouds', 'light intensity drizzle', 'thunderstorm with light rain', 'thunderstorm', 'thunderstorm with rain', 'light rain'}:
        x['rain'] = 2
    else:
        x['rain'] = 1
    return x

cache_station = set()
for x, y in dataset:
    if x['station'] not in cache_station:
        cache_station.add(x['station'])
# print(cache_station)
# print(len(cache_station))
station_list = list(cache_station)
# print(station_list)

def get_station(x):
    for station in station_list:
        if x['station'] == station:
            x[station] = 1
        else:
            x[station] = 0
    return x

def add_feature(x):
    get_weather(x)
    get_station(x)
    return x

models = [linear_model.LinearRegression(optimizer=optim.SGD(lr=lr)) for lr in [0.0001, 0.001, 1e-05, 0.01]]


model = add_feature
model |= compose.Discard('station', 'description', 'moment')
model += (
    get_hour |
    feature_extraction.TargetAgg(by=['station', 'hour'], how=stats.Mean())
)
model |= feature_extraction.PolynomialExtender(interaction_only=True)
model |= preprocessing.StandardScaler()
model |= model_selection.EpsilonGreedyRegressor(models,epsilon=0.1,decay=0.001,burn_in=100,seed=1)

metric = metrics.MSE()

evaluate.progressive_val_score(dataset, model, metric, print_every=20_000)

<class 'river.datasets.bikes.Bikes'>
['light intensity drizzle rain', 'scattered clouds', 'broken clouds', 'overcast clouds', 'mist', 'moderate rain', 'clear sky', 'light rain', 'light intensity drizzle', 'thunderstorm with heavy rain', 'very heavy rain', 'drizzle', 'Sky is Clear', 'heavy intensity rain', 'few clouds', 'thunderstorm with rain', 'thunderstorm with light rain', 'thunderstorm']
[20,000] MSE: 6.70268
[40,000] MSE: 5.890454
[60,000] MSE: 5.884922
[80,000] MSE: 6.290002
[100,000] MSE: 6.648304
[120,000] MSE: 6.705402
[140,000] MSE: 6.841905
[160,000] MSE: 7.016975
[180,000] MSE: 7.133147


MSE: 7.13924

In [None]:
from pprint import pprint
from river import datasets
from river import model_selection
from river import linear_model

dataset = datasets.Bikes()

def get_hour(x):
    x['hour'] = x['moment'].hour
    return x


print(type(dataset))
cache_weather = set()
for x, y in dataset:
    if x['description'] not in cache_weather:
        cache_weather.add(x['description'])
# print(cache_weather)
# print(len(cache_weather))
weather_list = list(cache_weather)
print(weather_list)


def get_weather(x):
    if x['description'] in {'light intensity drizzle rain', 'very heavy rain', 'moderate rain', 'drizzle', 'thunderstorm with heavy rain', 'heavy intensity rain', 'mist', 'overcast clouds', 'light intensity drizzle', 'thunderstorm with light rain', 'thunderstorm', 'thunderstorm with rain', 'light rain'}:
        x['rain'] = 2
    else:
        x['rain'] = 1
    return x

cache_station = set()
for x, y in dataset:
    if x['station'] not in cache_station:
        cache_station.add(x['station'])
# print(cache_station)
# print(len(cache_station))
station_list = list(cache_station)
# print(station_list)

def get_station(x):
    for station in station_list:
        if x['station'] == station:
            x[station] = 1
        else:
            x[station] = 0
    return x

def add_feature(x):
    get_weather(x)
    get_station(x)
    return x

models = [linear_model.LinearRegression(optimizer=optim.SGD(lr=lr)) for lr in [0.0001, 0.001, 1e-05, 0.01]]
metric = metrics.MAE()

model = add_feature
model |= compose.Discard('station', 'description', 'moment')
model += (
    get_hour |
    feature_extraction.TargetAgg(by=['station', 'hour'], how=stats.Mean())
)
model |= feature_extraction.PolynomialExtender(interaction_only=True)
model |= preprocessing.StandardScaler()
model |= model_selection.GreedyRegressor(models, metric)

metric = metrics.MSE()

evaluate.progressive_val_score(dataset, model, metric, print_every=20_000)

In [None]:
from pprint import pprint
from river import datasets
from river import model_selection
from river import linear_model
from river import neural_net as nn

dataset = datasets.Bikes()

def get_hour(x):
    x['hour'] = x['moment'].hour
    return x


print(type(dataset))
cache_weather = set()
for x, y in dataset:
    if x['description'] not in cache_weather:
        cache_weather.add(x['description'])
# print(cache_weather)
# print(len(cache_weather))
weather_list = list(cache_weather)
# print(weather_list)


def get_weather(x):
    if x['description'] in {'light intensity drizzle rain', 'very heavy rain', 'moderate rain', 'drizzle', 'thunderstorm with heavy rain', 'heavy intensity rain', 'mist', 'overcast clouds', 'light intensity drizzle', 'thunderstorm with light rain', 'thunderstorm', 'thunderstorm with rain', 'light rain'}:
        x['rain'] = 2
    else:
        x['rain'] = 1
    return x

cache_station = set()
for x, y in dataset:
    if x['station'] not in cache_station:
        cache_station.add(x['station'])
# print(cache_station)
# print(len(cache_station))
station_list = list(cache_station)
# print(station_list)

def get_station(x):
    for station in station_list:
        if x['station'] == station:
            x[station] = 1
        else:
            x[station] = 0
    return x

def add_feature(x):
    get_weather(x)
    get_station(x)
    return x

models = [linear_model.LinearRegression(optimizer=optim.SGD(lr=lr)) for lr in [0.0001, 0.001, 1e-05, 0.01]]
metric = metrics.MAE()

model = add_feature
model |= compose.Discard('station', 'description', 'moment')
model += (
    get_hour |
    feature_extraction.TargetAgg(by=['station', 'hour'], how=stats.Mean())
)
model |= feature_extraction.PolynomialExtender(interaction_only=True)
model |= preprocessing.StandardScaler()
model |= nn.MLPRegressor(hidden_dims=(5,), activations=(nn.activations.ReLU, nn.activations.ReLU, nn.activations.Identity)

metric = metrics.MSE()

evaluate.progressive_val_score(dataset, model, metric, print_every=20_000)