In [1]:
!pip install -q openml


[notice] A new release of pip is available: 23.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [9]:
import argparse
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torchvision
from torchvision.transforms import Compose, ToTensor, Resize
from torch import optim
import numpy as np
import pandas as pd
from torch.hub import tqdm
import matplotlib.pyplot as plt
import os
from dotenv import load_dotenv

load_dotenv()

## uncomment if you want to ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [1]:
from datasets import OpenmlDataset, OpenmlDatasetLoader
from datasets import dataset_train_test_split
from preencoder import PreEncoder 
import numpy as np
from models import ToyModel          
### example task_ids ###

# 361066 bank-marketing
# 361076 wine_quality
# 361085 sulfur
# 361088 superconduct
# 361089 california
# 361110 electricity
# 361111 eye_movements
# 361112 KDDCup09_upselling
# 361114 rl
# 361116 compass
# 361099 Bike_Sharing_Demand
# 361102 house_sales

task_id =  361099 #361076

dataset_loader = OpenmlDatasetLoader()
dataset = dataset_loader.load(task_id)
dataset.print_info()


-------------------------------
Dataset name: Bike_Sharing_Demand
n_samples: 17379
m_features: 11 (including 5 categorical features)
Task: regression
-------------------------------


In [2]:
dataset.X

Unnamed: 0,season,year,month,hour,holiday,workingday,weather,temp,feel_temp,humidity,windspeed
0,1,0,1,0,0,0,0,9.84,14.395,0.81,0.0000
1,1,0,1,1,0,0,0,9.02,13.635,0.80,0.0000
2,1,0,1,2,0,0,0,9.02,13.635,0.80,0.0000
3,1,0,1,3,0,0,0,9.84,14.395,0.75,0.0000
4,1,0,1,4,0,0,0,9.84,14.395,0.75,0.0000
...,...,...,...,...,...,...,...,...,...,...,...
17374,1,1,12,19,0,1,2,10.66,12.880,0.60,11.0014
17375,1,1,12,20,0,1,2,10.66,12.880,0.60,11.0014
17376,1,1,12,21,0,1,0,10.66,12.880,0.60,11.0014
17377,1,1,12,22,0,1,0,10.66,13.635,0.56,8.9981


In [9]:
# comme nous sommes en static embedding, on prefere avoir un set de valeurs de chaque colonne

features = {}
for i in dataset.X.columns :
    features[i] = set(dataset.X[i].values)


Les "phrases" ici sont les row du dataset, donc pour un element quelconque, avoir la liste de ses lignes d'occurence, prendre les valeurs de sa fenetre contextuelle et constituer ses positives examples; 

In [10]:
vocab = []
for k,v in features.items() : 
    for i in v:
        vocab.append((i,k))
print(vocab)

[('3', 'season'), ('2', 'season'), ('1', 'season'), ('0', 'season'), ('1', 'year'), ('0', 'year'), (1, 'month'), (2, 'month'), (3, 'month'), (4, 'month'), (5, 'month'), (6, 'month'), (7, 'month'), (8, 'month'), (9, 'month'), (10, 'month'), (11, 'month'), (12, 'month'), (0, 'hour'), (1, 'hour'), (2, 'hour'), (3, 'hour'), (4, 'hour'), (5, 'hour'), (6, 'hour'), (7, 'hour'), (8, 'hour'), (9, 'hour'), (10, 'hour'), (11, 'hour'), (12, 'hour'), (13, 'hour'), (14, 'hour'), (15, 'hour'), (16, 'hour'), (17, 'hour'), (18, 'hour'), (19, 'hour'), (20, 'hour'), (21, 'hour'), (22, 'hour'), (23, 'hour'), ('1', 'holiday'), ('0', 'holiday'), ('1', 'workingday'), ('0', 'workingday'), ('3', 'weather'), ('2', 'weather'), ('1', 'weather'), ('0', 'weather'), (0.8200000000000001, 'temp'), (1.6400000000000001, 'temp'), (2.46, 'temp'), (3.2800000000000002, 'temp'), (4.92, 'temp'), (5.74, 'temp'), (6.5600000000000005, 'temp'), (7.38, 'temp'), (8.200000000000001, 'temp'), (9.84, 'temp'), (9.02, 'temp'), (10.66, '

In [11]:
N = len(vocab)
emb = np.zeros(N)
d=5

les lignes des matrices de poids sont les embeddings que nous cherchons à obtenir.
Input de taille N\*N et matrice de poids de la Layer 1 est  de taille N*d, d étant la taille de l'mebdding qu'on souhaite avoir; 2e matrice de poids de taille d\*N. Dernier layer un softmax

In [12]:
np.random.seed(42)
def init_network(vocab_size, n_embedding):
    model = {
        "w1": np.random.randn(N, d),
        "w2": np.random.randn(d, N)
    }
    return model

In [13]:
def softmax(X):
    res = []
    for x in X:
        exp = np.exp(x)
        res.append(exp / exp.sum())
    return res

def forward(model, X, return_cache=True):
    cache = {}
    
    cache["a1"] = X @ model["w1"]
    cache["a2"] = cache["a1"] @ model["w2"]
    cache["z"] = softmax(cache["a2"])
    
    if not return_cache:
        return cache["z"]
    return cache


def cross_entropy(z, y):
    return - np.sum(np.log(z) * y)

def backward(model, X, y, alpha):
    cache  = forward(model, X)
    da2 = cache["z"] - y
    dw2 = cache["a1"].T @ da2
    da1 = da2 @ model["w2"].T
    dw1 = X.T @ da1
    assert(dw2.shape == model["w2"].shape)
    assert(dw1.shape == model["w1"].shape)
    model["w1"] -= alpha * dw1
    model["w2"] -= alpha * dw2
    return cross_entropy(cache["z"], y)

In [8]:
text = '''Machine learning is the study of computer algorithms that \
improve automatically through experience. It is seen as a \
subset of artificial intelligence. Machine learning algorithms \
build a mathematical model based on sample data, known as \
training data, in order to make predictions or decisions without \
being explicitly programmed to do so. Machine learning algorithms \
are used in a wide variety of applications, such as email filtering \
and computer vision, where it is difficult or infeasible to develop \
conventional algorithms to perform the needed tasks.'''

import re

def tokenize(text):
    pattern = re.compile(r'[A-Za-z]+[\w^\']*|[\w^\']*[A-Za-z]+[\w^\']*')
    return pattern.findall(text.lower())

In [59]:
def mapping(tokens):
    word_to_id = {}
    id_to_word = {}
    
    for i, token in enumerate(tokens):
        word_to_id[token] = i
        id_to_word[i] = token
    
    return word_to_id, id_to_word

In [60]:
word_to_id, id_to_word = mapping(vocab)

def concat(*iterables):
    for iterable in iterables:
        yield from iterable

def one_hot_encode(id, vocab_size):
    res = [0] * vocab_size
    res[id] = 1
    return res

In [61]:

def generate_training_data(tokens, word_to_id, window):
    X = []
    y = []
    n_tokens = len(tokens)
    
    for i in range(n_tokens):
        idx = concat(
            range(max(0, i - window), i), 
            range(i, min(n_tokens, i + window + 1))
        )
        
        for j in idx:
            if i == j:
                continue
            X.append(one_hot_encode(word_to_id[tokens[i]], len(word_to_id)))
            y.append(one_hot_encode(word_to_id[tokens[j]], len(word_to_id)))
    
    return np.asarray(X), np.asarray(y)

In [62]:
X, y = generate_training_data(vocab, word_to_id, 2)
X.shape

In [64]:
model = init_network(len(word_to_id), 10)

In [66]:
(X @ model["w1"] @ model["w2"]).shape

(1130, 284)

In [69]:
vocab[:5]

[('1', 'season'),
 ('3', 'season'),
 ('0', 'season'),
 ('2', 'season'),
 ('1', 'year')]

In [68]:
n_iter = 50
learning_rate = 0.05

history = [backward(model, X, y, learning_rate) for _ in range(n_iter)]


In [None]:
learning = one_hot_encode(word_to_id[('1', 'year')], len(word_to_id))
result = forward(model, [learning], return_cache=False)[0]

for word in (id_to_word[id] for id in np.argsort(result)[::-1]):
    print(word)

In [74]:
model["w1"]

array([[ 0.24821374, -0.05306013,  2.48466276, -2.0990956 , -1.4571657 ],
       [-0.03794926,  1.68185175,  1.96142912, -2.89671142, -1.52873487],
       [-0.63450616,  0.66431115,  2.27075283, -2.22860081, -2.39177609],
       ...,
       [ 0.84551666,  0.78100398,  2.79673533,  1.05317667,  1.70565515],
       [ 0.83825526,  0.44006153,  2.5116818 ,  2.7237128 ,  1.95792313],
       [ 1.10797885,  1.0960138 ,  2.04695966,  1.29541145,  2.49473848]])

Faire une deuxième lookup table pour un mapping (valeurs, features ) --> embeddings