# Incremental Machine Learning with River

In [1]:
import river
import pandas as pd

In [2]:
# Inspect which names the top-level `river` package exposes right after a bare `import river`.
dir(river)

['__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '__version__',
 'annotations']

In [3]:
from river.linear_model import LogisticRegression
from river.naive_bayes import MultinomialNB
from river.feature_extraction import BagOfWords,TFIDF

## Load Estimators

In [5]:
def get_all_attributes(package):
    """Tabulate the attribute names of every listed member of ``package``.

    Every name in ``dir(package)`` that is not one of the excluded module
    dunders becomes a column; the column holds ``dir()`` of that member.

    Parameters
    ----------
    package : module
        The imported package (or any object) whose members are inspected.

    Returns
    -------
    pandas.DataFrame
        One column per member, one row per position in the member's
        ``dir()`` listing. Rows with any missing value are dropped, so the
        frame is cut to the length of the shortest listing.
    """
    excluded = {
        "__all__", "__builtins__", "__cached__", "__doc__", "__file__",
        "__loader__", "__name__", "__package__", "__path__", "__pdoc__",
        "__spec__", "__version__",
    }
    subpackages = [name for name in dir(package) if name not in excluded]
    # Resolve members on the argument itself; the previous eval() of a
    # hard-coded "river.<name>" string ignored `package` entirely.
    submodules = [dir(getattr(package, name)) for name in subpackages]
    # Each listing is a row here; transpose so each member becomes a column.
    df = pd.DataFrame(submodules).T
    df.columns = subpackages
    return df.dropna()

In [6]:
# Build the member/attribute table for the river package.
river_df = get_all_attributes(river)

In [7]:
# Display the table: one column per river member, listing that member's attribute names.
river_df

Unnamed: 0,annotations,base,covariance,feature_extraction,linear_model,naive_bayes,optim,proba,stats,utils
0,__class__,Base,EmpiricalCovariance,Agg,ALMAClassifier,BernoulliNB,AMSGrad,Beta,AbsMax,Rolling
1,__delattr__,BinaryDriftAndWarningDetector,EmpiricalPrecision,BagOfWords,BayesianLinearRegression,ComplementNB,AdaBound,Gaussian,AutoCorr,SortedWindow
2,__dict__,BinaryDriftDetector,__all__,PolynomialExtender,LinearRegression,GaussianNB,AdaDelta,Multinomial,BayesianMean,TimeRolling
3,__dir__,Classifier,__builtins__,RBFSampler,LogisticRegression,MultinomialNB,AdaGrad,MultivariateGaussian,Count,VectorDict
4,__doc__,Clusterer,__cached__,TFIDF,PAClassifier,__all__,AdaMax,__all__,Cov,__all__
5,__eq__,DriftAndWarningDetector,__doc__,TargetAgg,PARegressor,__builtins__,Adam,__builtins__,EWMean,__builtins__
6,__format__,DriftDetector,__file__,__all__,Perceptron,__cached__,Averager,__cached__,EWVar,__cached__
7,__ge__,Ensemble,__loader__,__builtins__,SoftmaxRegression,__doc__,FTRLProximal,__doc__,Entropy,__doc__
8,__getattribute__,Estimator,__name__,__cached__,__all__,__file__,Momentum,__file__,IQR,__file__
9,__gt__,MiniBatchClassifier,__package__,__doc__,__builtins__,__loader__,Nadam,__loader__,KolmogorovSmirnov,__loader__


#### Requirement
+ list of tuples
+ dictionary
+ CSV
    - list of tuples or dictionary record
    - iter_csv
    - iter_pandas

In [8]:
# Toy dataset: predict whether a text is hardware- or software-related.
# Every record is a (text, label) tuple.
data = [
    ("my unit test failed", "software"),
    ("tried the program, but it was buggy", "software"),
    ("i need a new power supply", "hardware"),
    ("the drive has a 2TB capacity", "hardware"),
    ("unit-tests", "software"),
    ("program", "software"),
    ("power supply", "hardware"),
    ("drive", "hardware"),
    ("it needs more memory", "hardware"),
    ("check the API", "software"),
    ("design the API", "software"),
    ("they need more CPU", "hardware"),
    ("code", "software"),
    ("i found some bugs in the code", "software"),
    ("i swapped the memory", "hardware"),
    ("i tested the code", "software"),
]

# Separate sentences that the cells further down use for evaluation.
test_data = [
    ("he writes code daily", "software"),
    ("the disk is faulty", "hardware"),
    ("refactor the code", "software"),
    ("no empty space on the drive", "hardware"),
]

### Text classification
+ vectorize the text
  - CountVectorizer/ BagOfWords
  - TFIDF
+ build model on the go

## Make a Pipeline

In [14]:
# from river.compose import Pipeline
from river import compose

In [18]:
from river.compose import Pipeline

In [19]:
# Chain a lower-casing bag-of-words vectorizer into a multinomial naive Bayes classifier.
pipe_nb = Pipeline(
    ('vectorizer', BagOfWords(lowercase=True)),
    ('nb', MultinomialNB()),
)

### Visualize pipeline

In [11]:
# Bare expression so the notebook renders the pipeline's rich representation.
pipe_nb

In [12]:
# Inspect the pipeline's named steps and the parameters each one was built with.
pipe_nb.steps

OrderedDict([('vectorizer',
              BagOfWords (
                on=None
                strip_accents=True
                lowercase=True
                preprocessor=None
                stop_words=None
                tokenizer_pattern="(?u)\b\w[\w\-]+\b"
                tokenizer=None
                ngram_range=(1, 1)
              )),
             ('nb',
              MultinomialNB (
                alpha=1.
              ))])

### Fit on data

+ Learn one at a time
    - `learn_one` (river) / `fit_one` (creme)
    - `predict_one`

In [20]:
# Train incrementally: hand the pipeline one (text, label) example at a time.
for sentence, category in data:
    pipe_nb.learn_one(sentence, category)

In [22]:
# Predict the label of a single sentence that is not part of the training data.
pipe_nb.predict_one("I built an API")

'software'

In [23]:
# Second example: a hardware-flavoured sentence.
# NOTE(review): the recorded output for this cell is 'software', i.e. the model
# gets this one wrong — presumably because the training set is tiny; confirm.
pipe_nb.predict_one("the hard drive in the computer is damaged")

'software'

### Predict probability

In [27]:
# Per-class probabilities for the first example, rather than only the winning label.
pipe_nb.predict_proba_one("I built an API")

{'software': 0.7416584917539436, 'hardware': 0.2583415082460559}

In [28]:
# Per-class probabilities for the hardware-flavoured sentence.
pipe_nb.predict_proba_one("the hard drive in the computer is damaged")

{'software': 0.6093600696209751, 'hardware': 0.39063993037902633}

### Evaluation & classification report
+ Accuracy
+ Precision, recall and F1 on the predictions

In [29]:
# Show the evaluation examples as (text, label) tuples.
test_data

[('he writes code daily', 'software'),
 ('the disk is faulty', 'hardware'),
 ('refactor the code', 'software'),
 ('no empty space on the drive', 'hardware')]

In [32]:
# Predict every test sentence, echo "<text>: <prediction>", and keep the predictions.
y_pred = []
for x, y in test_data:
    # Predict once and reuse the result; the earlier version called
    # predict_one twice per sentence (once to print, once to store).
    res = pipe_nb.predict_one(x)
    print(f"{x}: {res}")
    y_pred.append(res)

he writes code daily: software
the disk is faulty: software
refactor the code: software
no empty space on the drive: hardware


In [42]:
# Classification
from river import metrics

In [43]:
# Report object that is fed (true label, predicted label) pairs in a later cell and then printed.
report = metrics.ClassificationReport()

In [44]:
# Collect the true test labels alongside the model's predictions.
y_pred = []
y_test = []
for x, y in test_data:
    # Predict once and reuse the result; the earlier version called
    # predict_one twice per sentence (once to print, once to store).
    res = pipe_nb.predict_one(x)
    print(f"{x}: {res}")
    y_pred.append(res)
    y_test.append(y)

he writes code daily: software
the disk is faulty: software
refactor the code: software
no empty space on the drive: hardware


In [36]:
# True labels on the first line, predicted labels on the second.
print(y_test, y_pred, sep="\n")

['software', 'hardware', 'software', 'hardware']
['software', 'software', 'software', 'hardware']


In [45]:
# Feed each (true label, predicted label) pair into the classification report.
for truth, predicted in zip(y_test, y_pred):
    report.update(truth, predicted)

In [46]:
# print() shows the report as a plain-text table (per-class rows plus macro/micro/weighted averages).
print(report)

           Precision   Recall    F1       Support  
                                                   
hardware     100.00%    50.00%   66.67%         2  
software      66.67%   100.00%   80.00%         2  
                                                   
   Macro      83.33%    75.00%   73.33%            
   Micro      75.00%    75.00%   75.00%            
Weighted      83.33%    75.00%   73.33%            

                  75.00% accuracy                  


In [54]:
# Test-then-train on the test data: predict first, score that prediction,
# and only then let the model learn from the example.
# NOTE(review): the recorded result for this cell is 100%, while the
# classification report above shows 75% on the same sentences. The jump in
# execution counter suggests this cell was re-run after the model had already
# learned these examples — confirm with Restart Kernel -> Run All.
metric = metrics.Accuracy()  # use the `metrics` name imported above, not the implicit river.metrics attribute
for text, label in test_data:
    y_pred_before = pipe_nb.predict_one(text)
    metric.update(label, y_pred_before)
    pipe_nb.learn_one(text, label)

In [55]:
# Show the accuracy accumulated over the test-data pass.
metric

Accuracy: 100.00%

In [56]:
# Same predict -> score -> learn loop, this time over `data`.
# NOTE: the pipeline was already trained on every example in `data` by the
# earlier learn_one loop, so this accuracy is measured on sentences the model
# has seen and is not a held-out score.
metric2 = metrics.Accuracy()  # use the `metrics` name imported above, not the implicit river.metrics attribute
for text, label in data:
    y_pred_before = pipe_nb.predict_one(text)
    metric2.update(label, y_pred_before)
    pipe_nb.learn_one(text, label)

In [57]:
# Show the accuracy accumulated over the second pass across `data`.
metric2

Accuracy: 100.00%