In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

## Load the data

In [213]:
df = pd.read_pickle('../test/hypebeast_clean.pkl')
df.head(3)

Unnamed: 0_level_0,article,category,comments,hypes,keywords,title,popularity
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-06-06,"After collaborations with DELUXE and BILLY’s, ...",[Footwear],[These are pretty much the same colors that've...,3639,"[Vans, Vans Era, Vans SK8-HI, Vans Old Skool, ...","Vans Keeps it Simple With New ""Color Theory"" R...",cold
2018-06-06,Taking to London Fashion Week: Men’s to show o...,[Footwear],"[yike and yawn 2, As far as big logo sneakers ...",1046,"[Nike, Nike Air Max Plus, London Fashion Week ...",ALCH Studio Displays Unseen Nike Air Max Plus ...,cold
2018-06-13,Adding to its list of anticipated E3 2018 game...,[Entertainment],[it's all about yoshimitsu],1422,"[E3, Video Games, Trailers, Sony Playstation 4...",'SOULCALIBUR VI' Set to Release With a Collect...,cold


In [217]:
# Select the training data
df = df.drop('2018-6-6')
df = df['2014':'2018']
df = df.sort_index()
df.shape

In [221]:
X =  df.title.values + df.article.values
y = df.popularity.values

## Feture Selection

In [223]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

In [224]:
NGRAM_RANGE = (1,2)
TOP_K = 20000
TOKEN_MODE = 'word'
MIN_DOCUMENT_FREQUENCY = 2

In [225]:
def ngram_vectorize(train_texts, train_labels):
    """
    Convert the text to the combination of unigram and bigram and select the top_k features
    
    Args:
        train_texts: original text
        train_labels: the ground truth
    Returns:
        X_train: the vectorized training data
    """
    kwargs = {
            'ngram_range': NGRAM_RANGE,
            'dtype': 'int32',
            'strip_accents': 'unicode',
            'decode_error': 'replace',
            'analyzer': TOKEN_MODE,
            'min_df': MIN_DOCUMENT_FREQUENCY
    }
    
    vectorizer = TfidfVectorizer(**kwargs)
    
    X_train = vectorizer.fit_transform(train_texts)
    
    selector = SelectKBest(f_classif, k=min(TOP_K, X_train.shape[1]))
    selector.fit(X_train, train_labels)
    X_train = selector.transform(X_train).astype('float32')
    
    return X_train

In [226]:
X_train = ngram_vectorize(X, y)

## Concept Drift Detection

In [228]:
from sklearn.naive_bayes import MultinomialNB

### Base Learner

In [243]:
bl_clf = MultinomialNB(alpha=0.1)
bl_pred = []
bl_clf.partial_fit(X_train[:30, :], y[:30], classes=[0,1,2])

for i in range(31,X_train.shape[0]):
    bl_pred.append(bl_clf.predict_proba(X_train[i,:]))
    bl_clf.partial_fit(X_train[i,:], [y[i]])
    #bl_pred.append(bl_clf.predict_proba(X_train[i+1,:]))
    
    if i%1000==0:
        print(i,':',bl_pred[-1], y[i+1])  

1000 : [[ 0.87134688  0.12671007  0.00194305]] 0
2000 : [[ 0.42178317  0.57750408  0.00071275]] 1
3000 : [[  9.89628525e-01   1.02117627e-02   1.59712605e-04]] 1
4000 : [[ 0.28826562  0.70536591  0.00636847]] 1
5000 : [[  9.75029165e-01   2.44228133e-02   5.48021517e-04]] 0
6000 : [[ 0.70622463  0.28551     0.00826537]] 1
7000 : [[ 0.2018634   0.79429248  0.00384412]] 0
8000 : [[ 0.18932837  0.80182705  0.00884458]] 0
9000 : [[  9.81442862e-01   1.78237252e-02   7.33412737e-04]] 0
10000 : [[ 0.23492397  0.7593545   0.00572153]] 0
11000 : [[ 0.51486121  0.47662654  0.00851225]] 1
12000 : [[ 0.08082561  0.91803268  0.00114171]] 1
13000 : [[  9.87916020e-01   1.18650620e-02   2.18918316e-04]] 1
14000 : [[ 0.72010112  0.27524313  0.00465575]] 1
15000 : [[  9.92831844e-01   6.76010478e-03   4.08050894e-04]] 0
16000 : [[ 0.81522176  0.17899864  0.0057796 ]] 0
17000 : [[ 0.88627929  0.10720789  0.00651282]] 0
18000 : [[ 0.98345267  0.01525267  0.00129466]] 0
19000 : [[ 0.35838986  0.53616941 

In [244]:
y_pred = np.array(bl_pred).reshape([-1,3])
np.save('../result/concept_drift/y_pred_base_learner.npy',y_pred)

In [237]:
def p2i(p):
    """
    One-hot encoding label
    Args:
        p: ground truth
    Returns:
        One-hot encoding labels
    """
    if p=='cold':
        return 0
    elif p=='medium':
        return 1
    else:
        return 2

In [None]:
y_true = [p2i(p) for p in y[32:]]
np.save('../result/concept_drift/labels.npy', np.array(y_true))

### Drift Detection Framework

The detailed explanation of the source code is shown at DDM.py

In [267]:
p_min = 200
s_min = 200
p_con = -100
store_token = None
start_token = 0
prob = []
stat = []
n = 50
w = 1.5
d = 2.5

In [268]:
clf = MultinomialNB(alpha=0.1)
clf.partial_fit(X_train[:n,:], y[:n], classes=[0,1,2])

for end_token in range(n+1, X_train.shape[0]):
    y_pred = clf.predict_proba(X_train[end_token,:])
    
    clf.partial_fit(X_train[end_token,:], [y[end_token]], classes=[0,1,2])
    
    er = 1 - y_pred[0][y[end_token]]
    std = np.sqrt((1-er)*er/(end_token-start_token))
    
    s = 0
    
    if er<p_min:
        p_min = er
        s_min = std
        #print("Minimum: ", end_token, p_min, s_min)
        
    if er + std >= p_min + w* s_min and store_token == None:
        stoken_token = end_token
        p_con = er
        #print("Confidence level:", end_token, er, std)
        s=1
    if er < p_con:
        store_token = None
        p_con = -100
        #print("False alarm:" , end_token)
        s = 2
    if er+std >= p_min + d*s_min:
        if store_token == None:
            start_token = end_token - n
        else:
            start_token = store_token
            if end_token - store_token < n:
                start_token = end_token - n
        p_min = 200
        s_min = 200
        s = 3
        
        clf = MultinomialNB(alpha=0.1)
        clf.partial_fit(X_train[start_token:end_token, :], y[start_token:end_token], classes=[0,1,2])
        
    prob.append(y_pred)
    stat.append(s)

  self.class_log_prior_ = (np.log(self.class_count_) -


In [253]:
def save_detector(name, prob, stat):
    stat = np.array(stat)
    prob = np.array(prob).reshape([-1,3])
    np.save('../result/concept_drift/{}/stat.npy'.format(name), stat)
    np.save('../result/concept_drift/{}/prob.npy'.format(name), prob)

In [271]:
save_detector('detector_4',prob, stat)