# BinaryContextTransformer Examples

In [1]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.feature_extraction.text import CountVectorizer
from binarycontexttransformer import BinaryContextTransformer

In [2]:
# Toy corpus: every record is a (message type, message text) pair.
data = [
    ("text", "text me if ur doing anything 2nite"),
    ("tweet", "Holla! Anyone doing anything tonight?"),
    ("email", "Sent you a text. What are you doing tonight?"),
]
# Column names follow the tuple layout above.
df = pd.DataFrame.from_records(data, columns=["type", "message"])
df

Unnamed: 0,type,message
0,text,text me if ur doing anything 2nite
1,tweet,Holla! Anyone doing anything tonight?
2,email,Sent you a text. What are you doing tonight?


In [3]:
# One binary bag-of-words vectorizer per column: each output cell records
# whether a token is present, not how often it occurs.
vzr_type = CountVectorizer(analyzer="word", binary=True)
vzr_msg = CountVectorizer(analyzer="word", binary=True)

# Fit each vectorizer on its own column and keep the sparse matrices.
X_type = vzr_type.fit_transform(df["type"])
X_msg = vzr_msg.fit_transform(df["message"])

In [4]:
# Place the type columns and the message columns side by side in one matrix;
# the type columns come first because X_type is listed first.
X_all = sp.sparse.hstack([X_type, X_msg])
# Densify only for display; this example is tiny.
X_all.todense()

matrix([[0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0],
        [0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0],
        [1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1]], dtype=int64)

In [5]:
# Print the transformer's docstring and method signatures for reference.
help(BinaryContextTransformer)

Help on class BinaryContextTransformer in module binarycontexttransformer:

class BinaryContextTransformer(sklearn.base.TransformerMixin)
 |  Expands base features into interaction terms when they appear with
 |  different context features. Both base features and context features
 |  must be binary.
 |  
 |  Method resolution order:
 |      BinaryContextTransformer
 |      sklearn.base.TransformerMixin
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, features, contexts, progress=None)
 |      Args:
 |          features: names of base features
 |          contexts: names of context features
 |          progress: function of format progress_fn(iter, total) that takes
 |              an iterable and an integer with the total number of items and
 |              returns a generator to track progress at each step of the
 |              iterable (default=None)
 |  
 |  fit(self, X, X_context)
 |      Args:
 |          X: matrix of base feature columns
 |         

In [6]:
import time


def progress_bar(iter, total):
    """Yield the items of ``iter`` unchanged while printing a step counter.

    Follows the ``progress_fn(iter, total)`` format that
    BinaryContextTransformer accepts for its ``progress`` argument.

    Args:
        iter: iterable to walk through (the name shadows the builtin; it is
            kept so the signature stays the same).
        total: number of items expected; used only in the printed messages.

    Yields:
        Each item of ``iter`` in order. Before every item a line
        ``"<step>/<total>"`` is printed, with ``step`` counting from zero.
        Once the iterable is exhausted, a summary line with the elapsed
        wall-clock time is printed.
    """
    began = time.time()
    step = 0
    for item in iter:
        print("{}/{}".format(step, total))
        yield item
        step += 1
    # Reached only when the consumer drains the generator completely.
    elapsed = time.time() - began
    print("Ran {} iterations in {:.1f} secs.".format(total, elapsed))

In [7]:
# Message tokens are the base features and message types are the context
# features; progress_bar is supplied as the progress callback.
bct = BinaryContextTransformer(
    features=vzr_msg.get_feature_names(),
    contexts=vzr_type.get_feature_names(),
    progress=progress_bar
)
# Fit with the base matrix (X_msg) and the context matrix (X_type), keeping
# the transformed result.
X_msg_type = bct.fit_transform(X_msg, X_type)

0/3
1/3
2/3
Ran 3 iterations in 0.0 secs.
0/4
1/4
2/4
3/4
Ran 4 iterations in 0.0 secs.
0/3
1/3
2/3
Ran 3 iterations in 0.0 secs.


In [8]:
# Summary statistics for the toy example.
N = len(data)  # records
M = len(vzr_type.get_feature_names())  # context features (message types)
F = len(vzr_msg.get_feature_names())  # base features (message tokens)
# X_msg is binary, so its sum is the count of (record, token) hits.
n_possible = X_msg.sum()
# Interaction features the fitted transformer actually produced.
n_actual = len(bct.get_feature_names())
print("N: Number of Records = {}".format(N))
# The labels for M and F previously repeated "Number of Records".
print("M: Number of Context Features = {}".format(M))
print("F: Number of Base Features = {}".format(F))
# NOTE(review): this divides the count of ones in X_all (type and message
# columns together) by M * F. Confirm that is the intended definition of
# sparsity; a density of X_all would divide by N * (M + F) instead.
print("S: Sparsity = {:.3f}".format(X_all.sum() / (M * F)))
print("Maximum Interactions = {}".format(M * F))
print("Possible Interactions = {}".format(n_possible))
print("Actual Interactions = {}".format(n_actual))

N: Number of Records = 3
M: Number of Records = 3
F: Number of Records = 14
S: Sparsity = 0.524
Maximum Interactions = 42
Possible Interactions = 19
Actual Interactions = 9


In [9]:
# Dense view of the matrix returned by fit_transform.
print(X_msg_type.todense())

[[1 0 0 1 0 0 1 0 0]
 [0 1 0 0 1 0 0 0 1]
 [0 0 1 0 0 1 0 1 0]]


In [10]:
# Names of the interaction features produced by the fitted transformer.
bct.get_feature_names()

['text_x_anything',
 'tweet_x_anything',
 'email_x_doing',
 'text_x_doing',
 'tweet_x_doing',
 'email_x_text',
 'text_x_text',
 'email_x_tonight',
 'tweet_x_tonight']

In [11]:
# Index pairs behind the interaction features; each pair holds a context
# feature index followed by a base feature index.
bct.col_pairs

[(1, 2), (2, 2), (0, 4), (1, 4), (2, 4), (0, 9), (1, 9), (0, 10), (2, 10)]

In [12]:
# Translate every (context index, base feature index) pair back into the
# tokens it stands for.
type_features = vzr_type.get_feature_names()
msg_features = vzr_msg.get_feature_names()
for context_idx, feature_idx in bct.col_pairs:
    context_name = type_features[context_idx]
    feature_name = msg_features[feature_idx]
    print("{} x {}".format(context_name, feature_name))

text x anything
tweet x anything
email x doing
text x doing
tweet x doing
email x text
text x text
email x tonight
tweet x tonight


In [13]:
# Mapping from each interaction feature name to its index.
bct.vocabulary

{'email_x_doing': 2,
 'email_x_text': 5,
 'email_x_tonight': 7,
 'text_x_anything': 0,
 'text_x_doing': 3,
 'text_x_text': 6,
 'tweet_x_anything': 1,
 'tweet_x_doing': 4,
 'tweet_x_tonight': 8}