In [1]:
# %%configure -f
# {
#  "executorCores": 4, 
#  "executorMemory": "47696M", 
#  "driverMemory": "47696M",
#  "conf": {
#           "spark.task.cpus": 1,
#           "spark.kryoserializer.buffer.max": "2047m",
#           "spark.driver.maxResultSize": "10g"
#          }
# }

In [1]:
import os
from pyspark.sql import SparkSession

import pyspark.sql.functions as F
import pyspark.sql.types as T

import smart_open

from scipy.sparse import csr_matrix
import scipy

import numpy as np

from itertools import islice

In [2]:
spark = (
    SparkSession.builder
     .master("local[*]")
     .appName("SparkTest")
     .config("spark.kryoserializer.buffer.max", "2047m")
     .config("spark.driver.maxResultSize", "10g")
     .getOrCreate()
     )

In [3]:
import numpy as np

from scipy.special import psi
from scipy.sparse import csr_matrix, hstack, lil_matrix, vstack

from gensim.matutils import mean_absolute_difference


class SimpleLDA:
    def __init__(self, word2id, 
                 industries, 
                 a=1.0, 
                 b=1.0, 
                 e_step_iter=50,
                 print_every=25,
                 estep_threshold=1e-5
                ):
        self.a = a
        self.b = b
        self.estep_threshold = estep_threshold
        
        self.word2id = word2id
        self.industries = industries
        id2word = {}
        for k,v in word2id.items():
            id2word[v] = k
        self.id2word = id2word
        self.e_step_iter = e_step_iter
        self.print_every = print_every
    
    def _e_step(self, X, metadata):
        import numpy
        from scipy.special import psi
        from scipy.sparse import csr_matrix, hstack, lil_matrix, vstack
        
        a,b = self.a, self.b
        D,V = X.shape
        q_theta = np.random.gamma(1.0, size=(D,2))
        q_z = lil_matrix((D,V))
        phi = self.phi

        for industry in range(len(self.industries)):
            ind = metadata==industry
            q_theta_ind = q_theta[ind,:]
            X_ind = X[ind]
            q_z_ind = lil_matrix(X_ind.shape)
            d,v = X_ind.nonzero()
            q_theta_ind_old = q_theta_ind.copy()
            
            for i in range(self.e_step_iter):
                # Compute q(z)
                coef = np.clip(psi(q_theta_ind[d,-1])-psi(np.sum(q_theta_ind[d], axis=1)), a_min=-100.0, a_max=100.0)
                
                bg_w = np.exp(coef)*phi[-1,v]
                ind_w = np.exp(-coef)*phi[industry,v]
                q_z_ind[d,v] = bg_w/(bg_w+ind_w+1e-9)

                # Compute q(theta)
                q_theta_ind[:,0] = a+np.sum(q_z_ind.multiply(X_ind), axis=1).ravel()
                q_theta_ind[:,1] = b+np.sum(X_ind-q_z_ind.multiply(X_ind), axis=1).ravel()
                
                if mean_absolute_difference(q_theta_ind.ravel(), q_theta_ind_old.ravel()) <= self.estep_threshold:
                    break
                q_theta_ind_old = q_theta_ind.copy()

            q_z[ind] = q_z_ind
            q_theta[ind] = q_theta_ind
        
        return q_z, q_theta
    
    def _m_step(self, X, q_z, metadata):
        # Why PySpark? Whyyyyyyyy?
        import numpy
        from scipy.special import psi
        from scipy.sparse import csr_matrix, hstack, lil_matrix, vstack
        
        industries = self.industries
        id2word = self.id2word
        _sstats = np.zeros(shape=(len(industries)+1, len(id2word)))

        for industry in range(len(industries)):
            ind = metadata==industry
            q_z_ind = q_z[ind]
            X_ind = X[ind]
            mlt = q_z_ind.multiply(X_ind)
            q_z_sm_bg = np.sum(mlt, axis=0)
            q_z_sm_ind = np.sum(X_ind - mlt, axis=0)

            # Background sstats
            _sstats[-1,:] = _sstats[-1,:] + q_z_sm_bg

            # Industry sstats
            _sstats[industry,:] = _sstats[industry,:] + q_z_sm_ind
        
        return _sstats
        
    def _update_phi(self):
        self.phi = self._sstats/(np.sum(self._sstats, axis=1, keepdims=True) + 1e-9)
        self._sstats = np.zeros(shape=(len(self.industries)+1, len(self.id2word)))

    def train(self, X, metadata, n_iter=50):
        industries = self.industries
        id2word = self.id2word
        self._sstats = np.zeros(shape=(len(industries)+1, len(id2word)))
        self.phi = np.random.dirichlet([1.0]*(len(id2word)), size=(len(industries)+1))

        for i in range(n_iter):
            if i%self.print_every == 0:
                print(i)
            q_z,_ = self._e_step(X, metadata)
            self._sstats = self._m_step(X, q_z, metadata)
            self._update_phi()
            
    def train_distributed(self, X_metadata_rdd, n_iter=50):
        industries = self.industries
        id2word = self.id2word
        self._sstats = np.zeros(shape=(len(industries)+1, len(id2word)))
        self.phi = np.random.dirichlet([1.0]*(len(id2word)), size=(len(industries)+1))
        
        for i in range(n_iter):
            print(i)
            self._sstats = (X_metadata_rdd
                             .mapPartitions(lambda u: [list(u)])
                             .map(lambda u: (vstack([el[0] for el in u], format="lil"), 
                                             np.array([el[1] for el in u])
                                             )
                                  )
                             .map(lambda line: (line[0], line[1], self._e_step(line[0], line[1])))
                             .map(lambda line: self._m_step(line[0], line[2][0], line[1]))
                             .treeReduce(lambda a,b: a+b, depth=3)
                            )
            self._update_phi()

In [4]:
word2id_path = "s3://onai-ml-dev-eu-west-1/company2vec/common"
data_path = "s3://onai-ml-dev-eu-west-1/company2vec/data_desc_only"
model_path = "s3://onai-ml-dev-eu-west-1/company2vec/model/probabilistic/"

In [20]:
companies_raw = (spark.read.load("s3://ai-data-lake-dev-eu-west-1/business/company_data_denormalized")
                 .filter(F.col("primary_sic_node_desc").isNotNull() &
                         (F.length("primary_sic_node_desc") > 0)
                        )
                 .join(spark.read.load(f"{data_path}/raw_company_features"),
                       on = ["entity_id"]
                      )
                 .select("bow_description", "primary_sic_node_desc")
                 )

In [6]:
industries = sorted([el[0] for el in companies_raw.select("primary_sic_node_desc").distinct().collect()])
industry2idx = {}
for i,el in enumerate(industries):
    industry2idx[el] = i

In [7]:
with smart_open.open(f"{model_path}/simple_LDA/industries", "w") as f:
    for industry,idd in sorted(industry2idx.items()):
        f.write(f"{industry},{idd}\n")

In [17]:
industry2idx = {}
with smart_open.open(f"{model_path}/simple_LDA/industries", "r") as f:
    for line in f:
        toks = line.strip().split(",")
        industry,idd = ",".join(toks[:-1]), toks[-1]
        industry2idx[industry] = int(idd)
industries = list(industry2idx.keys())

In [11]:
word2id = {}
with smart_open.open(f"{word2id_path}/bow/word2id.csv", "r") as f:
    for line in f:
        word,idd = line.strip().split(",")
        word2id[word] = int(idd)

In [15]:
len(industries)

1400

In [21]:
n_docs = companies_raw.count()
n_words = len(word2id)

In [22]:
def dict_to_sparse(word_dict):
    row = []
    col = []
    data = []
    for word,freq in word_dict.items():
        if word not in word2id:
            continue
        row.append(0)
        col.append(word2id[word])
        data.append(freq)
    return csr_matrix((data,(row,col)), shape=(1, len(word2id)))

In [23]:
X_metadata_rdd = (companies_raw
                  .limit(10000)
                  .rdd
                  .map(lambda row: (dict_to_sparse(row.bow_description), industry2idx[row.primary_sic_node_desc]))
                  .cache()
                  )

In [24]:
model = SimpleLDA(word2id, industries, print_every=25)

In [25]:
model.train_distributed(X_metadata_rdd, n_iter=50)

0
1


In [26]:
with smart_open.open(f"{model_path}/simple_LDA/parameters/phi.np", "wb") as f:
    np.save(f, model.phi, allow_pickle=False)

In [21]:
def plot_topics(model):
    phi = model.phi
    id2word = model.id2word

    for i,topic in enumerate(phi):
        topic_name = model.industries[i] if i < len(model.industries) else "background"
        print("#"*20, topic_name, "#"*20)
        for prob,i in islice(sorted([(prob,i) for i,prob in enumerate(topic)], reverse=True), 20):
            print(f"{id2word[i]}:{prob} ", end="")
        print()
        print("#"*80)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [22]:
plot_topics(model)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

#################### Nonferrous wiredrawing and insulating ####################
cable:0.05237458999735617 cabl:0.04374818693896037 wire:0.03758647046868936 copper:0.02464686588110878 electr:0.017203470007451236 power:0.016363493730283226 ltd:0.015112788032487987 optic:0.014171947875326071 conductor:0.012939604587582112 aluminum:0.012939604587582067 product:0.012909566732954663 china:0.012491276487331658 manufactur:0.009174494681765188 telecommun:0.008626403058263744 pvc:0.008010231411360355 fiber:0.008010231409617426 material:0.006727384651042115 co:0.006546063597341229 network:0.006378495160594028 sell:0.0063693490445886295 
################################################################################
#################### Water transportation ####################
vessel:0.021300945336887693 ship:0.017179277708899394 oper:0.015113312171671604 transport:0.014390287384090133 port:0.013088531082384574 service:0.012531933586410705 contain:0.010233840468350564 cargo:0.00975224005413053 s