In [1]:
%%configure -f
{"driverMemory": "47696M", 
 "executorCores": 4,
 "executorMemory": "47696M",
 "conf": {"spark.default.parallelism": 1000,
          "spark.driver.maxResultSize": "23848M",
          "spark.sql.shuffle.partitions": 1000
         }
}

In [2]:
# S3 prefix under which the matrix X, the factor H and the pickled model are written.
model_path = "s3://onai-ml-dev-eu-west-1/company2vec/model/nmf"
# Not referenced by any cell visible in this part of the notebook.
word2id_path = "s3://onai-ml-dev-eu-west-1/company2vec/common"
# S3 prefix holding the raw and the joined company feature tables.
data_path = "s3://onai-ml-dev-eu-west-1/company2vec/data_nmf"

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
0,application_1567070972036_0001,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
import pickle
import string
from itertools import islice

import numpy as np
import scipy
import smart_open
from gensim.utils import tokenize
from langdetect import detect
from scipy.sparse import csr_matrix
from sklearn.decomposition import NMF

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T

from pyspark.mllib.linalg import DenseMatrix
from pyspark.mllib.linalg.distributed import RowMatrix
from pyspark.mllib.linalg import Vector as MLLibVector, Vectors as MLLibVectors

from pyspark.ml.linalg import Vectors, VectorUDT

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### Train an NMF model on the (sparse) company feature matrix

In [4]:
# Company reference table; it is joined to the feature rows on company_id further down.
companies_data = spark.read.load("s3://ai-data-lake-dev-eu-west-1/business/capiq/company_denormalized")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
# Collect to the driver every company_id that appears on more than one row.
duplicated_ids = (
    companies_data
    .groupBy("company_id")
    .count()
    .filter(F.col("count") > 1)
)
dups = [row.company_id for row in duplicated_ids.collect()]

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [6]:
# Remove every company_id listed in `dups` — all copies are dropped, none is kept.
companies_data = companies_data.filter(~F.col("company_id").isin(dups))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
# Maximum number of characters kept from a text column.
max_desc = 1000


def _cap_description(desc):
    """Return the first ``max_desc`` characters of ``desc``; ``None`` is passed through."""
    # Null column values reach a Python UDF as None, and slicing None raises
    # TypeError, which would fail the whole Spark job.
    if desc is None:
        return None
    return desc[:max_desc]


cap_description_udf = F.udf(_cap_description)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [8]:
# Attach company attributes to the raw feature rows (join on company_id) and
# keep only the columns used downstream. The name and description columns are
# passed through cap_description_udf.
raw_company_features = spark.read.load(f"{data_path}/raw_company_features")
features_with_company = raw_company_features.join(companies_data, on=["company_id"])
x_joined = features_with_company.select(
    "company_id",
    cap_description_udf("company_name").alias("company_name"),
    cap_description_udf("company_description").alias("company_description"),
    "country",
    "sic_code",
    "latest_revenue",
    "size",
    "feature_indices",
    "feature_values",
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [9]:
# Spot-check one company; the second argument (truncate=False) prints full column contents.
x_joined.filter("company_id == '704634'").show(20, False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------+-------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [9]:
# Write the joined table to parquet, replacing any previous output at this path.
x_joined.write.parquet(f"{data_path}/company_features_joined", mode="overwrite")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [10]:
# Re-read the persisted join so the following cells work from the stored copy.
x_joined = spark.read.load(f"{data_path}/company_features_joined")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [11]:
def zipped_data(x_rdd):
    """Collect company rows to the driver and flatten them into COO triplets.

    Parameters
    ----------
    x_rdd
        Any object whose ``collect()`` returns rows exposing the attributes
        ``company_id``, ``company_name``, ``company_description``,
        ``country``, ``sic_code``, ``latest_revenue``, ``size``,
        ``feature_indices`` and ``feature_values``.

    Returns
    -------
    tuple
        ``(metadata, (n_rows, n_cols), (row_ind, col_ind, data))`` where

        * ``metadata`` is a list with one
          ``(company_id, company_name, company_description, country,
          sic_code, latest_revenue)`` tuple per collected row;
        * ``n_rows`` is the number of collected rows and ``n_cols`` is the
          ``size`` attribute of the first row (``None`` if there are no rows);
        * ``row_ind``, ``col_ind`` and ``data`` are equally long tuples with
          one entry per ``(feature_indices, feature_values)`` pair, where the
          row index is the position of the row in the collected result.

    Unlike unpacking ``zip(*triplets)``, this also works when the input is
    empty or when no row carries any feature: the three tuples are then empty.
    """
    metadata = []
    row_ind = []
    col_ind = []
    data = []
    n_cols = None

    for i, el in enumerate(x_rdd.collect()):
        metadata.append((el.company_id,
                         el.company_name,
                         el.company_description,
                         el.country,
                         el.sic_code,
                         el.latest_revenue
                        ))
        # The column count is taken from the first row only.
        if n_cols is None:
            n_cols = el.size
        # zip() stops at the shorter of the two sequences.
        for j, v in zip(el.feature_indices, el.feature_values):
            row_ind.append(i)
            col_ind.append(j)
            data.append(v)

    n_rows = len(metadata)

    # Tuples are returned to keep the previous return shape.
    return metadata, (n_rows, n_cols), (tuple(row_ind), tuple(col_ind), tuple(data))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [12]:
# Pull all rows to the driver: per-company metadata, matrix shape and COO triplets.
metadata, shape, (row_ind, col_ind, data) = zipped_data(x_joined)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [13]:
# Assemble the sparse company-by-feature matrix from the COO triplets.
X = csr_matrix((data, (row_ind, col_ind)), shape=shape)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [14]:
# Sanity check: (number of collected company rows, `size` of the feature space).
X.shape

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

(3381364, 44014)

In [15]:
# One row per company, columns in the order emitted by zipped_data.
# dtype=object is set explicitly because the columns mix ids, text and numbers.
# Left to inference, NumPy falls back to a fixed-width string dtype whenever no
# None is present, which would turn company_id into text and break the integer
# lookups done on this array later in the notebook.
X_metadata = np.array(
    [list(row) for row in metadata],
    dtype=object,
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [16]:
# np.save keeps its default allow_pickle=True here; the reload of this file
# further down passes allow_pickle=True as well.
with smart_open.open(f"{model_path}/X/company_info.np", "wb") as f:
    np.save(f, X_metadata)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [17]:
# Persist the sparse matrix with SciPy's npz writer.
with smart_open.open(f"{model_path}/X/X.npz", "wb") as f:
    scipy.sparse.save_npz(f, X)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [19]:
# NOTE: allow_pickle=True unpickles arbitrary objects from the file — only
# load artefacts that this pipeline wrote itself.
with smart_open.open(f"{model_path}/X/company_info.np", "rb") as f:
    X_metadata = np.load(f, allow_pickle=True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [20]:
# Reload the sparse matrix written by the save_npz cell.
with smart_open.open(f"{model_path}/X/X.npz", "rb") as f:
    X = scipy.sparse.load_npz(f)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [18]:
# Factorise X into non-negative factors with 200 components using the
# multiplicative-update solver. random_state is fixed so the initialisation
# is repeatable; verbose=True prints the per-epoch error.
# X_nmf is the transformed data returned by fit_transform (one row per row of X).
nmf = NMF(n_components=200, random_state=42, solver='mu', verbose=True)
X_nmf = nmf.fit_transform(X)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Epoch 10 reached after 342.465 seconds, error: 32937.629017
Epoch 20 reached after 668.018 seconds, error: 32717.593995
Epoch 30 reached after 996.334 seconds, error: 32668.686055
Epoch 40 reached after 1328.757 seconds, error: 32648.087715
Epoch 50 reached after 1659.111 seconds, error: 32636.533425
Epoch 60 reached after 1988.200 seconds, error: 32631.406401
Epoch 70 reached after 2322.251 seconds, error: 32628.843176

In [19]:
# Components matrix of the fitted model.
H = nmf.components_

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [20]:
# Sanity check: expect (n_components, number of feature columns of X).
H.shape

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

(200, 44014)

In [21]:
# Save the components matrix; pickling is disabled for this write.
with smart_open.open(f"{model_path}/H/mat.np", "wb") as f:
    np.save(f, H, allow_pickle=False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [22]:
# Pickle the whole fitted estimator so it can be reloaded without refitting.
with smart_open.open(f"{model_path}/nmf.pkl", "wb") as f:
    pickle.dump(nmf, f)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [23]:
# NOTE: pickle.load can execute code embedded in the file — only use it on
# artefacts written by this pipeline.
with smart_open.open(f"{model_path}/nmf.pkl", "rb") as f:
    nmf = pickle.load(f)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [24]:
# Reload the components matrix written by the np.save cell.
with smart_open.open(f"{model_path}/H/mat.np", "rb") as f:
    H = np.load(f)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### Now that we have the NMF model, project the data and write associated descriptions/metadata

In [25]:
# Map each company_id (first metadata column) to its row position in X_metadata.
company2id_ini = {
    record[0]: position
    for position, record in enumerate(X_metadata)
}

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [26]:
# Equals the number of metadata rows only if every company_id is unique.
len(company2id_ini)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

3381364

In [27]:
# company_id used for the spot-checks in the following cells.
query_company = 291847304

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [28]:
# Row position of the query company in X_metadata.
company2id_ini[query_company]

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

2048758

In [29]:
# Projected row(s) for the query company, selected with a boolean mask over the id column.
X_nmf[X_metadata[:,0] == query_company]

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

array([[1.21688556e-01, 8.05209197e-08, 9.22363104e-02, 2.47182696e-07,
        0.00000000e+00, 1.55517251e-07, 8.41509921e-03, 2.39071801e-03,
        2.93587960e-02, 1.75798378e-04, 1.29339800e-09, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 2.79255853e-12,
        6.27086368e-03, 1.09998871e-02, 5.80987406e-07, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        3.57819520e-10, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        2.55288775e-03, 1.59145269e-02, 6.40050993e-02, 4.26949608e-02,
        0.00000000e+00, 7.84199214e-03, 0.00000000e+00, 1.28654666e-31,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 2.06898592e-05, 0.00000000e+00, 2.57394036e-02,
        0.00000000e+00, 0.00000000e+00, 7.57106428e-32, 0.00000000e+00,
        0.00000000e+00, 1.32675585e-03, 1.96460749e-03, 2.74065657e-03,
        8.75359079e-03, 1.13212849e-03, 1.44550826e-37, 1.675012

In [30]:
# Convert the projection to float32 before it is written out.
X_nmf = X_nmf.astype(np.float32)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [31]:
# Save the full projected matrix; pickling is disabled for this write.
with smart_open.open(f"{model_path}/X/X_nmf.np", "wb") as f:
    np.save(f, X_nmf, allow_pickle=False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [32]:
# Save only the first 10,000 rows. The key differs from the full matrix's key
# solely by the lowercase "x" in the file name.
with smart_open.open(f"{model_path}/X/x_nmf.np", "wb") as f:
    np.save(f, X_nmf[:10000], allow_pickle=False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…