In [1]:
%%configure -f
{"driverMemory": "47696M", 
 "executorCores": 4,
 "executorMemory": "47696M",
 "conf": {"spark.default.parallelism": 1000,
          "spark.driver.maxResultSize": "23848M",
          "spark.sql.shuffle.partitions": 1000
         }
}

In [2]:
# S3 locations used by this notebook.
# model_path: where the SVD artifacts and the exported matrices are written.
model_path = "s3://onai-ml-dev-eu-west-1/company2vec/model/svd"
# word2id_path: shared "common" prefix of the company2vec project.
word2id_path = "s3://onai-ml-dev-eu-west-1/company2vec/common"
# data_path: prefix holding the raw and joined company feature tables.
data_path = "s3://onai-ml-dev-eu-west-1/company2vec/data"

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
14,application_1566985851061_0015,pyspark,idle,,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
from gensim.utils import tokenize
from langdetect import detect
import string
from itertools import islice
import numpy as np
from scipy.sparse import csr_matrix
import smart_open
import scipy

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T

from pyspark.mllib.linalg import DenseMatrix
from pyspark.mllib.linalg.distributed import RowMatrix
from pyspark.mllib.linalg import Vector as MLLibVector, Vectors as MLLibVectors

from pyspark.ml.linalg import Vectors, VectorUDT

import pickle

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### Train a Truncated SVD on the (sparse) company feature matrix

In [4]:
# Load the denormalized company table from the data lake.
# No format is passed to load(), so Spark's configured default data source is used.
companies_data = spark.read.load("s3://ai-data-lake-dev-eu-west-1/business/capiq/company_denormalized")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
# Find company ids that appear on more than one row of the company table.
# The per-id row counts are computed on the cluster; only the offending ids
# are brought back to the driver as a plain Python list.
duplicate_counts = (
    companies_data
    .groupBy("company_id")
    .count()                      # yields a column named "count"
    .where(F.col("count") > 1)
)
dups = [record["company_id"] for record in duplicate_counts.collect()]

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [6]:
# Drop every row whose company_id is listed in `dups`: ids that occur more than
# once are removed entirely rather than deduplicated down to a single row.
companies_data = companies_data.filter(~F.col("company_id").isin(dups))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
# Maximum number of characters kept from a free-text column.
max_desc = 1000


def _cap_text(desc):
    """Return at most ``max_desc`` leading characters of ``desc``.

    Spark passes SQL NULLs to Python UDFs as ``None``. Slicing ``None`` raises
    ``TypeError`` inside the executor and fails the whole job, so nulls are
    passed through unchanged instead.
    """
    if desc is None:
        return None
    return desc[:max_desc]


# Column-level UDF that truncates a string column to ``max_desc`` characters.
# The return type is spelled out; StringType is also F.udf's default.
cap_description_udf = F.udf(_cap_text, T.StringType())

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [8]:
# Raw per-company sparse features (indices/values plus the vector size).
raw_company_features = spark.read.load(f"{data_path}/raw_company_features")

# Columns kept after the join; the two free-text columns are truncated with
# cap_description_udf and keep their original names.
joined_columns = [
    "company_id",
    cap_description_udf("company_name").alias("company_name"),
    cap_description_udf("company_description").alias("company_description"),
    "country",
    "sic_code",
    "latest_revenue",
    "size",
    "feature_indices",
    "feature_values",
]

# Join on company_id (join()'s default join type), so only companies present
# in both tables survive.
x_joined = (
    raw_company_features
    .join(companies_data, on=["company_id"])
    .select(*joined_columns)
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [9]:
# Spot-check a single company; show(20, False) prints up to 20 rows without
# truncating long cell values.
x_joined.filter("company_id == '704634'").show(20, False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------+-------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [10]:
# Persist the joined features as parquet, replacing any previous output at this path.
x_joined.write.parquet(f"{data_path}/company_features_joined", mode="overwrite")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [11]:
# Re-bind x_joined to the snapshot that was just written, so later cells read
# the stored table rather than re-evaluating the join.
x_joined = spark.read.load(f"{data_path}/company_features_joined")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [12]:
x_joined.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- company_id: integer (nullable = true)
 |-- company_name: string (nullable = true)
 |-- company_description: string (nullable = true)
 |-- country: string (nullable = true)
 |-- sic_code: string (nullable = true)
 |-- latest_revenue: double (nullable = true)
 |-- size: integer (nullable = true)
 |-- feature_indices: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- feature_values: array (nullable = true)
 |    |-- element: double (containsNull = true)

In [13]:
def zipped_data(x_rdd):
    """Collect company rows to the driver and split them into metadata and sparse triplets.

    Parameters
    ----------
    x_rdd : object exposing ``collect()``
        The notebook passes a Spark DataFrame. Every collected row must provide
        the attributes ``company_id``, ``company_name``, ``company_description``,
        ``country``, ``sic_code``, ``latest_revenue``, ``size``,
        ``feature_indices`` and ``feature_values``.

    Returns
    -------
    metadata : list of tuple
        ``(company_id, company_name, company_description, country, sic_code,
        latest_revenue)`` for each row, in collection order.
    shape : tuple
        ``(n_rows, n_cols)``. ``n_cols`` is the first non-``None`` ``size``
        seen, or ``None`` when the input has no rows.
    triplets : tuple of three tuples
        ``(row_ind, col_ind, data)``: parallel sequences with one entry per
        stored feature value, where ``row_ind`` is the position of the row in
        ``metadata``.

    Notes
    -----
    Inputs with no feature values at all (no rows, or only rows with empty
    feature lists) yield three empty tuples instead of raising ``ValueError``
    while unpacking.
    """
    metadata = []
    row_ind, col_ind, data = [], [], []
    n_cols = None

    for i, el in enumerate(x_rdd.collect()):
        metadata.append((el.company_id,
                         el.company_name,
                         el.company_description,
                         el.country,
                         el.sic_code,
                         el.latest_revenue
                        ))
        # The matrix width is taken from the first row that reports a size.
        if n_cols is None:
            n_cols = el.size
        # Append straight into the three output columns; zip() stops at the
        # shorter of the two arrays.
        for j, v in zip(el.feature_indices, el.feature_values):
            row_ind.append(i)
            col_ind.append(j)
            data.append(v)

    n_rows = len(metadata)

    # Tuples keep the return types callers have always received.
    return metadata, (n_rows, n_cols), (tuple(row_ind), tuple(col_ind), tuple(data))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [14]:
# Pull everything to the driver: per-company metadata, the matrix shape and
# the (row, col, value) triplets of the sparse feature matrix.
metadata, shape, (row_ind, col_ind, data) = zipped_data(x_joined)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [15]:
# Assemble the sparse company-by-feature matrix in CSR format from the triplets.
X = csr_matrix((data, (row_ind, col_ind)), shape=shape)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [16]:
X.shape

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

(3381364, 44014)

In [17]:
# Pack the per-company metadata tuples into a 2-D array (one row per company,
# one column per metadata field).
# dtype=object is set explicitly: the fields mix ints, strings, floats and
# possibly None. With an inferred dtype numpy only picks object when a None
# happens to be present; otherwise it coerces every field, ids included, to a
# fixed-width unicode string, which breaks integer id comparisons and inflates
# memory.
X_metadata = np.array(metadata, dtype=object)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [18]:
# Upload the metadata array next to the model artifacts.
# NOTE(review): allow_pickle is left at np.save's default, so an object-dtype
# array is stored via pickle — confirm readers load it with allow_pickle=True.
with smart_open.open(f"{model_path}/X/company_info.np", "wb") as f:
    np.save(f, X_metadata)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [19]:
# Upload the sparse feature matrix in scipy's .npz format.
with smart_open.open(f"{model_path}/X/X.npz", "wb") as f:
    scipy.sparse.save_npz(f, X)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [20]:
from sklearn.decomposition import TruncatedSVD

# Hyper-parameters of the decomposition: 200 components, 20 iterations of the
# solver, and a fixed random_state so repeated fits are reproducible.
svd_params = {"n_components": 200, "n_iter": 20, "random_state": 42}

svd = TruncatedSVD(**svd_params)
# fit() is left as the last expression so the cell displays the estimator repr.
svd.fit(X)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

TruncatedSVD(algorithm='randomized', n_components=200, n_iter=20,
             random_state=42, tol=0.0)

In [21]:
# Fitted SVD components, laid out as (n_components, n_features).
V = svd.components_

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [22]:
V.shape

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

(200, 44014)

In [23]:
# Upload the component matrix; allow_pickle=False makes np.save refuse object
# arrays, so the file is guaranteed to be a plain numeric payload.
with smart_open.open(f"{model_path}/V/mat.np", "wb") as f:
    np.save(f, V, allow_pickle=False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [24]:
# Persist the whole fitted estimator with pickle.
with smart_open.open(f"{model_path}/truncated_svd.pkl", "wb") as f:
    pickle.dump(svd, f)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [25]:
# Reload the estimator written by the previous cell.
# NOTE: pickle.load can execute arbitrary code embedded in the file — only
# load from a bucket whose contents are trusted.
with smart_open.open(f"{model_path}/truncated_svd.pkl", "rb") as f:
    svd = pickle.load(f)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [26]:
# Reload the component matrix from S3 (re-binds V to the stored copy).
with smart_open.open(f"{model_path}/V/mat.np", "rb") as f:
    V = np.load(f)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### Now that we have the SVD, project the data and write the associated descriptions/metadata

In [27]:
# Project every company onto the SVD components:
# (n_companies, n_features) x (n_features, n_components) -> (n_companies, n_components).
X_pca = X.dot(V.T)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [28]:
# Map each company_id (first field of a metadata tuple) to its row position,
# which is also its row index in X and X_pca. If an id repeats, the last
# position wins.
company2id_ini = {record[0]: position for position, record in enumerate(metadata)}

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [29]:
len(company2id_ini)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

3381364

In [30]:
query_company = 291847304

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [31]:
company2id_ini[query_company]

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

2048758

In [32]:
# Fetch the projected vector(s) of query_company with a boolean mask over the
# id column (column 0) of X_metadata.
# NOTE(review): this only matches if the ids in X_metadata are stored as
# integers, not strings — confirm the array dtype.
X_pca[X_metadata[:,0] == query_company]

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

array([[ 6.99342476e+00,  1.23361207e-01,  7.55503175e+00,
        -1.47180055e+00, -1.66754650e-01, -4.78843013e-01,
         1.27392257e-01,  3.82081615e-01,  1.49364915e+00,
        -6.78287935e-01, -1.09563525e+00,  1.79687146e-02,
        -1.93578629e+00, -6.28666184e-01, -4.77632849e-01,
         2.21428934e+00,  1.75651998e+00,  1.35866353e+00,
         1.50278603e+00, -6.95638706e-01,  1.35153342e-02,
        -1.15403851e-01, -6.03382160e-01,  1.60183587e-01,
         1.47578413e-01,  5.97508492e-01, -1.11147658e-02,
        -6.06817081e-01,  1.16312617e+00, -2.51390059e-01,
         1.57477443e-01, -1.18196771e+00,  1.09097164e+00,
         3.30627755e+00,  6.91424056e-01, -2.11149030e+00,
         5.91365602e-02, -8.14749769e-01, -7.79311408e-01,
        -2.09223824e+00, -4.07138793e-01,  8.76569199e-01,
        -6.35536759e-01, -2.72670741e-01,  1.53076927e+00,
        -2.05945185e-01, -1.38593745e+00, -5.67628682e-01,
         1.05902422e+00, -1.36174678e+00, -1.08607428e+0

In [33]:
# Cast the projection to float32 before saving (presumably from float64, to
# shrink the stored file — TODO confirm the input dtype).
X_pca = X_pca.astype(np.float32)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [34]:
# Upload the full projected matrix as a plain numeric payload (pickling disabled).
with smart_open.open(f"{model_path}/X/X_pca.np", "wb") as f:
    np.save(f, X_pca, allow_pickle=False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [35]:
# Upload only the first 10000 rows of the projection.
# NOTE(review): this key differs from the full matrix's key ("X/X_pca.np")
# only by letter case — confirm the near-identical names are intentional.
with smart_open.open(f"{model_path}/X/x_pca.np", "wb") as f:
    np.save(f, X_pca[:10000], allow_pickle=False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…