In [1]:
%%configure -f
{"executorCores": 4, "executorMemory": "32768M", "conf": {"spark.default.parallelism": 1000}}

In [2]:
import pandas as pd

import math
import gensim
import random
from gensim.utils import tokenize
from gensim.parsing.preprocessing import remove_stopwords
import string
import spacy
from langdetect import detect
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, IntegerType, ArrayType, DoubleType, MapType, BooleanType
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover, CountVectorizer
from pyspark.sql.window import Window

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
0,application_1568887420903_0001,pyspark,idle,,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
# Read the denormalized company table. No format is passed, so Spark falls back to
# its configured default data source. Then list the available columns.
COMPANY_DENORMALIZED_PATH = "s3://ai-data-lake-dev-eu-west-1/business/capiq/company_denormalized"
df = spark.read.load(path=COMPANY_DENORMALIZED_PATH)
df.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- one_id: string (nullable = true)
 |-- company_id: integer (nullable = true)
 |-- company_name: string (nullable = true)
 |-- street_address: string (nullable = true)
 |-- zip_code: string (nullable = true)
 |-- city: string (nullable = true)
 |-- country: string (nullable = true)
 |-- region: string (nullable = true)
 |-- region_code: string (nullable = true)
 |-- company_type: string (nullable = true)
 |-- number_of_employees: integer (nullable = true)
 |-- latest_revenue: double (nullable = true)
 |-- latest_revenue_growth: double (nullable = true)
 |-- latest_ebitda: double (nullable = true)
 |-- latest_ebitda_margin: double (nullable = true)
 |-- competitor_ids: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- simple_industry_id: string (nullable = true)
 |-- simple_industry: string (nullable = true)
 |-- sic_code: string (nullable = true)
 |-- sic_code_desc: string (nullable = true)
 |-- naics_code: string (nullable = true)
 |-- naics_code_des

In [4]:
# Load the small English spaCy pipeline used by the text-cleaning helpers below.
# Named-entity recognition is switched off: the visible cells only use sentence
# boundaries, stop-word/punctuation flags and lemmas, so running NER on every
# description would be wasted work. NOTE(review): re-enable it if any other code
# reads `doc.ents`.
nlp = spacy.load("en_core_web_sm", disable=["ner"])

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
def filter_words(sent, company_name):
    """Yield the lower-cased lemmas of the content words in one sentence.

    When ``company_name`` occurs in the sentence text, every occurrence is
    removed and the remaining text is re-parsed with the module-level ``nlp``
    pipeline, so the tokens of the name itself are never emitted.

    Args:
        sent: A spaCy ``Span`` or ``Doc`` (any object exposing ``.text`` and
            iterating over tokens).
        company_name: Name to strip from the sentence. ``None`` or an empty
            string means there is nothing to strip.

    Yields:
        ``word.lemma_.lower()`` for each token that is alphabetic, longer than
        one character, and neither a stop word nor punctuation.
    """
    # Use `.text` rather than `.string`: the latter is a deprecated alias that
    # spaCy v3 removed.
    text = sent.text

    # `company_name` comes from a nullable column; `None in text` would raise
    # TypeError, and an empty name would trigger a re-parse that changes nothing.
    if company_name and company_name in text:
        sent = nlp(text.replace(company_name, ""))

    for word in sent:
        if word.is_stop or word.is_punct or not word.is_alpha or len(word.text) <= 1:
            continue
        yield word.lemma_.lower()


def process_text(text, company_name):
    """Reduce a company description to a space-separated string of lemmas.

    The description is parsed with the module-level ``nlp`` pipeline and
    handled sentence by sentence. Sentences containing any of the marker
    phrases below are skipped entirely (presumably because they describe
    history or location rather than the business - confirm with the author).
    The remaining sentences are passed to ``filter_words``.

    Args:
        text: Raw company description. ``None`` or an empty string produces an
            empty result instead of an error.
        company_name: Company name forwarded to ``filter_words`` so it can be
            stripped from each sentence.

    Returns:
        The kept lemmas joined by single spaces; ``""`` when nothing is kept.
    """
    # Null descriptions would otherwise crash inside `nlp(None)` and fail the
    # whole Spark job; there is nothing to extract from them anyway.
    if not text:
        return ""

    skip_markers = (
        "was founded in",
        "a subsidiary of",
        "was formerly known as",
        "is based in",
    )

    words = []
    for sent in nlp(text).sents:
        # `.text` instead of the deprecated `.string` (removed in spaCy v3).
        sent_text = sent.text
        if any(marker in sent_text for marker in skip_markers):
            continue
        words.extend(filter_words(sent, company_name))

    return " ".join(words)

# Expose process_text to Spark as a column function that returns a string.
process_text_udf = udf(f=process_text, returnType=StringType())

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [6]:
# Derive the cleaned description row by row from the raw description and the company name.
df = df.withColumn(
    "clean_company_description",
    process_text_udf(F.col("company_description"), F.col("company_name")),
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
# Preview the first 100 cleaned descriptions next to their company ids.
df.select(["company_id", "clean_company_description"]).show(n=100)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------+-------------------------+
|company_id|clean_company_description|
+----------+-------------------------+
| 130217190|     joint venture bei...|
| 130909560|     chemical preparat...|
| 131112350|                         |
| 131231870|     biological produc...|
| 131258800|                         |
| 131341840|                         |
| 131342850|     electronic compon...|
| 131344640|     household applian...|
| 532079910|                         |
| 532259970|     private company e...|
| 303150120|                         |
|    908900|     acquire growth en...|
| 289482650|                         |
| 289498400|                         |
| 289539770|                         |
| 289632200|                         |
| 289635230|                         |
| 289676210|                         |
| 289677840|                         |
| 289678460|                         |
| 432741600|                         |
| 432780330|                         |
| 432781500|             

In [8]:
# Persist the full table, including the new clean_company_description column,
# as Parquet, replacing whatever an earlier run left at this location.
df.write.mode("overwrite").parquet(
    "s3://ai-data-lake-dev-eu-west-1/staging/peer/company_denormalized_extra_field"
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
# Read back the staged table that carries the cleaned description, then list its columns.
COMPANY_EXTRA_FIELD_PATH = "s3://ai-data-lake-dev-eu-west-1/staging/peer/company_denormalized_extra_field"
df = spark.read.load(path=COMPANY_EXTRA_FIELD_PATH)
df.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- one_id: string (nullable = true)
 |-- company_id: integer (nullable = true)
 |-- company_name: string (nullable = true)
 |-- street_address: string (nullable = true)
 |-- zip_code: string (nullable = true)
 |-- city: string (nullable = true)
 |-- country: string (nullable = true)
 |-- region: string (nullable = true)
 |-- region_code: string (nullable = true)
 |-- company_type: string (nullable = true)
 |-- number_of_employees: integer (nullable = true)
 |-- latest_revenue: double (nullable = true)
 |-- latest_revenue_growth: double (nullable = true)
 |-- latest_ebitda: double (nullable = true)
 |-- latest_ebitda_margin: double (nullable = true)
 |-- competitor_ids: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- simple_industry_id: string (nullable = true)
 |-- simple_industry: string (nullable = true)
 |-- sic_code: string (nullable = true)
 |-- sic_code_desc: string (nullable = true)
 |-- naics_code: string (nullable = true)
 |-- naics_code_des

In [6]:
# Narrow the table to the company key and its cleaned description.
df = df.select(["company_id", "clean_company_description"])

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
# Persist the two-column (company_id, clean_company_description) table as Parquet,
# replacing whatever an earlier run left at this location.
df.write.mode("overwrite").parquet(
    "s3://ai-data-lake-dev-eu-west-1/staging/peer/company_clean_description"
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [8]:
# Print the first 100 rows of the slim table as a sanity check.
df.show(n=100)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------+-------------------------+
|company_id|clean_company_description|
+----------+-------------------------+
| 130217190|     joint venture bei...|
| 130909560|     chemical preparat...|
| 131112350|                         |
| 131231870|     biological produc...|
| 131258800|                         |
| 131341840|                         |
| 131342850|     electronic compon...|
| 131344640|     household applian...|
| 532079910|                         |
| 532259970|     private company e...|
| 303150120|                         |
|    908900|     acquire growth en...|
| 289482650|                         |
| 289498400|                         |
| 289539770|                         |
| 289632200|                         |
| 289635230|                         |
| 289676210|                         |
| 289677840|                         |
| 289678460|                         |
| 432741600|                         |
| 432780330|                         |
| 432781500|             