In [28]:
import pyspark as ps
import pandas as pd
import boto3

In [29]:
from pyspark.sql.types import BooleanType
from pyspark.sql.functions import udf
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.ml.feature import NGram,Tokenizer
from pyspark.sql.functions import col

In [30]:
spark = SparkSession.builder.getOrCreate()

In [31]:
s3 = boto3.client('s3')
bucket = "blossom-data-engs"

s3.download_file(bucket,"companies.csv",'companies.csv')
s3.download_file(bucket,"alldata.csv",'alldata.csv')

In [32]:
companies = spark.read.csv('companies.csv',header=True, inferSchema=True, multiLine = True)
alldata = spark.read.csv('alldata.csv',header=True, inferSchema=True, multiLine = True)

In [33]:
alldata = alldata.withColumnRenamed('description','descriptions')
companies = companies.withColumnRenamed('company name','company_name')

In [34]:
alldata.columns

['position', 'company', 'descriptions', 'reviews', 'location\r']

In [35]:
companies.columns

['ticker',
 'company_name',
 'short name',
 'industry',
 'description',
 'website',
 'logo',
 'ceo',
 'exchange',
 'market cap',
 'sector',
 'tag 1',
 'tag 2',
 'tag 3\r']

In [36]:
alldata = alldata.alias('alldata')
companies = companies.alias('companies')

In [37]:
ds =  companies.join(alldata, companies.company_name == alldata.company, 'inner')

In [38]:
ds.select('company_name','company').show()

+--------------------+--------------------+
|        company_name|             company|
+--------------------+--------------------+
|   Cubic Corporation|   Cubic Corporation|
| The Hershey Company| The Hershey Company|
| Abbott Laboratories| Abbott Laboratories|
| Centene Corporation| Centene Corporation|
|           eBay Inc.|           eBay Inc.|
|        Trimble Inc.|        Trimble Inc.|
|   Cabot Corporation|   Cabot Corporation|
| Celgene Corporation| Celgene Corporation|
|               AECOM|               AECOM|
|  Kemper Corporation|  Kemper Corporation|
|Discover Financia...|Discover Financia...|
|               AECOM|               AECOM|
|          TransUnion|          TransUnion|
|       Wipro Limited|       Wipro Limited|
| Synchrony Financial| Synchrony Financial|
|          TransUnion|          TransUnion|
|Discover Financia...|Discover Financia...|
|          TransUnion|          TransUnion|
|Vanda Pharmaceuti...|Vanda Pharmaceuti...|
|Vanda Pharmaceuti...|Vanda Phar

In [39]:
num = input('Press 1 for unigram and 2 for bigram   ')

Press 1 for unigram and 2 for bigram   1


In [40]:
def ngram(num,ds,column):
    tokens = Tokenizer(inputCol = column, outputCol = 'tokens')
    dt = tokens.transform(ds) 
    
    ngrams = NGram(n=num, inputCol = 'tokens', outputCol = 'ngrams')
    df = ngrams.transform(dt)
    
    return df

In [42]:
ngram(1,ds,'descriptions').select('ngrams').show(20)

+--------------------+
|              ngrams|
+--------------------+
|[job, summary:, ,...|
|[job, title:, foo...|
|[at, abbott,, we'...|
|[provide, vision,...|
|[at, ebay,, you, ...|
|[data, scientist,...|
|[position, summar...|
|[other, locations...|
|[aecom, is, activ...|
|[position, summar...|
|[as, a, master, d...|
|[aecom, is, seeki...|
|[–, mohit, kapoor...|
|[he/she, will, le...|
|[job, description...|
|[what, we’ll, bri...|
|[as, a, lead, dat...|
|[–, mohit, kapoor...|
|["position, descr...|
|[responsibilities...|
+--------------------+
only showing top 20 rows



In [43]:
def outputs(df):
    industry = df.select(['ngrams', 'industry']).limit(3)\
        .select('industry', F.explode('ngrams').alias('ngrams')).groupby(['ngrams','industry']).count()
    city = df.select(['ngrams', 'location']).limit(3)\
        .select('location', F.explode('ngrams').alias('ngrams')).groupby(['ngrams','location']).count()
    city = city.withColumnRenamed('location','city')
    return industry,city