# Using Spark from a Local Machine

In [None]:
# Install PySpark pinned to 2.4.3 from the conda-forge channel; -y answers the confirmation prompt.
!conda install -c conda-forge -y pyspark==2.4.3

In [7]:
def get_aws_credentials():
    """
    Look up an AWS access key pair for the local Spark session.

    The credentials file (``$HOME/.aws/credentials``) is consulted first. If it
    does not provide both keys, the ``AWS_ACCESS_KEY_ID`` and
    ``AWS_SECRET_ACCESS_KEY`` environment variables are used instead. The
    function only reads; it never modifies the environment.

    Profile headers such as ``[default]`` are ignored: every ``key = value``
    line is collected into a single mapping, so when the file holds several
    profiles the last occurrence of a key wins.

    Returns:
        tuple: ``(aws_access_key_id, aws_secret_access_key)``, or
        ``(None, None)`` when no complete pair could be found.
    """
    # Imported locally so the function does not depend on another cell having
    # imported ``os`` first.
    import os

    # HOME may be unset (e.g. on Windows); fall back to the platform lookup
    # instead of failing on ``None + str``.
    home_dir = os.getenv("HOME") or os.path.expanduser("~")
    credentials_file = os.path.join(home_dir, ".aws", "credentials")

    config = {}
    try:
        with open(credentials_file, 'r') as f:
            for raw_line in f:
                line = raw_line.strip()
                # Skip blank lines, comments, profile headers and anything
                # that is not a key/value assignment.
                if not line or line[0] in "#;[" or "=" not in line:
                    continue
                # Split on the FIRST '=' only: values such as session tokens
                # may legitimately contain '=' characters themselves.
                key, _, value = line.partition("=")
                config[key.strip()] = value.strip()
    except FileNotFoundError:
        # No credentials file: fall through to the environment variables.
        pass

    if "aws_access_key_id" in config and "aws_secret_access_key" in config:
        return config["aws_access_key_id"], config["aws_secret_access_key"]
    elif "AWS_ACCESS_KEY_ID" in os.environ and "AWS_SECRET_ACCESS_KEY" in os.environ:
        return os.environ["AWS_ACCESS_KEY_ID"], os.environ["AWS_SECRET_ACCESS_KEY"]
    return None, None

In [8]:
import os
from pyspark.sql import SparkSession

# Set Spark Home. setdefault keeps an already-exported SPARK_HOME, so the
# machine-specific path below only acts as a fallback.
os.environ.setdefault("SPARK_HOME", "/Users/hakan.ilter/dev/spark-2.4.3-bin-hadoop2.7")

# Get AWS credentials; either value is None when no key pair was found.
aws_access_key_id, aws_secret_access_key = get_aws_credentials()

# Configure a local Spark session that routes both s3:// and s3a:// URIs
# through the S3A filesystem implementation.
builder = (
    SparkSession.builder
    .master("local[*]")
    .appName("SparkTest")
    .config("spark.sql.warehouse.dir", "/tmp/spark-warehouse")
    .config("spark.hadoop.fs.AbstractFileSystem.s3.impl", "org.apache.hadoop.fs.s3a.S3A")
    .config("spark.hadoop.fs.AbstractFileSystem.s3a.impl", "org.apache.hadoop.fs.s3a.S3A")
    .config("spark.hadoop.fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.driver.memory", "2g")
)

# Only pass explicit keys when a pair was actually found. Passing None would
# not leave the option unset: PySpark 2.4 stores str(value), so the literal
# string "None" would be used as the access key / secret.
if aws_access_key_id is not None and aws_secret_access_key is not None:
    builder = (
        builder
        .config("spark.hadoop.fs.s3.access.key", aws_access_key_id)
        .config("spark.hadoop.fs.s3.secret.key", aws_secret_access_key)
        .config("spark.hadoop.fs.s3a.access.key", aws_access_key_id)
        .config("spark.hadoop.fs.s3a.secret.key", aws_secret_access_key)
    )

# Create Spark Session (or reuse the one already running in this kernel)
spark = builder.getOrCreate()
spark

In [10]:
# Read the JSON data stored under the S3 prefix below, register it as the
# "datafox" temporary view for Spark SQL, and print its schema.
datafox_path = "s3://ai-data-lake-dev-eu-west-1/staging/datafox/company/t=1563188581"
df = spark.read.json(datafox_path)
df.createOrReplaceTempView("datafox")
df.printSchema()

root
 |-- icon_url: string (nullable = true)
 |-- id: string (nullable = true)
 |-- last_modified_date: string (nullable = true)
 |-- location: string (nullable = true)
 |-- name: string (nullable = true)
 |-- object_type: string (nullable = true)
 |-- slug: string (nullable = true)
 |-- url: string (nullable = true)



In [13]:
import pandas as pd

# Select the name and url columns from the "datafox" view and collect the
# result into a pandas DataFrame, which is then displayed as the cell output.
company_urls_sql = "SELECT name, url FROM datafox"
pdf = spark.sql(company_urls_sql).toPandas()
pdf

Unnamed: 0,name,url
0,Society of Payment Security Professionals (SPS...,paymentsecuritypros.com
1,ASP Media Ltd,aspmedialtd.com
2,Refrigerated Transporter,refrigeratedtransporter.com
3,WCN Group,wcngroup.com
4,Woodman Asset Management,woodman.ch
5,YourOnlineTVchannel.com,youronlinetvchannel.com
6,Kumatech Inc.,kumatech.ca
7,CASE,casebio.com
8,Ravica,ravica.com
9,IT Structures Ltd.,itstructures.com


In [15]:
# Write the name/url table to disk without the index column.
# Note: fields are tab-separated even though the file extension is .csv.
pdf.to_csv(
    "/tmp/company-urls.csv",
    sep="\t",
    index=False,
)