In [1]:
import os
import sys
# Host-specific locations of the JVM and the Spark distribution. Adjust these
# two paths when running the notebook on another machine.
os.environ["JAVA_HOME"] = "/usr/java/jdk1.8.0_161/jre"
os.environ["SPARK_HOME"] = "/home/ec2-user/spark-2.4.4-bin-hadoop2.7"

# PYSPARK_PYTHON must name a Python interpreter executable (Spark launches it
# for Python workers), not the Spark "python" source directory. Using the
# interpreter that runs this kernel keeps driver and workers consistent.
os.environ["PYSPARK_PYTHON"] = sys.executable

# Directory holding the pyspark / py4j archives bundled with Spark.
os.environ["PYLIB"] = os.environ["SPARK_HOME"] + "/python/lib"

# Put the bundled archives at the front of the import path (pyspark.zip ends up
# first, py4j second). The membership check keeps the cell idempotent so that
# re-running it does not pile up duplicate sys.path entries.
for _archive in ("py4j-0.10.7-src.zip", "pyspark.zip"):
    _archive_path = os.environ["PYLIB"] + "/" + _archive
    if _archive_path not in sys.path:
        sys.path.insert(0, _archive_path)

## Create a new Spark session

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Reuse the active session if one exists; otherwise start one named "SparkSQL".
spark = (
    SparkSession.builder
    .appName("SparkSQL")
    .getOrCreate()
)

## Read data

In [5]:
# Load the investments CSV: the first line supplies the column names and
# Spark infers each column's type from the data.
data = spark.read.csv(
    'investments.csv',
    header=True,
    inferSchema=True,
)

In [6]:
# Preview the loaded frame (defaults spelled out: 20 rows, long cells truncated).
data.show(n=20, truncate=True)

+--------------------+------------------+--------------------+--------------------+--------------------+-------------------+---------+------------+----------+--------------------+-----------------+--------------+-------------------+-------------+---------------+------------+-------------------+-------------------+-------+-------+-------------------+-----------+----------------+--------------+-----+-----+--------------+---------------+-------------+----------------+--------------------+-------+-------+-------+-------+-------+-------+-------+-------+
|           permalink|              name|        homepage_url|       category_list|             market | funding_total_usd |   status|country_code|state_code|              region|             city|funding_rounds|         founded_at|founded_month|founded_quarter|founded_year|   first_funding_at|    last_funding_at|   seed|venture|equity_crowdfunding|undisclosed|convertible_note|debt_financing|angel|grant|private_equity|post_ipo_equity|post_ip

In [7]:
# List the column names. Note that ' market ' and ' funding_total_usd ' keep
# the surrounding spaces from the CSV header, so they must be referenced with
# that padding.
data.columns

['permalink',
 'name',
 'homepage_url',
 'category_list',
 ' market ',
 ' funding_total_usd ',
 'status',
 'country_code',
 'state_code',
 'region',
 'city',
 'funding_rounds',
 'founded_at',
 'founded_month',
 'founded_quarter',
 'founded_year',
 'first_funding_at',
 'last_funding_at',
 'seed',
 'venture',
 'equity_crowdfunding',
 'undisclosed',
 'convertible_note',
 'debt_financing',
 'angel',
 'grant',
 'private_equity',
 'post_ipo_equity',
 'post_ipo_debt',
 'secondary_market',
 'product_crowdfunding',
 'round_A',
 'round_B',
 'round_C',
 'round_D',
 'round_E',
 'round_F',
 'round_G',
 'round_H']

In [8]:
# Print the inferred schema. ' funding_total_usd ' comes back as string rather
# than a numeric type, so it cannot be aggregated without cleaning first.
data.printSchema()

root
 |-- permalink: string (nullable = true)
 |-- name: string (nullable = true)
 |-- homepage_url: string (nullable = true)
 |-- category_list: string (nullable = true)
 |--  market : string (nullable = true)
 |--  funding_total_usd : string (nullable = true)
 |-- status: string (nullable = true)
 |-- country_code: string (nullable = true)
 |-- state_code: string (nullable = true)
 |-- region: string (nullable = true)
 |-- city: string (nullable = true)
 |-- funding_rounds: integer (nullable = true)
 |-- founded_at: timestamp (nullable = true)
 |-- founded_month: string (nullable = true)
 |-- founded_quarter: string (nullable = true)
 |-- founded_year: integer (nullable = true)
 |-- first_funding_at: timestamp (nullable = true)
 |-- last_funding_at: timestamp (nullable = true)
 |-- seed: integer (nullable = true)
 |-- venture: long (nullable = true)
 |-- equity_crowdfunding: integer (nullable = true)
 |-- undisclosed: integer (nullable = true)
 |-- convertible_note: integer (nullable

## Create a temporary table

Registering the DataFrame as a temporary view is required before it can be queried by name with `spark.sql`.

In [9]:
# Expose the full dataset to Spark SQL under the view name 'investment'.
data.createOrReplaceTempView(name='investment')

In [10]:
# Sanity-check the view by pulling ten rows through the SQL interface.
spark.sql(
    "select * from investment limit 10"
).show(n=20, truncate=True)

+--------------------+------------------+--------------------+--------------------+-------------+-------------------+---------+------------+----------+--------------------+------------+--------------+-------------------+-------------+---------------+------------+-------------------+-------------------+-------+-------+-------------------+-----------+----------------+--------------+-----+-----+--------------+---------------+-------------+----------------+--------------------+-------+-------+-------+-------+-------+-------+-------+-------+
|           permalink|              name|        homepage_url|       category_list|      market | funding_total_usd |   status|country_code|state_code|              region|        city|funding_rounds|         founded_at|founded_month|founded_quarter|founded_year|   first_funding_at|    last_funding_at|   seed|venture|equity_crowdfunding|undisclosed|convertible_note|debt_financing|angel|grant|private_equity|post_ipo_equity|post_ipo_debt|secondary_market|

In [11]:
# Keep only companies whose country code is USA and preview the result.
spark.sql(
    "select * from investment where country_code == 'USA'"
).show(n=20, truncate=True)

+--------------------+------------------+--------------------+--------------------+--------------------+-------------------+---------+------------+----------+--------------------+-----------------+--------------+-------------------+-------------+---------------+------------+-------------------+-------------------+-------+--------+-------------------+-----------+----------------+--------------+-----+-----+--------------+---------------+-------------+----------------+--------------------+-------+-------+-------+-------+-------+-------+-------+-------+
|           permalink|              name|        homepage_url|       category_list|             market | funding_total_usd |   status|country_code|state_code|              region|             city|funding_rounds|         founded_at|founded_month|founded_quarter|founded_year|   first_funding_at|    last_funding_at|   seed| venture|equity_crowdfunding|undisclosed|convertible_note|debt_financing|angel|grant|private_equity|post_ipo_equity|post_

In [12]:
# Hold on to the USA-only subset as its own DataFrame for the later cells.
usstrartsup = spark.sql(
    "select * from investment where country_code == 'USA'"
)

In [13]:
# Preview the USA-only DataFrame (defaults spelled out: 20 rows, truncated cells).
usstrartsup.show(n=20, truncate=True)

+--------------------+------------------+--------------------+--------------------+--------------------+-------------------+---------+------------+----------+--------------------+-----------------+--------------+-------------------+-------------+---------------+------------+-------------------+-------------------+-------+--------+-------------------+-----------+----------------+--------------+-----+-----+--------------+---------------+-------------+----------------+--------------------+-------+-------+-------+-------+-------+-------+-------+-------+
|           permalink|              name|        homepage_url|       category_list|             market | funding_total_usd |   status|country_code|state_code|              region|             city|funding_rounds|         founded_at|founded_month|founded_quarter|founded_year|   first_funding_at|    last_funding_at|   seed| venture|equity_crowdfunding|undisclosed|convertible_note|debt_financing|angel|grant|private_equity|post_ipo_equity|post_

In [16]:
# Register the USA subset as the SQL view 'usstartup_table'.
usstrartsup.createOrReplaceTempView(name='usstartup_table')

In [17]:
# Confirm the new view is queryable by fetching ten rows from it.
spark.sql(
    "select * from usstartup_table limit 10"
).show(n=20, truncate=True)

+--------------------+------------------+--------------------+--------------------+--------------------+-------------------+---------+------------+----------+--------------------+-----------------+--------------+-------------------+-------------+---------------+------------+-------------------+-------------------+-------+-------+-------------------+-----------+----------------+--------------+-----+-----+--------------+---------------+-------------+----------------+--------------------+-------+-------+-------+-------+-------+-------+-------+-------+
|           permalink|              name|        homepage_url|       category_list|             market | funding_total_usd |   status|country_code|state_code|              region|             city|funding_rounds|         founded_at|founded_month|founded_quarter|founded_year|   first_funding_at|    last_funding_at|   seed|venture|equity_crowdfunding|undisclosed|convertible_note|debt_financing|angel|grant|private_equity|post_ipo_equity|post_ip

### Find the startup with the most seed funding in the USA

In [19]:
# USA startups sorted by seed funding, largest amount first.
most_seed = spark.sql(
    "select * from usstartup_table order by seed desc"
)

In [20]:
# Show only the company name next to its seed amount for the top of the ranking.
most_seed.select(['name', 'seed']).show(n=20, truncate=True)

+--------------------+---------+
|                name|     seed|
+--------------------+---------+
|Space Exploration...|100000000|
|Wikimedia Foundation| 64000000|
|        PharmaNation| 25000000|
|            HeyAnita| 22300000|
|         RetailMeNot| 21000000|
|        ChoiceStream| 15000000|
|NorthStar Systems...| 11500000|
|Silverback Enterp...| 11500000|
|          NuScriptRx| 10479384|
|              Kensho| 10000000|
|          Ice Energy| 10000000|
|    Pepex Biomedical| 10000000|
|              Koding|  9250000|
|   Raise Marketplace|  9000000|
|          Apportable|  8700000|
|  Riverside Research|  8700000|
|          YouScience|  8500000|
|             eBureau|  8000000|
|              KidZui|  8000000|
|       Urban Compass|  8000000|
+--------------------+---------+
only showing top 20 rows



In [21]:
# Fetch the single top-ranked row; the result is a one-element list of Row.
most_seed.take(1)

[Row(permalink='/organization/space-exploration-technologies', name='Space Exploration Technologies', homepage_url='http://www.spacex.com', category_list='|Public Transportation|',  market =' Public Transportation ',  funding_total_usd =' 24,54,60,000 ', status='operating', country_code='USA', state_code='CA', region='Los Angeles', city='Hawthorne', funding_rounds=6, founded_at=datetime.datetime(2002, 1, 1, 0, 0), founded_month='2002-01', founded_quarter='2002-Q1', founded_year=2002, first_funding_at=datetime.datetime(2006, 3, 1, 0, 0), last_funding_at=datetime.datetime(2012, 12, 21, 0, 0), seed=100000000, venture=145460000, equity_crowdfunding=0, undisclosed=0, convertible_note=0, debt_financing=0, angel=0, grant=0, private_equity=0, post_ipo_equity=0, post_ipo_debt=0, secondary_market=0, product_crowdfunding=0, round_A=20000000, round_B=30435000, round_C=50000000, round_D=0, round_E=0, round_F=0, round_G=0, round_H=0)]

In [23]:
# Take the top-ranked Row out of the one-element list and turn it into a
# plain dict keyed by column name.
most_seed_startup = most_seed.take(1)[0].asDict()

In [24]:
# Display the dict built from the top-seed row; keys are the column names,
# including the space-padded ' market ' and ' funding_total_usd '.
most_seed_startup

{'permalink': '/organization/space-exploration-technologies',
 'name': 'Space Exploration Technologies',
 'homepage_url': 'http://www.spacex.com',
 'category_list': '|Public Transportation|',
 ' market ': ' Public Transportation ',
 ' funding_total_usd ': ' 24,54,60,000 ',
 'status': 'operating',
 'country_code': 'USA',
 'state_code': 'CA',
 'region': 'Los Angeles',
 'city': 'Hawthorne',
 'funding_rounds': 6,
 'founded_at': datetime.datetime(2002, 1, 1, 0, 0),
 'founded_month': '2002-01',
 'founded_quarter': '2002-Q1',
 'founded_year': 2002,
 'first_funding_at': datetime.datetime(2006, 3, 1, 0, 0),
 'last_funding_at': datetime.datetime(2012, 12, 21, 0, 0),
 'seed': 100000000,
 'venture': 145460000,
 'equity_crowdfunding': 0,
 'undisclosed': 0,
 'convertible_note': 0,
 'debt_financing': 0,
 'angel': 0,
 'grant': 0,
 'private_equity': 0,
 'post_ipo_equity': 0,
 'post_ipo_debt': 0,
 'secondary_market': 0,
 'product_crowdfunding': 0,
 'round_A': 20000000,
 'round_B': 30435000,
 'round_C': 

Only `Row` objects can be converted to a dict. Converting a row to a dictionary makes indexing easier.

In [25]:
# Look up the venture funding amount recorded for the top-seed startup.
most_seed_startup['venture']

145460000

In [26]:
# Look up the first funding date; it comes back as a datetime because the
# column was inferred as a timestamp.
most_seed_startup['first_funding_at']

datetime.datetime(2006, 3, 1, 0, 0)