In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

### Importing the libraries

In [2]:
import findspark
from elasticsearch import Elasticsearch
import requests
from pprint import pprint

### Finding locally installed Spark and importing PySpark libraries

In [3]:
findspark.init("/usr/local/spark/")
from pyspark.sql import SparkSession, functions as func

### Checking if Elasticsearch has successfully been connected

In [4]:
res = requests.get('http://localhost:9200')
pprint(res.content)

(b'{\n  "name" : "ip-172-31-14-175",\n  "cluster_name" : "elasticsearch",\n  "'
 b'cluster_uuid" : "ypf0RlCFQYytEO-JNW-MCg",\n  "version" : {\n    "number" :'
 b' "7.9.2",\n    "build_flavor" : "default",\n    "build_type" : "tar",\n    '
 b'"build_hash" : "d34da0ea4a966c4e49417f2da2f244e3e97b4e6e",\n    "build_da'
 b'te" : "2020-09-23T00:45:33.626720Z",\n    "build_snapshot" : false,\n    "'
 b'lucene_version" : "8.6.2",\n    "minimum_wire_compatibility_version" : "6'
 b'.8.0",\n    "minimum_index_compatibility_version" : "6.0.0-beta1"\n  },\n  '
 b'"tagline" : "You Know, for Search"\n}\n')


### Creating an Elasticsearch object

In [5]:
es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

### Creating Spark Session

In [6]:
spark = SparkSession.builder.appName('task').getOrCreate()

### Reading CSV with Spark and creating Dataframe

In [7]:
df = spark.read.csv('/home/ubuntu/10000 Records.csv', header=True, inferSchema = True).fillna(0)["Emp ID", "Month Name of Joining", "Last Name", "Gender", "E Mail", "SSN", "County", "State", "Region", "City", "Zip", "Salary", func.regexp_replace(func.col("Last % Hike"), "%", "").alias("Salary Hike")]
df.show(5)

+------+---------------------+---------+------+--------------------+-----------+------------+-----+---------+-------------+-----+------+-----------+
|Emp ID|Month Name of Joining|Last Name|Gender|              E Mail|        SSN|      County|State|   Region|         City|  Zip|Salary|Salary Hike|
+------+---------------------+---------+------+--------------------+-----------+------------+-----+---------+-------------+-----+------+-----------+
|198429|             February|Bumgarner|     F|serafina.bumgarne...|063-02-3609|  Chautauqua|   NY|Northeast|       Clymer|14724| 69294|         14|
|178566|                 June|     Rojo|     F|juliette.rojo@yah...|671-48-9915|  Montgomery|   PA|Northeast|     Glenside|19038|193912|         27|
|647173|              January| Krawczyk|     M|milan.krawczyk@ho...|527-99-6328|Anne Arundel|   MD|    South|Gibson Island|21056|123681|         11|
|847634|                  May|    Jason|     M|elmer.jason@yahoo...|063-02-5994|  Washington|   VA|    Sou

### Checking the total records in the PySpark Dataframe

In [8]:
df.count()

10000

### Checking the data types

In [9]:
df.dtypes

[('Emp ID', 'int'),
 ('Month Name of Joining', 'string'),
 ('Last Name', 'string'),
 ('Gender', 'string'),
 ('E Mail', 'string'),
 ('SSN', 'string'),
 ('County', 'string'),
 ('State', 'string'),
 ('Region', 'string'),
 ('City', 'string'),
 ('Zip', 'int'),
 ('Salary', 'int'),
 ('Salary Hike', 'string')]

### Casting 'string' value to 'int'

In [10]:
df = (df.withColumn("Salary Hike", df["Salary Hike"].cast("int")))
df.dtypes

[('Emp ID', 'int'),
 ('Month Name of Joining', 'string'),
 ('Last Name', 'string'),
 ('Gender', 'string'),
 ('E Mail', 'string'),
 ('SSN', 'string'),
 ('County', 'string'),
 ('State', 'string'),
 ('Region', 'string'),
 ('City', 'string'),
 ('Zip', 'int'),
 ('Salary', 'int'),
 ('Salary Hike', 'int')]

### Checking if the changes have been made successfully

In [11]:
df.show()

+------+---------------------+---------+------+--------------------+-----------+------------+-----+---------+---------------+-----+------+-----------+
|Emp ID|Month Name of Joining|Last Name|Gender|              E Mail|        SSN|      County|State|   Region|           City|  Zip|Salary|Salary Hike|
+------+---------------------+---------+------+--------------------+-----------+------------+-----+---------+---------------+-----+------+-----------+
|198429|             February|Bumgarner|     F|serafina.bumgarne...|063-02-3609|  Chautauqua|   NY|Northeast|         Clymer|14724| 69294|         14|
|178566|                 June|     Rojo|     F|juliette.rojo@yah...|671-48-9915|  Montgomery|   PA|Northeast|       Glenside|19038|193912|         27|
|647173|              January| Krawczyk|     M|milan.krawczyk@ho...|527-99-6328|Anne Arundel|   MD|    South|  Gibson Island|21056|123681|         11|
|847634|                  May|    Jason|     M|elmer.jason@yahoo...|063-02-5994|  Washington| 

### Checking the Schema

In [12]:
df.printSchema()

root
 |-- Emp ID: integer (nullable = true)
 |-- Month Name of Joining: string (nullable = true)
 |-- Last Name: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- E Mail: string (nullable = true)
 |-- SSN: string (nullable = true)
 |-- County: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zip: integer (nullable = true)
 |-- Salary: integer (nullable = true)
 |-- Salary Hike: integer (nullable = true)



### Ingesting Pyspark Dataframe in Elasticsearch directly

In [13]:
df.write.format(
    "org.elasticsearch.spark.sql"
).option(
    "es.resource", '%s' % ('humans')
).option(
    "es.nodes", 'localhost'
).option(
    "es.port", '9200'
).save()

### Setting the output limit

In [14]:
es.indices.put_settings(index="humans", body = {"index" : { "max_result_window" : 5000000 }})

{'acknowledged': True}

### Counting the records in index to see if PySpark Dataframe has been successfully ingested

In [15]:
es.count(index= "humans")

{'count': 10000,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}}

### Task 1: Count the number of employees in each County, Region and City

In [16]:
es.search(index="humans", size=0, body={"query": {"match_all": {}}, "aggs":{"No. of Employees in per County" : {"terms":{"field": "County.keyword"}}}})

{'took': 11,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 10000, 'relation': 'eq'},
  'max_score': None,
  'hits': []},
 'aggregations': {'No. of Employees in per County': {'doc_count_error_upper_bound': 0,
   'sum_other_doc_count': 9165,
   'buckets': [{'key': 'Los Angeles', 'doc_count': 122},
    {'key': 'Jefferson', 'doc_count': 119},
    {'key': 'Washington', 'doc_count': 105},
    {'key': 'Montgomery', 'doc_count': 98},
    {'key': 'Jackson', 'doc_count': 76},
    {'key': 'Franklin', 'doc_count': 73},
    {'key': 'Orange', 'doc_count': 70},
    {'key': 'District of Columbia', 'doc_count': 58},
    {'key': 'Polk', 'doc_count': 58},
    {'key': 'Clark', 'doc_count': 56}]}}}

In [17]:
es.search(index="humans", size=0, body={"query": {"match_all": {}}, "aggs":{"No. of Employees per Region" : {"terms":{"field": "Region.keyword"}}}})

{'took': 2,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 10000, 'relation': 'eq'},
  'max_score': None,
  'hits': []},
 'aggregations': {'No. of Employees per Region': {'doc_count_error_upper_bound': 0,
   'sum_other_doc_count': 0,
   'buckets': [{'key': 'South', 'doc_count': 3652},
    {'key': 'Midwest', 'doc_count': 2821},
    {'key': 'Northeast', 'doc_count': 1777},
    {'key': 'West', 'doc_count': 1750}]}}}

In [18]:
es.search(index="humans", size=0, body={"query": {"match_all": {}}, "aggs":{"No. of Employees per City" : {"terms":{"field": "City.keyword"}}}})

{'took': 6,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 10000, 'relation': 'eq'},
  'max_score': None,
  'hits': []},
 'aggregations': {'No. of Employees per City': {'doc_count_error_upper_bound': 0,
   'sum_other_doc_count': 9661,
   'buckets': [{'key': 'Washington', 'doc_count': 69},
    {'key': 'El Paso', 'doc_count': 39},
    {'key': 'Houston', 'doc_count': 35},
    {'key': 'New York City', 'doc_count': 33},
    {'key': 'Miami', 'doc_count': 30},
    {'key': 'Sacramento', 'doc_count': 28},
    {'key': 'Dallas', 'doc_count': 27},
    {'key': 'Richmond', 'doc_count': 27},
    {'key': 'Phoenix', 'doc_count': 26},
    {'key': 'Atlanta', 'doc_count': 25}]}}}

### Task 2: Generate Employee Summary

In [19]:
es.search(index="humans", size=5, body={"query": {"match_all": {}}})

{'took': 1,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 10000, 'relation': 'eq'},
  'max_score': 1.0,
  'hits': [{'_index': 'humans',
    '_type': '_doc',
    '_id': '9FFvDnUBFUuBiHlmB95l',
    '_score': 1.0,
    '_source': {'Emp ID': 198429,
     'Month Name of Joining': 'February',
     'Last Name': 'Bumgarner',
     'Gender': 'F',
     'E Mail': 'serafina.bumgarner@exxonmobil.com',
     'SSN': '063-02-3609',
     'County': 'Chautauqua',
     'State': 'NY',
     'Region': 'Northeast',
     'City': 'Clymer',
     'Zip': 14724,
     'Salary': 69294,
     'Salary Hike': 14}},
   {'_index': 'humans',
    '_type': '_doc',
    '_id': '9VFvDnUBFUuBiHlmB95l',
    '_score': 1.0,
    '_source': {'Emp ID': 178566,
     'Month Name of Joining': 'June',
     'Last Name': 'Rojo',
     'Gender': 'F',
     'E Mail': 'juliette.rojo@yahoo.co.uk',
     'SSN': '671-48-9915',
     'County': 'Montgomery',
     'State': 'PA',
     

### Task 3: Generate employee summary and ordering by Gender and Salary

In [20]:
es.search(index="humans", size=5, body={"query": {"match_all": {}}, "sort":[{"Gender.keyword":{"order":"asc"}}, {"Salary":{"order":"asc"}}]})

{'took': 2,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 10000, 'relation': 'eq'},
  'max_score': None,
  'hits': [{'_index': 'humans',
    '_type': '_doc',
    '_id': 'wlFvDnUBFUuBiHlmCOVc',
    '_score': None,
    '_source': {'Emp ID': 338798,
     'Month Name of Joining': 'May',
     'Last Name': 'Conley',
     'Gender': 'F',
     'E Mail': 'jodi.conley@charter.net',
     'SSN': '084-02-6421',
     'County': 'Los Angeles',
     'State': 'CA',
     'Region': 'West',
     'City': 'Glendale',
     'Zip': 91210,
     'Salary': 40009,
     'Salary Hike': 1},
    'sort': ['F', 40009]},
   {'_index': 'humans',
    '_type': '_doc',
    '_id': 'jlFvDnUBFUuBiHlmC_3E',
    '_score': None,
    '_source': {'Emp ID': 644604,
     'Month Name of Joining': 'August',
     'Last Name': 'Brazell',
     'Gender': 'F',
     'E Mail': 'lorina.brazell@ntlworld.com',
     'SSN': '308-37-1932',
     'County': 'Lucas',
     'State': '

### Task 4: Summerize the number of employee joined and hikes granted based on month

In [21]:
es.search(index="humans", size=0, body={"query": {"match_all": {}}, "aggs":{"No. of Employees joined in particular month" : {"terms":{"field": "Month Name of Joining.keyword"}}}})

{'took': 2,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 10000, 'relation': 'eq'},
  'max_score': None,
  'hits': []},
 'aggregations': {'No. of Employees joined in particular month': {'doc_count_error_upper_bound': 0,
   'sum_other_doc_count': 1568,
   'buckets': [{'key': 'July', 'doc_count': 902},
    {'key': 'June', 'doc_count': 895},
    {'key': 'March', 'doc_count': 867},
    {'key': 'April', 'doc_count': 851},
    {'key': 'May', 'doc_count': 846},
    {'key': 'January', 'doc_count': 837},
    {'key': 'December', 'doc_count': 827},
    {'key': 'October', 'doc_count': 811},
    {'key': 'August', 'doc_count': 808},
    {'key': 'November', 'doc_count': 788}]}}}

In [22]:
es.search(index="humans", size=0, body={"query": {"match_all": {}}, "aggs":{"No. of Hikes granted per Month": {"cardinality": {"field": "Salary Hike"}}}})

{'took': 2,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 10000, 'relation': 'eq'},
  'max_score': None,
  'hits': []},
 'aggregations': {'No. of Hikes granted per Month': {'value': 31}}}

### Task 5: Generate employee summary and ordering by Salary

In [23]:
es.search(index="humans", size=5, body={"query":{"match_all":{}}, "sort":[{"Salary":{"order":"asc"}}]})

{'took': 3,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 10000, 'relation': 'eq'},
  'max_score': None,
  'hits': [{'_index': 'humans',
    '_type': '_doc',
    '_id': 't1JvDnUBFUuBiHlmDAXc',
    '_score': None,
    '_source': {'Emp ID': 373347,
     'Month Name of Joining': 'December',
     'Last Name': 'Shaner',
     'Gender': 'M',
     'E Mail': 'royce.shaner@hotmail.com',
     'SSN': '056-02-4379',
     'County': 'East Freetown',
     'State': 'NY',
     'Region': 'Northeast',
     'City': 'East Freetown',
     'Zip': 13055,
     'Salary': 40007,
     'Salary Hike': 24},
    'sort': [40007]},
   {'_index': 'humans',
    '_type': '_doc',
    '_id': 'ZFFvDnUBFUuBiHlmC_rD',
    '_score': None,
    '_source': {'Emp ID': 449798,
     'Month Name of Joining': 'February',
     'Last Name': 'Cleaves',
     'Gender': 'M',
     'E Mail': 'ben.cleaves@bellsouth.net',
     'SSN': '369-39-2187',
     'County': 'Montrose'

### Deleting the index from elasticsearch

In [24]:
es.indices.delete(index='humans')

{'acknowledged': True}

### Stopping the SparkSession

In [25]:
spark.stop()
print("Successfully stopped SparkSession!")

Successfully stopped SparkSession!
