In [1]:
# Loading Hive Tables and Data Preparation for Analysis

import pyspark.sql.functions as F
from pyspark.sql.functions import col, year, to_date, greatest, count, max
from pyspark.sql import SparkSession

In [2]:
# Build (or reuse) the Hive-enabled SparkSession that every later cell uses.
# Each setting is passed as a separate key and value: the previous
# `.config("spark.sql.catalogImplementation=hive")` call supplied the whole
# "key=value" text as the key with no value, so it never set that option.
spark = (
    SparkSession.builder
    .appName("E2E_Capstone")
    .config("spark.ui.port", "0")
    .config("spark.sql.catalogImplementation", "hive")
    # NOTE(review): this path ends in a table directory
    # (.../comment_analysis.db/review) rather than a warehouse root --
    # confirm that this is intended.
    .config(
        "spark.sql.warehouse.dir",
        "hdfs://nameservice1/user/itv003722/warehouse/comment_analysis.db/review")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .enableHiveSupport()
    .getOrCreate()
)
# Turn Spark's log output off for the rest of the notebook.
spark.sparkContext.setLogLevel('OFF')

In [3]:
# Load the Hive table `comment_analysis.review` into the DataFrame `review`
# and register it as the temp view `review`, so the same data can be queried
# through the DataFrame API and through spark.sql().

review = spark.table('comment_analysis.review')
review.createOrReplaceTempView('review')

In [4]:
# Preview the first rows of the table to eyeball columns and values.
review.show()

+-----+---------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+---------------+------------------+--------------------+--------------------------+------------------+----------------------+---------+
|index|  company|               state| year|           job_title|             summary|                pros|                cons|overall_ratings|work_balance_stars|culture_values_stars|carrer_opportunities_stars|comp_benefit_stars|senior_mangemnet_stars|  country|
+-----+---------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+---------------+------------------+--------------------+--------------------------+------------------+----------------------+---------+
|65024|microsoft|Caracas; Capital ...| 2015|Current Employee ...|Services Delivery...|The company where...|Difficult to chan...|              4|                 5|                   4|                        

In [5]:
# Print each column's name, type and nullability.
review.printSchema()


root
 |-- index: integer (nullable = true)
 |-- company: string (nullable = true)
 |-- state: string (nullable = true)
 |-- year: string (nullable = true)
 |-- job_title: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- pros: string (nullable = true)
 |-- cons: string (nullable = true)
 |-- overall_ratings: integer (nullable = true)
 |-- work_balance_stars: integer (nullable = true)
 |-- culture_values_stars: integer (nullable = true)
 |-- carrer_opportunities_stars: integer (nullable = true)
 |-- comp_benefit_stars: integer (nullable = true)
 |-- senior_mangemnet_stars: integer (nullable = true)
 |-- country: string (nullable = true)



In [6]:
# First 10 rows of the two columns the rating analysis below is built on.
review.select('company','overall_ratings').show(10)

+---------+---------------+
|  company|overall_ratings|
+---------+---------------+
|    apple|              5|
|microsoft|              1|
|microsoft|              4|
|microsoft|              4|
|microsoft|              3|
|microsoft|              3|
|    apple|              1|
|microsoft|              2|
|microsoft|              3|
|microsoft|              4|
+---------+---------------+
only showing top 10 rows



In [7]:
from pyspark.sql.functions import countDistinct

# Count the distinct (company, country) combinations in the review table.
company_country_pairs = countDistinct("company", "country")
df = review.select(company_country_pairs)
df.show()
# Plain-text alternative to the table above:
# print("Distinct Count of company & country: "+ str(df.collect()[0][0]))

+--------------------------------+
|count(DISTINCT company, country)|
+--------------------------------+
|                             296|
+--------------------------------+



In [8]:
# Overall-rating trend, part 1: global view per company.
# Computes the 25th, 50th and 75th percentile of overall_ratings for each company.
company_percentiles = spark.sql("""
            select company,percentile(cast(overall_ratings as bigint), 0.25) as 25_perentage,percentile(cast(overall_ratings as bigint), 0.50) as 50_perentage,percentile(cast(overall_ratings as bigint), 0.75) as 75_perentage   from review
group by company
         """)
company_percentiles.show()

+--------------------+------------+------------+------------+
|             company|25_perentage|50_perentage|75_perentage|
+--------------------+------------+------------+------------+
|The Cloud team do...|         4.0|         4.0|         4.0|
|           microsoft|         3.0|         4.0|         5.0|
|              amazon|         3.0|         4.0|         5.0|
|             netflix|         2.0|         4.0|         5.0|
|               apple|         3.0|         4.0|         5.0|
|            facebook|         4.0|         5.0|         5.0|
|              google|         4.0|         5.0|         5.0|
+--------------------+------------+------------+------------+



In [9]:
#2. Globally by company per year: Identify trends at 25%, 50%, 75%
# `year` is a string column, so `order by year` sorts it as text.
# show() returns None, so the query result is bound to `d` first and displayed
# in a separate step; chaining `.show()` onto the assignment left `d` as None.
d = spark.sql("""
            select company,year,percentile(cast(overall_ratings as bigint), 0.25) as 25_perentage,percentile(cast(overall_ratings as bigint), 0.50) as 50_perentage,percentile(cast(overall_ratings as bigint), 0.75) as 75_perentage   from review
group by company,year order by year asc
         """)
d.show()

+---------+-----+------------+------------+------------+
|  company| year|25_perentage|50_perentage|75_perentage|
+---------+-----+------------+------------+------------+
|  netflix| 0000|         4.0|         4.0|         4.0|
|   amazon| 0000|         4.0|         4.0|         4.0|
|    apple| 0000|         4.0|         4.0|         4.0|
|   google| 2008|         2.0|         4.0|         5.0|
| facebook| 2008|         3.5|         4.0|         5.0|
|   amazon| 2008|         2.0|         3.0|         4.0|
|  netflix| 2008|         4.0|         5.0|         5.0|
|    apple| 2008|         3.0|         4.0|         5.0|
|microsoft| 2008|         3.0|         4.0|         4.0|
|   amazon| 2009|         2.0|         3.5|         4.0|
|  netflix| 2009|         2.0|         3.0|         4.0|
|    apple| 2009|         3.0|         4.0|         5.0|
|   google| 2009|         3.0|         4.0|        4.75|
| facebook| 2009|         5.0|         5.0|         5.0|
|microsoft| 2009|         3.0| 

In [10]:
# Data-quality check: list every distinct value stored in the `year` column,
# sorted as text because the column is a string.
distinct_years = spark.sql("""
            select distinct(year) from review order by year

         """)
distinct_years.show()

+-----+
| year|
+-----+
| 0000|
| 2008|
| 2009|
| 2010|
| 2011|
| 2012|
| 2013|
| 2014|
| 2015|
| 2016|
| 2017|
| 2018|
|    1|
| None|
+-----+



In [22]:
# Overall-rating trend, part 3: per company and country.
# Computes the 25th, 50th and 75th percentile of overall_ratings for each
# (company, country) pair.
company_country_percentiles = spark.sql("""
            select company,country,percentile(cast(overall_ratings as bigint), 0.25) as 25_perentage,percentile(cast(overall_ratings as bigint), 0.50) as 50_perentage,percentile(cast(overall_ratings as bigint), 0.75) as 75_perentage   from review
group by company,country 
         """)
company_country_percentiles.show()

+---------+--------------------+------------+------------+------------+
|  company|             country|25_perentage|50_perentage|75_perentage|
+---------+--------------------+------------+------------+------------+
|   amazon|             Ireland|         2.5|         4.0|         5.0|
|    apple|           Australia|         3.0|         4.0|         5.0|
|   amazon|              Brazil|         5.0|         5.0|         5.0|
|   amazon|            Colombia|         3.0|         3.0|         3.0|
|    apple|            Pakistan|        4.25|         4.5|        4.75|
|    apple|       Cote d'Ivoire|         5.0|         5.0|         5.0|
| facebook|             Denmark|         4.0|         4.0|         4.0|
|microsoft|             Lebanon|         5.0|         5.0|         5.0|
|   amazon|                  UK|         2.0|         4.0|         5.0|
|   google|               Chile|         2.0|         2.0|         2.0|
|microsoft|__HIVE_DEFAULT_PA...|         3.0|         4.0|      

In [None]:
## Display the impact of employee status on rating a company using the overall-ratings field by the company by year.
#
# Inner query: number of ratings per (company, year, job_title, overall_ratings),
# keeping only combinations that occur more than once.
# Outer query: total of those counts per (year, company, overall_ratings).
#
# overall_ratings is selected un-aggregated, so it has to appear in both
# GROUP BY clauses; it was missing from both. The ORDER BY now sits on the
# outer query, where it orders the rows that are actually displayed, and the
# derived table has an explicit alias.
# NOTE(review): job_title values look like 'Current Employee ...' /
# 'Former Employee ...'; if employee status itself should be a column of the
# result, it still needs to be extracted from job_title -- confirm the intent.
spark.sql("""
select year,company,overall_ratings,sum(or_count) as total from (
           select year,company,job_title,overall_ratings ,count(overall_ratings) as or_count from review
        group by company,year ,job_title,overall_ratings having count(overall_ratings)>1
        ) rated group by year,company,overall_ratings
        order by year, overall_ratings desc
         """).show()



In [None]:
# Number of non-null ratings per (year, job_title, overall_ratings),
# displayed with the most frequent combinations first.
rating_count_col = F.count('overall_ratings').alias('rating_count')
ratings_by_title = review.groupBy('year', 'job_title', 'overall_ratings').agg(rating_count_col)
ratings_by_title.orderBy(F.col('rating_count').desc()).show()

In [25]:
# Overall rating next to the five category ratings, one row per review,
# to compare the overall score with the other rating fields by company.
rating_columns = spark.sql("""select company,overall_ratings,work_balance_stars,culture_values_stars,carrer_opportunities_stars,comp_benefit_stars,senior_mangemnet_stars from review
  """)
rating_columns.show()

+---------+---------------+------------------+--------------------+--------------------------+------------------+----------------------+
|  company|overall_ratings|work_balance_stars|culture_values_stars|carrer_opportunities_stars|comp_benefit_stars|senior_mangemnet_stars|
+---------+---------------+------------------+--------------------+--------------------------+------------------+----------------------+
|microsoft|              4|                 2|                   4|                         4|                 4|                     3|
|microsoft|              4|                 2|                   4|                         4|                 4|                     3|
|microsoft|              5|                 4|                   5|                         5|                 4|                     4|
|    apple|              3|                 5|                   3|                         4|                 5|                     3|
|microsoft|              4|              

In [None]:
# Which corporation is worth working for?

In [32]:

from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

# Number the reviews inside each year, ordered by overall_ratings ascending.
windowSpec  = Window.partitionBy("year").orderBy("overall_ratings")

# Keep the DataFrame in `df` and display it as a separate step: show() returns
# None, so chaining it onto the assignment left `df` unusable for later cells.
df = review.withColumn("row_number", row_number().over(windowSpec))
df.show(truncate=False)


+-----+-------+----------------+-----+--------------------------------------------+--------------------------------------------------------------------------------------------------+----+----+---------------+------------------+--------------------+--------------------------+------------------+----------------------+--------------------------+----------+
|index|company|state           |year |job_title                                   |summary                                                                                           |pros|cons|overall_ratings|work_balance_stars|culture_values_stars|carrer_opportunities_stars|comp_benefit_stars|senior_mangemnet_stars|country                   |row_number|
+-----+-------+----------------+-----+--------------------------------------------+--------------------------------------------------------------------------------------------------+----+----+---------------+------------------+--------------------+--------------------------+-------------

In [None]:
# Narrow `df` to the year, company and overall rating columns and display them.
# `df` must be a DataFrame here (not the None that show() returns).
df.select("year","company","overall_ratings").show()

In [26]:
#Classification of satisfied or unsatisfied employees?
from pyspark.sql.functions import when

# Replace the numeric overall_ratings column with a text label:
#   rating > 3  -> "Satisfied"
#   rating < 3  -> "Unsatisfied"
#   null        -> ""
#   otherwise   -> the rating itself as text (i.e. "3")
# overall_ratings is an integer column, so it is compared with integer
# literals, and the fallback is cast to string explicitly so every branch
# yields the same type instead of relying on implicit coercion.
df2 = review.withColumn(
    "overall_ratings",
    when(review.overall_ratings > 3, "Satisfied")
    .when(review.overall_ratings < 3, "Unsatisfied")
    .when(review.overall_ratings.isNull(), "")
    .otherwise(review.overall_ratings.cast("string")))
df2.show()

+-----+---------+---------------+-----+--------------------+--------------------+----+----+---------------+------------------+--------------------+--------------------------+------------------+----------------------+-------+
|index|  company|          state| year|           job_title|             summary|pros|cons|overall_ratings|work_balance_stars|culture_values_stars|carrer_opportunities_stars|comp_benefit_stars|senior_mangemnet_stars|country|
+-----+---------+---------------+-----+--------------------+--------------------+----+----+---------------+------------------+--------------------+--------------------------+------------------+----------------------+-------+
|66331|microsoft|        Vedbæk | 2013|Former Employee -...|Great opportuniti...|null|null|              3|                 4|                   2|                         5|                 4|                     3|Denmark|
|66296|microsoft| Over Feldborg | 2013|Former Employee -...|awesome amazing w...|null|null|      Sat

In [36]:
# First 10 company values from the labelled frame `df2`.
df2.select('company').show(10)

+---------+
|  company|
+---------+
|microsoft|
|microsoft|
|microsoft|
|microsoft|
|    apple|
|   google|
|microsoft|
|microsoft|
|    apple|
|   amazon|
+---------+
only showing top 10 rows



In [8]:
# Per company, count reviews rated >= 3 (satisfied) and < 3 (unsatisfied).
# Rows with a null rating match neither CASE condition and add 0 to both sums.
# The output column is now spelled `unsatisfied`, matching the inner column.
# NOTE(review): rating 3 counts as satisfied here, while the DataFrame
# classification cell labels only ratings > 3 as "Satisfied" -- confirm which
# threshold is intended.
spark.sql("""
       select company , sum(satisfied) as satisfied,sum(unsatisfied) as unsatisfied from 
       ( select company, (case when overall_ratings>=3 then 1 else 0 end) as satisfied,
       (case when overall_ratings<3  then 1 else 0 end)  as unsatisfied from review
 )a group by a.company
""").show()

+--------------------+---------+------------+
|             company|satisfied|unstatisfied|
+--------------------+---------+------------+
|The Cloud team do...|        1|           0|
|           microsoft|    15983|        1947|
|              amazon|    21218|        5212|
|             netflix|      578|         232|
|               apple|    11652|        1298|
|            facebook|     1494|          96|
|              google|     1899|         100|
+--------------------+---------+------------+



In [9]:
# Number of 5-star overall ratings per company (first 6 rows displayed).
# `group by company` already yields one row per company, so the redundant
# `distinct` has been dropped; the result rows are unchanged.
spark.sql("""
   select company, count(overall_ratings) from review where overall_ratings=5 group by company 
""").show(6)

+---------+----------------------+
|  company|count(overall_ratings)|
+---------+----------------------+
|microsoft|                  5063|
|   amazon|                  7553|
|  netflix|                   240|
|    apple|                  4805|
| facebook|                  1151|
|   google|                  1050|
+---------+----------------------+



In [6]:
# List every distinct value of the `company` column.
distinct_companies = spark.sql("""
  
   select distinct company from review  
""")
distinct_companies.show()

+--------------------+
|             company|
+--------------------+
|The Cloud team do...|
|           microsoft|
|              amazon|
|             netflix|
|               apple|
|            facebook|
|              google|
+--------------------+



In [9]:
# Company and review summary for reviews with a 5-star overall rating
# (first 10 rows displayed).
five_star_summaries = spark.sql("""
   select company,summary from review where  overall_ratings=5
""")
five_star_summaries.show(10)

+---------+--------------------+
|  company|             summary|
+---------+--------------------+
|microsoft|Busy with increas...|
| facebook|A workplace with ...|
|microsoft|One of the best c...|
|    apple|high salary and g...|
|    apple|        Art director|
|    apple|       customer care|
|    apple|              genius|
|microsoft|         Vendor role|
|microsoft|              Intern|
|    apple|   Marketing Manager|
+---------+--------------------+
only showing top 10 rows



In [12]:
# Review summaries for reviews with an overall rating below 3.
# The result gets its own name and is displayed in a separate step:
# `df2 = spark.sql(...).show()` bound None to `df2` (show() returns None),
# overwriting the labelled DataFrame that the earlier cells store in `df2`.
low_rating_summaries = spark.sql("""
   select summary from review where  overall_ratings<3
""")
low_rating_summaries.show()

+--------------------+
|             summary|
+--------------------+
|       Not Favorable|
|                 TSS|
|senior purchase e...|
|Microsoft Advance...|
|          Consultant|
|Think before you ...|
| Horrible Management|
|     Microsoft Japan|
|              Amazon|
|   Marketing Manager|
|            Ad Rater|
|          Sr Manager|
|Data Center Techn...|
|Interesting but b...|
|Microsoft changed...|
| Soltuion Specialist|
|Really depends on...|
|        Good company|
|Don't work for th...|
|Microsoft Middle ...|
+--------------------+
only showing top 20 rows

