In [1]:
import pandas as pd
import pyspark.sql.functions as F
from pyspark.sql.functions import col, year, to_date, greatest, count, max
from pyspark.sql import Window
from pyspark.sql.functions import sum, when
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder,OneHotEncoderModel, StringIndexer, VectorAssembler

In [2]:
spark = SparkSession.builder.appName("E2E_Capstone").config(
    "spark.ui.port", "0").config(
        "spark.sql.catalogImplementation=hive").config(
        "spark.sql.warehouse.dir",
        "hdfs://nameservice1/user/itv003722/warehouse/comment_analysis.db/review").config(
            "spark.serializer",
    "org.apache.spark.serializer.KryoSerializer").enableHiveSupport().getOrCreate()
spark.sparkContext.setLogLevel('OFF')

In [3]:
# Creating Local Views and Spark Dataframes to call these objects from memory.

review = spark.table('comment_analysis.review')
review.createOrReplaceTempView('review')

In [4]:
review.show()
review.printSchema()
review.count()

+-----+---------+-------------------+-----+--------------------+--------------------+--------------------+--------------------+---------------+------------------+--------------------+--------------------------+------------------+----------------------+-------+
|index|  company|              state| year|           job_title|             summary|                pros|                cons|overall_ratings|work_balance_stars|culture_values_stars|carrer_opportunities_stars|comp_benefit_stars|senior_mangemnet_stars|country|
+-----+---------+-------------------+-----+--------------------+--------------------+--------------------+--------------------+---------------+------------------+--------------------+--------------------------+------------------+----------------------+-------+
|66367|microsoft|  Haidian; Beijing | 2013|Former Employee -...|It is a great jou...|Great working env...|Internal politics...|              4|                 3|                   3|                         3|       

67529

In [5]:
dfp = review.toPandas()

In [6]:
conti_var_df = dfp.loc[:, (dfp.dtypes == 'float64') | (
    dfp.dtypes == 'int64') | (dfp.dtypes == 'int32')]
cat_var_df = dfp.loc[:, (dfp.dtypes == 'object')]

In [7]:
# ### Continous Variables
def fun_describe(_):
    '''
        Calculates different metrics of Numerical column
    '''
    # Records and missing values
    n_tot = _.shape[0]
    n_count = _.count()
    n_miss = _.isna().sum()
    n_miss_perc = n_miss / n_tot

    # IQR
    q_1 = _.quantile(0.25)
    q_3 = _.quantile(0.75)
    iqr = q_3 - q_1
    lc_iqr = q_1 - 1.5 * iqr
    uc_iqr = q_3 + 1.5 * iqr

    return pd.Series(data=[
        _.dtypes, n_tot, n_count, n_miss, n_miss_perc,
        _.nunique(),
        _.sum(),
        _.mean(),
        _.std(),
        _.var(), iqr, lc_iqr, uc_iqr,
        _.min(),
        _.max(),
        _.quantile(0.01),
        _.quantile(0.05),
        _.quantile(0.1),
        _.quantile(0.25),
        _.quantile(0.5),
        _.quantile(0.75),
        _.quantile(0.90),
        _.quantile(0.95),
        _.quantile(0.99)
    ],
        index=[
        'dtype', 'tot', 'n', 'nmiss', 'miss_perc',
        'cardinality', 'sum', 'mean', 'std', 'var', 'iqr',
        'lc_iqr', 'uc_iqr', 'min', 'max', 'p1', 'p5', 'p10',
        'p25', 'p50', 'p75', 'p90', 'p95', 'p99'
    ])


conti_var_df.apply(fun_describe).T.head(50)



Unnamed: 0,dtype,tot,n,nmiss,miss_perc,cardinality,sum,mean,std,var,...,max,p1,p5,p10,p25,p50,p75,p90,p95,p99
index,int32,67529,67529,0,0,67529,2280116685,33765.0,19494.1,380019000.0,...,67529,676.28,3377.4,6753.8,16883,33765,50647,60776.2,64152.6,66853.7
overall_ratings,int32,67529,67529,0,0,5,258371,3.82607,1.15499,1.334,...,5,1.0,1.0,2.0,3,4,5,5.0,5.0,5.0
work_balance_stars,int32,67529,67529,0,0,5,224056,3.31792,1.23596,1.5276,...,5,1.0,1.0,1.0,3,3,4,5.0,5.0,5.0
culture_values_stars,int32,67529,67529,0,0,5,258480,3.82769,1.16215,1.3506,...,5,1.0,1.0,2.0,3,4,5,5.0,5.0,5.0
carrer_opportunities_stars,int32,67529,67529,0,0,5,247193,3.66055,1.17084,1.37086,...,5,1.0,1.0,2.0,3,4,5,5.0,5.0,5.0
comp_benefit_stars,int32,67529,67529,0,0,5,265402,3.93019,0.990349,0.980792,...,5,1.0,2.0,3.0,3,4,5,5.0,5.0,5.0
senior_mangemnet_stars,int32,67529,67529,0,0,5,221256,3.27646,1.24128,1.54078,...,5,1.0,1.0,1.0,3,3,4,5.0,5.0,5.0


In [8]:
# ### Categorical Variables
def fun_obj_describe(_):
    '''
        Calculates different metrics of Categorical column
    '''
    # Records and missing values
    n_tot = _.shape[0]
    n_count = _.count()
    n_miss = _.isna().sum()
    n_miss_perc = n_miss / n_tot

    return pd.Series(data=[
        _.dtypes, n_tot, n_count, n_miss, n_miss_perc,
        _.nunique()
    ],
        index=[
        'dtype', 'tot', 'n', 'nmiss', 'miss_perc',
        'cardinality'
    ])


cat_var_df.apply(fun_obj_describe).T.head(50)

Unnamed: 0,dtype,tot,n,nmiss,miss_perc,cardinality
company,object,67529,67529,0,0,6
state,object,67529,67529,0,0,699
year,object,67529,67529,0,0,13
job_title,object,67529,67529,0,0,8308
summary,object,67529,67529,0,0,42651
pros,object,67529,67529,0,0,63789
cons,object,67529,67529,0,0,64325
country,object,67529,67529,0,0,106


In [27]:
df2=review.withColumn('Satisfied',\
                  F.when(review.overall_ratings>3,1)\
                  .otherwise(0)
                 ).show(5)

+-----+---------+----------+-----+--------------------+--------------------+--------------------+--------------------+---------------+------------------+--------------------+--------------------------+------------------+----------------------+-------+---------+
|index|  company|     state| year|           job_title|             summary|                pros|                cons|overall_ratings|work_balance_stars|culture_values_stars|carrer_opportunities_stars|comp_benefit_stars|senior_mangemnet_stars|country|Satisfied|
+-----+---------+----------+-----+--------------------+--------------------+--------------------+--------------------+---------------+------------------+--------------------+--------------------------+------------------+----------------------+-------+---------+
|66420|microsoft|Bengaluru | 2013|Current Employee ...|Software Developm...|working with best...|Sometimes the pro...|              4|                 3|                   4|                         4|             

In [None]:
from pyspark_dist_explore import hist
import matplotlib.pyplot as plt

In [None]:

review.hist('Satisfied')

In [4]:
# Encoding columns with high cardinality may effect the model, so all of
# these categorical cols with high cardinality will be dropped later.


# Dropping these following columns since these do not seem to have any effect
# in Employee Left Status :
# 'emp_title_id','birth_date','last_date','hire_date','emp_no',
# 'title_id','dept_no','first_name','last_name'

df = review.drop(
    'index',
    'summary',
    
    'pros',
    'cons',
    'work_balance_stars',
    'culture_values_stars',
    'carrer_opportunities_stars',
    'comp_benefit_stars',
    'senior_mangemnet_stars',
    
     
    
    )
df.toPandas().head(20)

Unnamed: 0,company,state,year,job_title,overall_ratings,country
0,microsoft,Prague,2014,Current Employee - Sales Job,2,Czech Republic
1,amazon,Karlovy Vary,2015,Former Employee - English As A Second Language...,1,Czech Republic
2,microsoft,Prague,2015,Former Employee - Inside Sales Account Manager,5,Czech Republic
3,microsoft,Prague,2015,Former Employee - Anonymous Employee,4,Czech Republic
4,amazon,Prague,2015,Current Employee - IT Engineer,4,Czech Republic
5,microsoft,Prague,2015,Current Employee - Talent Acquisition Manager,4,Czech Republic
6,amazon,Prague,2016,Current Employee - Area Manager,4,Czech Republic
7,microsoft,Prague,2016,Current Employee - Global Human Resources Oper...,5,Czech Republic
8,microsoft,Prague,2016,Former Employee - Sales Executive,4,Czech Republic
9,microsoft,Prague,2016,Former Employee - Anonymous Employee,5,Czech Republic


In [5]:
df.printSchema()

root
 |-- company: string (nullable = true)
 |-- state: string (nullable = true)
 |-- year: string (nullable = true)
 |-- job_title: string (nullable = true)
 |-- overall_ratings: integer (nullable = true)
 |-- country: string (nullable = true)



In [None]:
df.select('company').distinct().show()

In [None]:
##df.groupBy('company').agg(sum('overall_ratings'))

df.withColumn('Total yearly Sum', F.sum(
    'overall_ratings').over(Window.partitionBy('year'))).show()

In [None]:
result = df.groupBy("company").agg(sum(when(df.overall_ratings ==5, df.overall_ratings)).alias("sum_value"))
result.toPandas().head(20)


In [None]:
selected_data = df.groupBy("company","Year").agg(
    sum(when(df.overll_rating >= 3,1)).alias("satisfed"),
    sum(when(df.overll_rating <3,1)).alias("Not_satisfed")
).select("company","year" "satisfed", "satisfed")

# Show the selected columns
selected_data.toPandas().head(10)

In [18]:
review.satisfied(False,0.1).toPandas().hist()

AttributeError: 'DataFrame' object has no attribute 'satisfied'

In [None]:
df.select('year').distinct().show()

In [None]:
df.select('job_title').distinct().show()

In [None]:

df.na.drop(subset=["year"]) \
   .show(truncate=False)


In [None]:
df.select('year').distinct().show()

In [None]:
pandas_df = df.select("overall_ratings", "count").toPandas()

# Plot the bar chart using Pandas DataFrame and Matplotlib
pandas_df.plot(x="overall_ratings", y="count", kind="bar")
plt.xlabel("overall_ratings")
plt.ylabel("Count")
plt.title("Bar Chart")
plt.show()