In [None]:
pip install matplotlib

In [None]:
pip install gensim

In [None]:
pip install plotly

In [None]:
pip install smart_open

In [None]:
pip install yellowbrick

In [None]:
pip install scipy

In [None]:
pip install sklearn


# Glassdoor.com Data Cleaning and EDA
![Glassdoor logo](https://fistfuloftalent.com/wp-content/uploads/2018/03/logo-1200x630.png "Glassdoor logo")

[Glassdoor](https://www.glassdoor.com) is a website where current and former employees anonymously review companies. Glassdoor also allows users to anonymously submit and view salaries as well as search and apply for jobs on its platform.

In 2018, the company was acquired by the Japanese firm Recruit Holdings for US$1.2 billion. The company is headquartered in Mill Valley, California, with additional offices in Chicago, Dublin, London, and São Paulo. (Source: [Wikipedia](https://en.wikipedia.org/wiki/Glassdoor))

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go

import string 
from gensim.parsing.preprocessing import remove_stopwords

from yellowbrick.text import FreqDistVisualizer
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Read the Glassdoor job-posting CSV and preview the raw frame.
raw_csv_path = '../input/glassdoor-jobs-data-analysis/glassdoor job posting test 14Oct20.csv'
df = pd.read_csv(raw_csv_path)
df

In [None]:
# Drop the "Salary Estimate" column: the scrape did not return salary data.
# errors='ignore' keeps this cell re-runnable once the column is already gone.
df = df.drop(columns='Salary Estimate', errors='ignore')

# Rename columns to short snake_case identifiers.
df = df.rename(columns={
    'Job Title': 'job_title',
    'Job Description': 'job_desc',
    'Rating': 'rating',
    'Company Name': 'company',
    'Location': 'location',
    'Size': 'size',
    'Founded': 'founded',
    'Type of ownership': 'type',
    'Industry': 'industry',
    'Sector': 'sector',
    'Revenue': 'revenue',
})

# Lower-case every non-numeric column. Missing values are kept as NaN instead
# of being cast to the literal string "nan", which value_counts() would count
# as if it were a real category. Numeric columns keep their numeric dtype.
for column in df.select_dtypes(exclude='number').columns:
    values = df[column]
    df[column] = values.where(values.isna(), values.astype(str).str.lower())

In [None]:
# Display the frame as it stands at this point in the notebook.
df

In [None]:
# Keep only the text before the first newline of the company field, which is
# the company name itself.
# .str[0] selects that first piece as a Series. The expand=True form returns a
# DataFrame with one column per piece, which cannot be assigned to the single
# 'company' column when any value contains a newline.
df['company'] = df['company'].str.split('\n', n=1).str[0]
df['company']

In [None]:
# Summarise column dtypes and non-null counts before casting.
df.info()

In [None]:
# Cast the rating to float and the two free-text columns to str in one call,
# then re-check the resulting schema.
df = df.astype({'rating': float, 'job_desc': str, 'job_title': str})
df.info()

In [None]:
# Number of postings per location, most frequent first.
df.location.value_counts()

In [None]:
# Bar chart of the posting count for every location.
df.location.value_counts().plot(kind='bar')

In [None]:

# Pie chart of the five most frequent locations; the counts are computed once
# and reused for both the slice values and their labels.
top_locations = df.location.value_counts().iloc[:5]

location_pie = go.Pie(
    values=top_locations.to_list(),
    labels=top_locations.index.to_list(),
    name='location',
    textposition='inside',
    textinfo='percent+label',
)

fig = go.Figure()
fig.add_traces(location_pie)
fig.update_layout(
    template='ggplot2',
    title='Data Scientist with most number of location in Indonesia',
    showlegend=False,
)
fig.show()

In [None]:
# Number of postings per industry, most frequent first.
df.industry.value_counts()

In [None]:
# Bar chart of the posting count for every industry.
df.industry.value_counts().plot(kind='bar')

In [None]:
# Pie chart of the ten most frequent industries; the counts are computed once
# and reused for both the slice values and their labels.
top_industries = df.industry.value_counts().iloc[:10]

industry_pie = go.Pie(
    values=top_industries.to_list(),
    labels=top_industries.index.to_list(),
    name='Industry',
    textposition='inside',
    textinfo='percent+label',
)

fig = go.Figure()
fig.add_traces(industry_pie)
fig.update_layout(
    template='ggplot2',
    title='Industries with most number of Data Science Related jobs',
)
fig.show()

In [None]:
# Bar chart of the twenty companies with the most postings.
top_companies = df['company'].value_counts().iloc[:20]

company_bars = go.Bar(
    x=top_companies.index.to_list(),
    y=top_companies.to_list(),
    marker_color='deepskyblue',
    name="Company",
)

fig = go.Figure()
fig.add_trace(company_bars)
fig.update_layout(
    title='Companies with Max Number of Job Postings related to data science',
    template='ggplot2',
)
fig.show()

In [None]:
# Inspect the rating column.
df.rating

In [None]:
# Distribution of the non-missing ratings, with the median marked by a dashed
# red line.
# sns.distplot is deprecated in seaborn; histplot(kde=True, stat='density')
# together with rugplot draws the same histogram + KDE + rug figure.
ratings = df['rating'].dropna()

ratings_fig, ratings_ax = plt.subplots()
sns.histplot(ratings, kde=True, stat='density', ax=ratings_ax)
sns.rugplot(ratings, ax=ratings_ax)
ratings_ax.axvline(ratings.median(), color='r', linestyle='--')
ratings_ax.grid(True)
ratings_ax.set_title("Distribution of Ratings")
plt.show()

In [None]:
# Count of distinct job titles.
df.job_title.nunique()

In [None]:
# Pie chart of the ten most frequent job titles; the counts are computed once
# and reused for both the slice values and their labels.
top_job_titles = df.job_title.value_counts().iloc[:10]

fig = go.Figure()
fig.add_traces(go.Pie(
    values=top_job_titles.to_list(),
    labels=top_job_titles.index.to_list(),
    # The trace was previously named 'Industry', copied from the industry
    # chart; this chart shows job titles.
    name='Job Title',
    textposition='inside',
    textinfo='percent+label',
))
fig.update_layout(
    template='ggplot2',
    title='Job Title with most number of Data Science Related jobs',
)
fig.show()

In [None]:
# Print the first job description one line at a time, with blank lines
# (doubled newlines) collapsed to single line breaks.
x = df.job_desc[0].replace('\n\n', '\n').split('\n')

print('\n'.join(x))

In [None]:
# Replace newlines in the job descriptions with spaces. A doubled newline
# becomes one space and any remaining single newline becomes one space, which
# is what the earlier pair of replace calls did; that pair was also repeated
# verbatim a second time, where it had nothing left to match.
df['job_desc'] = df['job_desc'].replace(r'\n{1,2}', ' ', regex=True)
#Let's remove punctuation and Stopwords

# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def Remove_puncutations_stopwords(s):
    """Strip ASCII punctuation from ``s`` and then drop its stopwords.

    Parameters
    ----------
    s : str
        Text to clean.

    Returns
    -------
    str
        ``s`` with every character of ``string.punctuation`` deleted and the
        result passed through gensim's ``remove_stopwords``.
    """
    # str.translate deletes the punctuation in a single pass instead of testing
    # each character against the punctuation string in Python.
    s = s.translate(_PUNCTUATION_TABLE)
    s = remove_stopwords(s)
    return s

# Clean every job description once. This statement used to appear twice in a
# row; the second run only repeated the same punctuation and stopword removal
# on text that had already been cleaned.
df['job_desc'] = df['job_desc'].apply(Remove_puncutations_stopwords)

In [None]:
# Inspect the job descriptions after cleaning.
df.job_desc

In [None]:
# Bigram frequencies in the job descriptions: English stop words are excluded
# and a bigram must occur in at least 3 descriptions to be kept.
vec = CountVectorizer(min_df=3, stop_words='english', ngram_range=(2, 2))
docs = vec.fit_transform(df.job_desc)

# get_feature_names() was removed in scikit-learn 1.2. Use its replacement when
# available and fall back only on older releases that do not have it yet.
if hasattr(vec, 'get_feature_names_out'):
    features = list(vec.get_feature_names_out())
else:
    features = vec.get_feature_names()

visualizer = FreqDistVisualizer(features=features, orient='h', size=(800, 800))
visualizer.fit(docs)
visualizer.show()

In [None]:
# Word-bigram frequencies in the job titles.
vec = CountVectorizer(analyzer='word', ngram_range=(2, 2))
docs = vec.fit_transform(df.job_title)

# get_feature_names() was removed in scikit-learn 1.2. Use its replacement when
# available and fall back only on older releases that do not have it yet.
if hasattr(vec, 'get_feature_names_out'):
    features = list(vec.get_feature_names_out())
else:
    features = vec.get_feature_names()

visualizer = FreqDistVisualizer(features=features, orient='h', size=(800, 800))
visualizer.fit(docs)
visualizer.show()