In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as datetime
import plotly.graph_objs as go
from plotly.offline import iplot
import plotly.figure_factory as ff
import cufflinks
cufflinks.go_offline()

cufflinks.set_config_file(world_readable=True, theme='pearl', offline=True)


import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Information you will gain from this notebook:
## 1. how data analysts labour market looks like
### a. what salary they may get
### b. which industry and sector need them
### c. which state offers a lot of vacancies
## 2. What you need to highlight for getting a high salary data analyst job
### a. company size and salary relationship
### b. revenue and salary relationship
### c. rich data analysts state
### d. rich data analyst industry
## 3. Top vacancies from best companies with high salaries
## 4. Information for Juniors
### a. the best state for juniors
### b. the best industry for juniors
### c. which salary juniors may expect

# Also here you can see step by step data preparation

Data columns:
* unnamed
* Job title
* salary estimate
* job description
* rating
* company name
* location
* headquarters
* size
* founded
* type of ownnership
* industry
* sector
* revenue
* competitors
* easy apply

In [None]:
data = pd.read_csv('../input/data-analyst-jobs/DataAnalyst.csv')

### Looking at the data

In [None]:
data.head()

### Taking some information about the data

In [None]:
data.info()

In [None]:
data.describe()

# Data preparation

### Checkinng missing values

In [None]:
print(data.isnull().sum())

### Getting rid of one row with missing value in company name column

In [None]:
data = data.dropna()

### Getting rid of Unnamed column, becouse there is no information

In [None]:
data = data.drop('Unnamed: 0', axis=1)

### Renaming columns for comfortable queries

In [None]:
old_columns = data.columns
new_columns = ['job_title', 'salary', 'description','rating', 'comp_name','location',
              'headquarters','comp_size','founded','ownership_type','industry','sector','revenue',
              'competitors','easy_apply']
data = data.rename(columns=dict(zip(old_columns, new_columns)))

In [None]:
data

### Salary column is represented in text form, so see unique values

In [None]:
data.salary.unique()

### The majority of values is represented in '(min salary) - (max salary) (Glassdoor est.)'form, but there is columns with '-1'

In [None]:
data[data.salary=='-1']

### Replace this '-1' to standard form of '(min salary) - (max salary) (Glassdoor est.)' for next replacement (min and max values for this row is 0)

In [None]:
data[data.salary=='-1'] = data[data.salary=='-1'].replace('-1', '$0K-$0K (Glassdoor est.)')

### Preparation of salary column, where we find max and min salary for each vacancy and then calculate mean salary for every one

In [None]:
salary_min = data.salary.apply(lambda x: x[0:x.index('-')])
salary_min = salary_min.apply(lambda x: x[1:(x.find('K'))])
salary_max = data.salary.apply(lambda x: x[x.find('-')+1:-1])
salary_max = salary_max.apply(lambda x: x[1:(x.find('K'))])

In [None]:
salary_min = pd.to_numeric(salary_min)
salary_max = pd.to_numeric(salary_max)
salary = (pd.DataFrame(salary_max) + pd.DataFrame(salary_min))/2
data['mean_salary'] = salary.salary.apply(lambda x: int(x))

### See company name column. There is a mistake: ratings are at the end of each line

In [None]:
data.comp_name.head()

### Deletting rating values from company name columns

In [None]:
data['comp_name'] = data['comp_name'].apply(lambda x: x.splitlines()[0])

### Loction column is represented in text form and there is no mistakes, so we don't have to do anything with it.

In [None]:
data.location.unique()

### Headquarters column is represented in text form by sample 'City, State', but there is -1 value

In [None]:
data.headquarters.unique()

### See rows with headquarters with '-1' value

In [None]:
data[data.headquarters=='-1']

### As we can see there is 171 rows with '-1' values in headquarters column. But all these rows have '-1' in rating, company size, founded, ownership type, industry, sector, revenue columns, so I have decided to replace '-1' to 'no information'.

In [None]:
data = data.replace('-1', 'no information')

### See unique values in company size and revenue columns. 'Unknown', 'Unknown / Non-Applicable' and 'no information' mean the same, so I will replace first and second values to 'information'

In [None]:
print('BEFORE REPLACING')
print('Company size uniques')
print(data.comp_size.unique())
print('--------------------')
print('Revenue uniques')
print(data.revenue.unique())
print('--------------------')
data['comp_size'] = data['comp_size'].replace('Unknown', 'no information')
data['revenue'] = data['revenue'].replace('Unknown / Non-Applicable', 'no information')
print('AFTER REPLACING')
print('Company size uniques')
print(data.comp_size.unique())
print('--------------------')
print('Revenue uniques')
print(data.revenue.unique())

### In ownership type, industry and sector columns we have a lot of text data, which doesn't have mistakes

In [None]:
print('Ownership type uniques')
print(data.ownership_type.unique())
print('Industry uniques')
print(data.industry.unique())
print('Sector uniques')
print(data.sector.unique())

### In easy apply and competitors we have values '0K-0K (Glassdoor est.)', which we've got after thirst data preparation step with salary. And in easy_apply column we have changed '-1' to 'no information', so let's check how many such rows we have

In [None]:
print(data.easy_apply.unique())
print('There is {:.2f} % rows with such values in these columns'.format(
    len(data[(data.competitors=='$0K-$0K (Glassdoor est.)')|(data.easy_apply=='no information')|
             (data.easy_apply=='$0K-$0K (Glassdoor est.)')])/len(data)*100))

### There is too much rows with such values in these columns, so let's delete competitors and easy apply columns

In [None]:
data = data.drop(['competitors', 'easy_apply'], axis=1)

### In rating and founded columns we have -1 values,change it to 0.

In [None]:
data = data.replace(-1, 0)

### Data for analysis

In [None]:
data.head()

# Data Analysis

### See how companies name specialists in Data Analysis sphere

In [None]:
px.bar(data.job_title.value_counts().reset_index().head(30), x='index', y='job_title', labels={'index':'job title', 'job_title':'amount of vacancies'},
                                                                                             title = 'Names of vacancies distribution',
      color = 'job_title')

### What salaries can data analysts expect

In [None]:
px.bar(data.salary.value_counts().reset_index(), x='index', y='salary', labels={'index':'salary', 'salary':'amount of vacancies'},
                                                                                             title = 'Salary distribution',
      color = 'salary')

### See min, max and mean salary distribution specialists cn expect

In [None]:
salary_df = pd.DataFrame({'minn':salary_min, 'maxx':salary_max, 'meann':data.mean_salary})
salary_df = salary_df[(salary_df.minn!=0)|(salary_df.maxx!=0)|(salary_df.meann!=0)]
fig = go.Figure()
fig.add_trace(go.Box(y=salary_df['minn'].values, name = 'Min salary boxplot'))
fig.add_trace(go.Box(y=salary_df['maxx'].values, name = 'Max salary boxplot'))
fig.add_trace(go.Box(y=salary_df['meann'].values, name = 'Mean salary boxplot'))

### What sectors need Data Analysts the most?
### The answer is IT and Business Services

In [None]:
px.histogram(data[data.sector!='no information'], x='sector', color='sector', title = 'Amount of vacancies in each sector')

In [None]:
data['state']=data['location'].apply(lambda x: x.split(',')[1])
data['location']=data['location'].apply(lambda x: x.split(',')[0])

### What states need Data Analysts the most?
### The answer is CA, NY and TX

In [None]:
px.histogram(data, x='state', color='state', title = 'Amount of vacancies in each state')

In [None]:
v=pd.DataFrame(data.groupby('revenue').mean_salary.value_counts()).rename(columns={'mean_salary':'amount of vacancies'}).reset_index()
v=v[v.revenue!='no information']
dictionary={'Less than $1 million (USD)':1,'$1 to $5 million (USD)':2,
                                                    '$5 to $10 million (USD)':3,'$10 to $25 million (USD)':4,
                                                    '$25 to $50 million (USD)':5,'$50 to $100 million (USD)':6,
                                                    '$100 to $500 million (USD)':7,'$500 million to $1 billion (USD)':8,
                                                    '$1 to $2 billion (USD)':9,'$2 to $5 billion (USD)':10,'$5 to $10 billion (USD)':11,
                                                    '$10+ billion (USD)':12,'no information':0}
v['rang'] = v['revenue'].map(dictionary)
v=v.sort_values('rang')

In [None]:
px.scatter(v[v.mean_salary!=0], x='mean_salary', y='revenue', size='amount of vacancies', color='mean_salary',
          title = 'Amount of vacancies with each salaries in companies grouped by revenue')

In [None]:
v=pd.DataFrame(data.groupby('comp_size').mean_salary.value_counts()).rename(columns={'mean_salary':'amount of vacancies'}).reset_index()
v=v[v.comp_size!='no information']
dictionary={'1 to 50 employees':1, '51 to 200 employees':2, '201 to 500 employees':3,
                                                        '501 to 1000 employees':4,'1001 to 5000 employees':5, '5001 to 10000 employees':6,
                                                         '10000+ employees':7,'no information':0}
v['rang'] = v['comp_size'].map(dictionary)
v=v.sort_values('rang')

In [None]:
px.scatter(v[v.mean_salary!=0], x='mean_salary', y='comp_size', size='amount of vacancies', color='mean_salary',
           title = 'Amount of vacancies with each mean salarie in companies grouped by company size')

### What sector and industry specilist should choose to get high salary?
### The answer is Education Sector in Education Training Services, Retail in Drug & Health Srores and Manufacturing in Health Care Products Manufacturing
### The lowest salary in Arts, Entertainment & Recreation sector in Audiovisual industry

In [None]:
y=pd.DataFrame(data.groupby(['industry','sector']).mean_salary.mean().reset_index())
px.scatter(y, x='industry', y='sector', color='mean_salary', size='mean_salary')

### Vacancy map
### The larger and more yellow the circle, the better the vacancy

In [None]:
px.scatter(data[data.rating!=0], x='state', y='sector', color='rating', hover_data=['comp_name'], size='mean_salary',
          title = 'Vacancy Map')

### Highlight top vacancies from companies with rating > 4.6 and salary > 68 K (becouse it is median salary from mean_salary column)

In [None]:
top = data[(data.rating>4.6)&(data.mean_salary>69)].sort_values(['mean_salary','rating'],ascending=False )

### Where are top vacancies? In CA

In [None]:
px.histogram(top, x='state', title = 'State distribution for top vacancies', color='state')

### In which industry and sectors top vacancies are? The answer is IT services

In [None]:
px.histogram(top, x='industry', title = 'Industry distribution for top vacancies', color='sector')

In [None]:
px.box(top.mean_salary.values, title = 'Salary description for top vacancies')

### Finding vacancies for juniors

In [None]:
junior = data[(data['job_title'].str.contains('junior'))|(data['job_title'].str.contains('Junior'))]
junior.head(10)

### Salary description for junior analysts

In [None]:
px.box(junior, junior.mean_salary.values, hover_data=['comp_name'], title = 'Junior Analysts salary')

### Vacancy map for juniors: the larger and more yellow the circle, the better the vacancy.
### The majority of vacancies are from good companies with rating 5.

In [None]:
px.scatter(junior, x='state', y='sector', color='rating', hover_data=['comp_name'], size='mean_salary',
          title = 'Vacancy Map for Juniors')

### What states need juniors and what salary they are ready to pay

In [None]:
px.histogram(junior, x='state', color='mean_salary')

### Hey, thank you for watching this notebook ♥ Please upvote if you like it