In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
FILEPATH = '/kaggle/input/data-analyst-jobs/DataAnalyst.csv'

## Data Extraction

In [None]:
df = pd.read_csv(FILEPATH)

In [None]:
df.describe()

In [None]:
df.info()

In [None]:
df.head()

## Null Visualization

Let's visualize the null values to understand our data better. I am using `missingno` for the null visualization.

In [None]:
df.isnull().any().any()

In [None]:
df.isnull().any()

In [None]:
def get_missing_values(c_df):
    
    values = c_df.isnull().sum()
    percentage = 100 * c_df.isnull().sum() / len(c_df)
    table = pd.concat([values, percentage.round(2)], axis=1)
    table.columns = ['No of missing values', '% of missing values']
    
    return table[table['No of missing values'] != 0].sort_values('% of missing values', ascending = False).style.background_gradient('Greens')

In [None]:
get_missing_values(df)

In [None]:
import missingno as miss

In [None]:
miss.matrix(df)

In [None]:
miss.dendrogram(df)

In [None]:
miss.bar(df)

**Observation:**

* The data has almost no null values: only one entry is missing, in `Company Name`.

In [None]:
df.isnull().sum()

In [None]:
df.isnull().sum().sum()

In [None]:
df.columns

We will remove `Unnamed: 0` as it is not significant.

In [None]:
# `Unnamed: 0` carries nothing useful for this analysis, so drop that column.
df = df.drop(columns=['Unnamed: 0'])

### Understanding Founded column

In [None]:
# Old company
df['Founded'].min()

In [None]:
# Fifth-smallest distinct `Founded` value: de-duplicate, keep the five smallest
# (returned in ascending order) and take the last of them.
df['Founded'].drop_duplicates().nsmallest(5).iloc[-1]

In [None]:
df['Founded'].drop_duplicates().nsmallest(10)

In [None]:
df['Founded'].max()

In [None]:
# New company
df['Founded'].drop_duplicates().nlargest(10)

### Clean up Company Name

In [None]:
# Drop the rows that have no company name (the column itself is kept).
# Reassigning instead of using `inplace=True` keeps the step explicit and chainable.
df = df.dropna(subset=["Company Name"])

In [None]:
df_companies = df['Company Name']

In [None]:
df_companies

The company name is followed by a newline and the company's rating (e.g. `\n3.2`), so we strip everything from the newline onwards.

In [None]:
# remove the rating `\n3.2`
def clean_company_name(company_name):
    """Return the company name without the rating that follows a newline.

    Everything from the first newline onwards is cut off; a name that
    contains no newline is returned unchanged.
    """
    # Cut at the first newline, or at the end of the string when there is none.
    cut_at = company_name.index('\n') if '\n' in company_name else len(company_name)
    return company_name[:cut_at]

In [None]:
df['Company Name'] = df['Company Name'].apply(clean_company_name)

In [None]:
df['Size'].unique()

In [None]:
df['Type of ownership'].unique()

In [None]:
# A `Type of ownership` of '-1' carries no information, so relabel it as 'Unknown'.
df = df.replace(to_replace={'Type of ownership': {'-1': 'Unknown'}})

### Clean up Location

In [None]:
df.sample(2)

In [None]:
# Split each location on commas: the first piece is the city and the LAST piece is the
# state, so a value with extra parts (e.g. "City, County, ST") yields the state rather
# than the middle piece.
location_parts = df['Location'].str.split(',')

# A location without any comma has no state part; leave it missing instead of
# raising IndexError.
df['Location_State'] = location_parts.str[-1].str.strip().where(location_parts.str.len() > 1)
df['Location_City'] = location_parts.str[0].str.strip()

In [None]:
df.sample(2)

## Plot Time

In [None]:
# How many companies are here

import seaborn as sns
import matplotlib.pyplot as plt


def show_simple_bar_plot(col):
    """Draw a bar chart of how often each distinct value of ``df[col]`` occurs.

    Parameters
    ----------
    col : str
        Name of the column of the notebook-level ``df`` to count.

    The bars appear in ``value_counts()`` order (most frequent first) and the
    x tick labels are rotated by 90 degrees so long category names stay readable.
    """
    # Count once and reuse the result for both axes.
    counts = df[col].value_counts()

    ax = sns.barplot(x=counts.index, y=counts.values)
    ax.set(xlabel=col, ylabel='Count')

    # Rotate through tick_params: re-setting the labels with set_xticklabels()
    # without fixing the tick positions first makes matplotlib emit a warning.
    ax.tick_params(axis='x', labelrotation=90)
    plt.show()

In [None]:
show_simple_bar_plot('Type of ownership')

**Observation:**

* We can see that `Company - Private` has more jobs.
* Not sure what to do with `Unknown`. Let's keep it as it is for now. We will fix them later.
* Self-employed is the least in the jobs count.

In [None]:
# A `Size` of '-1' carries no information, so relabel it as 'Unknown'.
df = df.replace(to_replace={'Size': {'-1': 'Unknown'}})

In [None]:
show_simple_bar_plot('Size')

**Observation:**

* Small to medium companies (51 - 200 employees) are more likely to hire data analysts. 
* Big companies (10000+ employees) are the second most likely to hire data analysts.

### Salary Column Cleanup

In [None]:
# clean up salary column
def clean_salary(salary):
    """Return the salary text with any trailing parenthesised note removed.

    Everything from the first '(' onwards is dropped; text without a '('
    is returned unchanged.
    """
    if '(' in salary:
        return salary[:salary.index('(')]
    return salary

In [None]:
df['Salary Estimate'] = df['Salary Estimate'].apply(clean_salary)

As we need the salary minimum and maximum to analyze and visualize better, we are going to create those 2 columns from the `Salary Estimate` column.

In [None]:
import re

def get_salary_min(salary):
    """Return the lower bound of a salary range as an integer.

    The lower bound is the first run of digits in ``salary``, so
    ``'$37K-$66K'`` gives ``37``. This no longer depends on which dash
    character separates the two bounds: the previous implementation split on
    the Unicode minus sign (U+2212) while ``get_salary_max`` split on the
    ASCII hyphen, so for ASCII-hyphen input the split did nothing.

    Parameters
    ----------
    salary : str
        Salary text such as ``'$37K-$66K'``.

    Returns
    -------
    int
        The first number found in ``salary``. A bare ``'-1'`` yields ``1``,
        as it did before.

    Raises
    ------
    ValueError
        If ``salary`` contains no digits at all.
    """
    amounts = re.findall(r'\d+', salary)

    if not amounts:
        raise ValueError(f'no salary amount found in {salary!r}')

    return int(amounts[0])

In [None]:
def get_salary_max(salary):
    """Return the upper bound of a salary range as an integer.

    The text is split once on the first dash (ASCII hyphen, Unicode minus
    U+2212 or en dash U+2013) and the first run of digits after it is the
    upper bound, so ``'$37K-$66K'`` gives ``66``. When there is no dash the
    single amount is used, instead of failing with ``IndexError``.

    Parameters
    ----------
    salary : str
        Salary text such as ``'$37K-$66K'``.

    Returns
    -------
    int
        The upper bound. A bare ``'-1'`` yields ``1``, as it did before.

    Raises
    ------
    ValueError
        If no digits are found in the upper part of ``salary``.
    """
    # maxsplit=1 leaves either [whole] or [lower, rest]; the last item is the upper part.
    upper_part = re.split(r'[-\u2212\u2013]', salary, maxsplit=1)[-1]

    match = re.search(r'\d+', upper_part)
    if match is None:
        raise ValueError(f'no maximum salary amount found in {salary!r}')

    return int(match.group())

In [None]:
df['Salary Min'] = df['Salary Estimate'].apply(get_salary_min)
df['Salary Max'] = df['Salary Estimate'].apply(get_salary_max)

In [None]:
# Ten largest distinct maximum-salary values
df['Salary Max'].drop_duplicates().nlargest(10)

In [None]:
# Ten smallest distinct minimum-salary values
df['Salary Min'].drop_duplicates().nsmallest(10)

### Clean Easy Apply Column

In [None]:
# Easy apply fix
df['Easy Apply'].unique()

In [None]:
# Encode `Easy Apply` as a flag: the string 'True' becomes 1 and '-1' becomes 0.
df = df.replace({'Easy Apply': {'True': 1, '-1': 0}})

## Plotly Time

Let's play around with Plotly to visualize things better.

In [None]:
# Show the locations with the highest average minimum / maximum salary

import plotly.express as px
import plotly.graph_objects as go

# Average both salary bounds per location and keep the 20 highest.
# The sort must be descending: an ascending sort followed by head(20) returns
# the 20 LOWEST-paying locations. (The variable name is kept as it was.)
df_ownership = (
    df.groupby('Location')[['Salary Max', 'Salary Min']]
    .mean()
    .sort_values(['Salary Max', 'Salary Min'], ascending=False)
    .head(20)
)

fig = go.Figure()

fig.add_trace(go.Bar(x=df_ownership.index, y=df_ownership['Salary Min'], name='Min Salary', marker=dict(color='green')))
fig.add_trace(go.Bar(x=df_ownership.index, y=df_ownership['Salary Max'], name='Max Salary', marker=dict(color='orange')))

# Grouped bars keep min and max readable side by side; stacking them would make
# each bar's height the sum of the two, which is not a meaningful quantity.
fig.update_layout(
    title='Top Locations and their Salary Range',
    xaxis_title='Location',
    yaxis_title='Average salary estimate',
    barmode='group',
)

fig.show()

**Observation:**

* Newark tops the list with the maximum salary range followed by Daly city California.
* Stanford and San Francisco - California has the least in the salary category.

In [None]:
fig = px.scatter(df, x = "Salary Min", y = "Salary Max",
                 color = "Salary Max", color_continuous_scale = 'Inferno')

fig.show()

**Observation:**

* The above diagram shows the salary range very clearly.
* There are some jobs that pay only 24K (wondering who would apply for those jobs?).
* The maximum salary is 190K, which is very alluring.

In [None]:
# Distribution of the minimum salary across all postings,
# drawn from a frame holding just the two salary columns.
df_salary_min_max = df.loc[:, ['Salary Max', 'Salary Min']]

fig = px.histogram(data_frame=df_salary_min_max, x='Salary Min')
fig.show()

In [None]:
df.sample(2)

In [None]:
# Ten states with the most job postings.
# The column names are set explicitly: pandas >= 2.0 names the value_counts()
# result 'count', so a bare reset_index() no longer produces the
# 'index' / 'Location_State' columns that the pie-chart cell reads.
state_df = (
    df['Location_State']
    .value_counts()
    .head(10)
    .rename_axis('index')
    .reset_index(name='Location_State')
)

state_df.style.background_gradient(cmap='YlGnBu', low=0, high=0, axis=0, subset=None)

In [None]:
# Donut chart of the job share of the ten states in `state_df`.
# The columns are read by position (labels first, counts second) because the
# names produced by value_counts().reset_index() differ between pandas versions;
# looking up 'index' raises KeyError on pandas >= 2.0.
state_labels = state_df.iloc[:, 0]
state_counts = state_df.iloc[:, 1]

state_fig = go.Figure(data=[go.Pie(labels=state_labels,
                             values=state_counts,
                             hole=.7,
                             title = 'Count by State',
                             marker_colors = px.colors.sequential.Blues_r,
                            )
                     ])
state_fig.update_layout(title = 'Job Count % by State')
state_fig.show()

In [None]:
# Ten cities with the most job postings.
# The column names are set explicitly: pandas >= 2.0 names the value_counts()
# result 'count', so a bare reset_index() no longer produces the
# 'index' / 'Location_City' columns that the pie-chart cell reads.
city_df = (
    df['Location_City']
    .value_counts()
    .head(10)
    .rename_axis('index')
    .reset_index(name='Location_City')
)

city_df.style.background_gradient(cmap='YlGnBu', low=0, high=0, axis=0, subset=None)

In [None]:
# Donut chart of the job share of the ten cities in `city_df`.
# The columns are read by position (labels first, counts second) because the
# names produced by value_counts().reset_index() differ between pandas versions;
# looking up 'index' raises KeyError on pandas >= 2.0.
city_labels = city_df.iloc[:, 0]
city_counts = city_df.iloc[:, 1]

city_fig = go.Figure(data=[go.Pie(labels=city_labels,
                             values=city_counts,
                             hole=.7,
                             title = 'Count % by City',
                             marker_colors = px.colors.sequential.Blues_r,
                            )
                     ])
city_fig.update_layout(title = 'Job Count % by Location City')
city_fig.show()

In [None]:
df.sample(2)

## Job Desc - Word Cloud

Let's create a word cloud out of job description in the dataset.

In [None]:
job_desc = ', '.join(df['Job Description'])

In [None]:
# Make a word cloud out of Job description

from wordcloud import WordCloud, STOPWORDS
from PIL import Image

stopwords = set(STOPWORDS)

def transform_format(val):
    """Map a pixel value of 0 to 255 and return any other value unchanged."""
    return 255 if val == 0 else val

# Load the mask picture; the context manager closes the file once the pixels
# have been copied into the array.
with Image.open("../input/pictures2/man.png") as mask_file:
    mask_image = np.array(mask_file)

# Turn every 0-valued pixel into 255 and keep all other values, stored as int32.
# np.where does this in one vectorised step instead of mapping
# transform_format over each pixel, row by row.
transformed_mask_image = np.where(mask_image == 0, 255, mask_image).astype(np.int32)

# Configure the word cloud: white background, at most 1000 words, drawn inside
# the prepared mask, with the stop-word set excluded and no contour line
# (contour_width is 0).
wc = WordCloud(background_color = "white", max_words = 1000, mask = transformed_mask_image,
               stopwords = stopwords, contour_width = 0, contour_color = 'black')

# Build the cloud from the concatenated job descriptions.
wc.generate(job_desc)

# Render the cloud on a large figure with the axes hidden.
plt.figure(figsize = [30, 20])
plt.imshow(wc, interpolation = 'bilinear')
plt.axis("off")
plt.show()

**Observation:**

* The word cloud turned out nice!
* Everyone needs `Experience` in the job application, huh? Don't they accept Kaggle notebook experience?

**Final Notes:**

I am still adding things. You can come back and check for more information.

Also, if you **like my notebook**, <font style="color:blue;font-size:14px;">please upvote it</font> as it will motivate me to come up with better approaches in the upcoming notebooks.

<font color="blue" size=+1.5><b>Check out my other kernels</b></font>

<table style="font-family: 'Trebuchet MS', Arial, Helvetica, sans-serif;border-collapse: collapse;width: 100%;">
  <tr>
    <th style="border: 1px solid #ddd;padding: 8px; padding-top: 12px;padding-bottom: 12px;text-align: left;background-color: #2987E7;color: white;">Notebook</th>
    <th style="border: 1px solid #ddd;padding: 8px; padding-top: 12px;padding-bottom: 12px;text-align: left;background-color: #2987E7;color: white;">Tags</th>
  </tr>
  <tr>
    <td style="text-align: left"><a href="https://www.kaggle.com/rajacsp/sof-questions-eda-and-visual">SOF Questions - EDA and Visual</a> </td>
    <td style="text-align: left">Data Visual, Plotly</td>
  </tr>
  <tr>
    <td style="background-color: #f2f2f2;text-align: left"><a href="https://www.kaggle.com/rajacsp/netflix-visualization-plotly-plots-treemap">Netflix - Visualization, Plotly, Plots, and Treemap</a> </td>
    <td style="background-color: #f2f2f2;text-align: left">Data Visual, Data Cleaning, Plotly</td>
  </tr>
  <tr>
    <td style="text-align: left"><a href="https://www.kaggle.com/rajacsp/prediction-with-various-algorithms">Prediction with various Algorithms</a> </td>
    <td style="text-align: left">Random Forest, Logistic Regression</td>
  </tr>
  <tr>
    <td style="background-color: #f2f2f2;text-align: left"><a href="https://www.kaggle.com/rajacsp/eda-and-visualization">EDA and Visualization</a> </td>
    <td style="background-color: #f2f2f2;text-align: left">Data Cleaning, Data Visual</td>
  </tr>
  <tr>
    <td style="text-align: left"><a href="https://www.kaggle.com/rajacsp/job-analysis-eda-visual">Job Analysis - EDA and Visual</a> </td>
    <td style="text-align: left">Data Visual, EDA, Plotly</td>
  </tr>   
  <tr>
    <td style="background-color: #f2f2f2;text-align: left"><a href="https://www.kaggle.com/rajacsp/estonia-disaster-visualization">Estonia Disaster - Visualization</a> </td>
    <td style="background-color: #f2f2f2;text-align: left">Data Visual, EDA, Data Cleaning</td>
  </tr>
    
  <tr>
    <td style="text-align: left"><a href="https://www.kaggle.com/rajacsp/pandas-dundas-challenge-100" >Pandas 100+ exercises collection</a></td>
    <td style="text-align: left">Pandas, Data Manipulation</td>
  </tr>   
  <tr>
    <td style="background-color: #f2f2f2;text-align: left"><a href="https://www.kaggle.com/rajacsp/prediction-with-various-algorithms">Credit Card Fraud - Prediction with various algorithms</a></td>
    <td style="background-color: #f2f2f2;text-align: left">Various ML Algorithms</td>
  </tr>  
  <tr>
    <td style="text-align: left"><a href="https://www.kaggle.com/rajacsp/linear-equations-real-time">Linear Equations - Real Time</a> </td>
    <td style="text-align: left">Linear Equation</td>
  </tr>  
</table>
