In [22]:
from bs4 import BeautifulSoup # For HTML parsing
import urllib2 # Website connections
import re # Regular expressions
from time import sleep # To prevent overwhelming the server between connections
from nltk.corpus import stopwords # Filter out stopwords, such as 'the', 'or', 'and'
import pandas as pd # For converting results to a dataframe and bar chart plots
from collections import Counter

%matplotlib inline

In [2]:
# library for the key words. Stretch goal: build one from the job description using a topic model: LDA.

program_languages=['bash','r','python','java','c++','ruby','perl','matlab','javascript','scala','php','c#']
analysis_software=['excel','tableau','d3.js','sas','spss','d3','saas','pandas','numpy','scipy','sps','spotfire','scikits.learn','splunk','powerpoint','h2o']
bigdata_tool=['hadoop','mapreduce','spark','pig','hive','shark','oozie','zookeeper','flume','mahout']
databases=['sql','nosql','hbase','cassandra','mongodb','mysql','mssql','postgresql','oracle db','rdbms']
overall_dict = program_languages + analysis_software + bigdata_tool + databases


def text_cleaner(text):
    
    lines = (line.strip() for line in text.splitlines()) # break into lines
    chunks = (phrase.strip() for line in lines for phrase in line.split("  ")) # break multi-headlines into a line each
    
    def chunk_space(chunk):
        chunk_out = chunk + ' ' # Need to fix spacing issue
        return chunk_out  
        
    text = ''.join(chunk_space(chunk) for chunk in chunks if chunk).encode('utf-8') # Get rid of all blank lines and ends of line
    # Now clean out all of the unicode junk (this line works great!!!)
        
    try:
        text = text.decode('unicode_escape').encode('ascii', 'ignore') # Need this as some websites aren't formatted
    except:                                                            # in a way that this works, can occasionally throw
        return                                                         # an exception
        
    text = re.sub("[^a-zA-Z.+3#]"," ", text)  # Now get rid of any terms that aren't words (include 3 for d3.js)

    text = text.lower().split()  # Go to lower case and split them apart
    stop_words = set(stopwords.words("english")) # Filter out any stop words
    text = [w for w in text if (not w in stop_words)&(w in overall_dict)]
    #text = list(set(text)) 
    
    return text

In [56]:
SF = pd.read_csv('SF_jobs.csv')

In [57]:
skills = []

for i in range(SF.shape[0]):
     skills = skills + text_cleaner(SF.job_description[i])

In [58]:
counts = Counter(skills)


In [59]:
counts

Counter({'bash': 45,
         'c#': 141,
         'c++': 125,
         'cassandra': 122,
         'd3': 23,
         'd3.js': 13,
         'excel': 189,
         'flume': 8,
         'hadoop': 352,
         'hbase': 76,
         'hive': 100,
         'java': 830,
         'javascript': 400,
         'mahout': 7,
         'mapreduce': 45,
         'matlab': 21,
         'mongodb': 57,
         'mssql': 23,
         'mysql': 129,
         'nosql': 200,
         'numpy': 13,
         'oozie': 4,
         'pandas': 6,
         'perl': 114,
         'php': 75,
         'pig': 42,
         'postgresql': 58,
         'powerpoint': 66,
         'python': 533,
         'r': 200,
         'rdbms': 51,
         'ruby': 159,
         'saas': 132,
         'sas': 78,
         'scala': 165,
         'scipy': 3,
         'shark': 2,
         'spark': 231,
         'splunk': 79,
         'spotfire': 15,
         'spss': 9,
         'sql': 959,
         'tableau': 91,
         'zookeeper': 12})