In [1]:
import pandas as pd
from gensim import corpora, models
from gensim.similarities import MatrixSimilarity
from gensim.utils import SaveLoad
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re 
import pyLDAvis.gensim
from collections import Counter
from gensim.matutils import corpus2csc, sparse2full, corpus2dense
import gensim

In [2]:
# Load the scraped job postings for each city. The 'Unnamed: 0' column is
# dropped right after reading (presumably a saved index from an earlier
# to_csv call -- TODO confirm against the files).
SF = pd.read_csv('SF_jobs_new.csv').drop('Unnamed: 0', axis=1)
NYC = pd.read_csv('NYC_jobs_new.csv').drop('Unnamed: 0', axis=1)
Seattle = pd.read_csv('Seattle_jobs_new.csv').drop('Unnamed: 0', axis=1)

In [4]:
def Preprocessing_line(line):
    """Turn one line of text into a list of lemmatized noun tokens.

    The line is lower-cased, tokenized and POS-tagged with NLTK. Only tokens
    tagged exactly 'NN' are kept; each one is lemmatized with the WordNet
    lemmatizer, and English stop words are filtered out last.

    Returns a list of token strings (possibly empty).
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(line.lower()))
    nouns = [token for token, pos in tagged_tokens if pos == 'NN']

    lemmatizer = WordNetLemmatizer()
    english_stop_words = set(stopwords.words("english"))

    tokens = []
    for noun in nouns:
        lemma = lemmatizer.lemmatize(noun)
        if lemma in english_stop_words:
            continue  # stop words are checked against the lemmatized form
        tokens.append(lemma)
    return tokens

def text_cleaner(html,Oneline=True):
    """Convert a raw HTML document into a list of cleaned noun tokens.

    Steps: parse the HTML with the lxml parser, drop <script>/<style>
    elements, extract the visible text, collapse it into one space-separated
    line, strip everything that is not plain ASCII, replace every character
    outside ``[a-zA-Z+3#]`` with a space, and finally tokenize the result
    with ``Preprocessing_line``.

    Parameters
    ----------
    html : str
        Raw HTML markup of the page.
    Oneline : bool, optional
        Not used by this function; kept so existing callers keep working.

    Returns
    -------
    list of str or None
        The tokens produced by ``Preprocessing_line``, or ``None`` when the
        text cannot be pushed through the unicode-escape clean-up step.
    """
    soup_obj = BeautifulSoup(html, "lxml")

    # Script and style contents are not visible text; remove them first.
    for element in soup_obj(["script", "style"]):
        element.extract()

    # Markup without a <body> (e.g. an empty document) leaves ``body`` as
    # None; fall back to the whole document instead of crashing.
    root = soup_obj.body if soup_obj.body is not None else soup_obj
    text = root.get_text('\n')

    # Break into lines, then split multi-headline lines on double spaces,
    # and re-join the non-empty pieces with single spaces.
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    text = ' '.join(chunk for chunk in chunks if chunk)

    # Interpret literal backslash escapes and drop every non-ASCII character.
    # The final decode turns the bytes back into text: without it re.sub
    # below raises TypeError on Python 3 (str pattern on a bytes object).
    try:
        text = (text.encode('utf-8')
                    .decode('unicode_escape')
                    .encode('ascii', 'ignore')
                    .decode('ascii'))
    except UnicodeError:
        # Some pages contain malformed escape sequences; treat them as
        # uncleanable rather than aborting the whole run.
        return None

    # Keep letters plus '+', '3' and '#' (so terms like C++, C#, d3 survive).
    text = re.sub("[^a-zA-Z+3#]", " ", text)

    return Preprocessing_line(text)

In [5]:
# Clean every posting, city by city (SF, then NYC, then Seattle), collecting
# the token list returned by text_cleaner for each job description.
cleanDescription = []
for city_jobs in (SF, NYC, Seattle):
    for row in range(city_jobs.shape[0]):
        cleanDescription.append(text_cleaner(city_jobs.job_description[row]))

In [2]:
# Minimum TF-IDF weight a term must reach inside a document to be kept.
TFIDF_THRESHOLD = 0.25

# NOTE(review): this cell reloads previously cleaned documents from disk and
# overwrites `cleanDescription`; its execution count is out of order with the
# cells around it, so verify the notebook under Restart & Run All.
# SECURITY: read_pickle can execute arbitrary code while unpickling -- only
# load files you created yourself.
cleaneddoc = pd.read_pickle('cleaned_doc.txt')
dictionary = gensim.corpora.Dictionary(cleaneddoc)
corpus = [dictionary.doc2bow(text) for text in cleaneddoc]
tfidf = gensim.models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

# For every document keep only the terms whose TF-IDF weight reaches the
# threshold. Each entry of a TF-IDF vector is (token_id, weight), so the
# word must be looked up through the dictionary: the position of an entry in
# the vector is unrelated to token positions in the original document.
# Each kept term appears once per document.
new_texts = [
    [dictionary[token_id] for token_id, weight in doc if weight >= TFIDF_THRESHOLD]
    for doc in corpus_tfidf
]

cleanDescription = new_texts

In [10]:
def makeDict(myTweetList):
    """Build a gensim Dictionary from a list of tokenized documents.

    myTweetList is passed straight to corpora.Dictionary; the resulting
    dictionary object is returned.
    """
    token_dictionary = corpora.Dictionary(myTweetList)
    return token_dictionary

def makeCorpus(myTweetList,myDict):
    """Build a bag-of-words corpus from tokenized documents.

    Each document in myTweetList is converted with myDict.doc2bow; the
    converted documents are returned as a list, in input order.
    """
    bow_corpus = []
    for document in myTweetList:
        bow_corpus.append(myDict.doc2bow(document))
    return bow_corpus

def createLDA(myCorpus, myDictionary,myTopics=10,myPasses=10,myIterations=50,myAlpha=0.001):
    """Train and return an LDA topic model via models.LdaMulticore.

    myCorpus is the bag-of-words corpus and myDictionary the id-to-word
    mapping; myTopics, myPasses, myIterations and myAlpha are forwarded as
    num_topics, passes, iterations and alpha respectively.
    """
    lda_options = {
        'id2word': myDictionary,
        'num_topics': myTopics,
        'passes': myPasses,
        'iterations': myIterations,
        'alpha': myAlpha,
    }
    return models.LdaMulticore(myCorpus, **lda_options)

In [11]:
# Build the dictionary and bag-of-words corpus from the filtered documents,
# then train the LDA model with createLDA's default settings.
kagDict = makeDict(myTweetList=cleanDescription)
kagCorpus = makeCorpus(myTweetList=cleanDescription, myDict=kagDict)
kagLda = createLDA(myCorpus=kagCorpus, myDictionary=kagDict)

In [12]:
# Turn on inline rendering, prepare the visualization data for the trained
# model, and leave it as the cell's last expression so the notebook shows it.
pyLDAvis.enable_notebook()
ldaViz = pyLDAvis.gensim.prepare(
    topic_model=kagLda,
    corpus=kagCorpus,
    dictionary=kagDict,
)
ldaViz