<img align="left" src="https://lever-client-logos.s3.amazonaws.com/864372b1-534c-480e-acd5-9711f850815c-1524247202159.png" width=200>
<br></br>

# Vector Representations
## *Data Science Unit 4 Sprint 2 Assignment 2*

In [1]:
import re
import string
from spacy.tokenizer import Tokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import requests
import spacy

from bs4 import BeautifulSoup

## 1) *Clean:* Job Listings from indeed.com that contain the title "Data Scientist" 

You have `job_listings.csv` in the data folder for this module. The text data in the description column is still messy - full of html tags. Use the [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) library to clean up this column. You will need to read through the documentation to accomplish this task. 

In [2]:
pwd

'/home/regina/Desktop/DS-Unit-4-Sprint-1-NLP/module2-vector-representations'

In [3]:
# Read in the csv

jobs = pd.read_csv('~/Desktop/DS-Unit-4-Sprint-1-NLP/module2-vector-representations/data/job_listings.csv')
jobs.head()

Unnamed: 0.1,Unnamed: 0,description,title
0,0,"b""<div><div>Job Requirements:</div><ul><li><p>...",Data scientist
1,1,b'<div>Job Description<br/>\n<br/>\n<p>As a Da...,Data Scientist I
2,2,b'<div><p>As a Data Scientist you will be work...,Data Scientist - Entry Level
3,3,"b'<div class=""jobsearch-JobMetadataHeader icl-...",Data Scientist
4,4,b'<ul><li>Location: USA \xe2\x80\x93 multiple ...,Data Scientist


In [4]:
# grab only the description column

descriptions = jobs['description']

In [5]:
jobs.dtypes

Unnamed: 0      int64
description    object
title          object
dtype: object

In [6]:
# just get rid of the HTML

def rm_html(html_string):
    '''
    removes HTML tags in a string
    '''
    soup = BeautifulSoup(html_string, 'html.parser')
    text = soup.get_text() # extract all text out of HTML
    return text


In [7]:
jobs['tokens'] = jobs['description'].apply(rm_html)


In [8]:
jobs['tokens'].head()

0    b"Job Requirements:\nConceptual understanding ...
1    b'Job Description\n\nAs a Data Scientist 1, yo...
2    b'As a Data Scientist you will be working on c...
3    b'$4,969 - $6,756 a monthContractUnder the gen...
4    b'Location: USA \xe2\x80\x93 multiple location...
Name: tokens, dtype: object

In [9]:
# cleaning function

def clean_punct(text):
    text = re.sub('[^a-z ]', ' ', text.lower())
    text = re.sub('\s+', ' ', text)
    # \s = any whitespace character
    # + = one or more of the character before it
    return text

In [10]:
jobs['tokens'] = jobs['tokens'].apply(clean_punct)

In [11]:
jobs['tokens'].head()

0    b job requirements nconceptual understanding i...
1    b job description n nas a data scientist you w...
2    b as a data scientist you will be working on c...
3    b a monthcontractunder the general supervision...
4    b location usa xe x x multiple locations n yea...
Name: tokens, dtype: object

## 2) Use Spacy to tokenize the listings 

In [12]:
# nlp = spacy.load("en_core_web_lg")

# tokens = []

# """ Make them tokens """
# for doc in tokenizer.pipe(jobs['tokens'], batch_size=500):
#     doc_tokens = [token.text for token in doc]
#     tokens.append(doc_tokens)

# jobs['tokens'] = tokens

## 3) Use Scikit-Learn's CountVectorizer to get word counts for each listing.

In [13]:
data = jobs['tokens']

In [14]:
type(data[0])

str

In [16]:
nlp = spacy.load("en_core_web_lg")

In [17]:
'''
Instanciate, fit and transfrom

fit: what we learn the vocab with and what we count against

transform: count the words that we see from what we found when we fit
'''

'''INSTANCIATE'''
vect = CountVectorizer(stop_words=nlp.Defaults.stop_words, max_features=1000)

'''FIT'''
#Learn our Vocab
vect.fit(data)

'''TRANSFROM'''
# Get sparse dtm
dtm = vect.transform(data)

dtm = pd.DataFrame(dtm.todense(), columns=vect.get_feature_names())

  'stop_words.' % sorted(inconsistent))


In [33]:
dtm.head()

Unnamed: 0,abilities,ability,able,academic,access,accommodation,accommodations,accuracy,achieve,acquisition,...,xa,xae,xb,xbb,xc,xe,xef,year,years,york
0,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
1,0,1,0,0,0,0,0,1,0,0,...,2,0,0,0,0,8,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0


## 4) Visualize the most common word counts

In [None]:
most_words = dtm['']

In [None]:
# I dont know how to actually visualize this
# dtm.plot(kind="bar");

## 5) Use Scikit-Learn's tfidfVectorizer to get a TF-IDF feature matrix

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Instantiate vectorizer object
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)

# Create a vocabulary and get word counts per document
# Similiar to fit_predict
'''FIT AND TRANSFROM - use on training data'''
dtm = tfidf.fit_transform(data)

# Print word counts

# Get feature names to use as dataframe column headers
dtm = pd.DataFrame(dtm.todense(), columns=tfidf.get_feature_names())

# View Feature Matrix as DataFrame
dtm.head()

Unnamed: 0,aa,ab,abernathy,abilities,ability,able,absence,absolutely,abstract,abundant,...,yes,yeti,york,young,yrs,zenreach,zero,zeus,zf,zillow
0,0.0,0.0,0.0,0.0,0.061108,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.025199,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
def tokenize(document):
    
    doc = nlp(document)
    
    return [token.lemma_.strip() for token in doc if (token.is_stop != True) and (token.is_punct != True)]

In [36]:
# Tunning Parameters

# Instantiate vectorizer object
tfidf = TfidfVectorizer(stop_words='english', 
                        ngram_range=(1,2),
                        max_df=.97, # set the size of the dataset - ingram has to be w/i 0-97 of documents
                        min_df=3,   # sets the size of the dataset - ingram has to be in at least 3 documents
                        max_features = 8000,
                        tokenizer=tokenize)

# Create a vocabulary and get word counts per document
dtm = tfidf.fit_transform(data) # Similiar to fit_predict

# Print word counts

# Get feature names to use as dataframe column headers
dtm = pd.DataFrame(dtm.todense(), columns=tfidf.get_feature_names())

# View Feature Matrix as DataFrame
dtm.head()

Unnamed: 0,aa,aa employer,ab,abilitie,abilitie nproblem,ability,ability analyze,ability build,ability clearly,ability communicate,...,year n,year professional,year related,year relevant,year work,year xe,yes,york,york city,yrs
0,0.0,0.0,0.0,0.0,0.0,0.049019,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.021023,0.0,0.0,0.0,0.050636,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
dtm.shape

(426, 8000)

## 6) Create a NearestNeighbor Model. Write the description of your ideal datascience job and query your job listings. 

In [38]:
from sklearn.neighbors import NearestNeighbors
# metric = 'minkowski' is a distance metric


# Fit on DTM
nn = NearestNeighbors(n_neighbors=5, algorithm='kd_tree')
nn.fit(dtm)

NearestNeighbors(algorithm='kd_tree', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [39]:
# what is close to my first observation
nn.kneighbors([dtm.iloc[0].values])

(array([[0.        , 1.29470268, 1.31105118, 1.31555707, 1.31706638]]),
 array([[  0, 115, 338, 336, 403]]))

In [41]:
# Query Using kneighbors 
nn.kneighbors([dtm.iloc[115]])

(array([[0.        , 1.19725731, 1.19725731, 1.20777773, 1.21394827]]),
 array([[115, 374, 381, 225, 295]]))

In [43]:
data[115][:200]

'b company overview nat proofpoint we have a passion for protecting people data and brands from today xe x x s advanced threats and compliance risks we hire the best people in the business to nbuild an'

In [45]:
datascience_job = [ """Canidate will work from home and be provided snacks. Deep understanding of dataframes and anaylizing them."""]

In [46]:
# Query for Sim of Random doc to BBC
new = tfidf.transform(datascience_job)

In [47]:
nn.kneighbors(new.todense()) # distances are increasing here

(array([[1.28842592, 1.31293595, 1.34008358, 1.34496185, 1.34682814]]),
 array([[200, 244, 384, 173, 110]]))

In [48]:
# Inspect Most relevant result
data[200]

'b about the team nzillow is looking for an extraordinary data scientist to join a growing team zillow is on a mission to give consumers certainty and control when selling their home in particular using our unparalleled data and view into housing markets to help build a world class platform for real estate marketplace optimization in the zillow offers product zillow buys homes directly from interested homeowners thereby sparing them the tremendous stress and effort of selling instead we take that upon ourselves by quickly preparing these homes for the market and sell them to buyers this is a key initiative at zillow as we aim to define the future of real estate this product isn xe x x t viable for everyone yet when that xe x x s the case we want to make sure we still help consumers sell their home nour team is tasked with absorbing dozens sometimes billions of rows of data from various sources organizing them analyzing them and visualizing them to help inform both short and long term d

## Stretch Goals

 - Try different visualizations for words and frequencies - what story do you want to tell with the data?
 - Scrape Job Listings for the job title "Data Analyst". How do these differ from Data Scientist Job Listings
 - Try and identify requirements for experience specific technologies that are asked for in the job listings. How are those distributed among the job listings?
 - Use a clustering algorithm to cluster documents by their most important terms. Do the clusters reveal any common themes?
  - **Hint:** K-means might not be the best algorithm for this. Do a little bit of research to see what might be good for this. Also, remember that algorithms that depend on Euclidean distance break down with high dimensional data.
 - Create a labeled dataset - which jobs will you apply for? Train a model to select the jobs you are most likely to apply for. :) 