In [1]:
# access passwords
import os
from dotenv import load_dotenv
# data processing
import pandas as pd
import numpy as np
# accessing the db
import pymongo

In [2]:
load_dotenv()
MONGO_IP = os.getenv('MONGO_IP')
MONGO_PORT = int(os.getenv('MONGO_PORT'))
MONGO_DATABASE = os.getenv('MONGO_DATABASE')
MONGO_COLLECTION = 'stackOverflow'

In [3]:
# taken from
#https://stackoverflow.com/questions/16249736/how-to-import-data-from-mongodb-to-pandas
def _connect_mongo(host, port, db=None, username=None, password=None):
    """ A util for making a connection to mongo """

    if username and password:
        mongo_uri = 'mongodb://%s:%s@%s:%s/%s' % (username, password, host, port, db)
        conn = pymongo.MongoClient(mongo_uri)
    else:
        conn = pymongo.MongoClient(host, port)

    return conn


def read_mongo(db, collection, query={}, host='localhost', port=27017, username=None, password=None, no_id=False):
    """ Read from Mongo and Store into DataFrame """

    # Connect to MongoDB
    conn = _connect_mongo(host=host, port=port, username=username, password=password, db=db)
    db = conn[db]
    # Make a query to the specific DB and Collection
    cursor = db[collection].find(query)

    # Expand the cursor and construct the DataFrame
    df =  pd.DataFrame(iter(cursor))

    # Delete the _id
    if no_id and '_id' in df:
        del df['_id']

    return df

def update_mongo(db, collection,df, host='localhost', port=27017, username=None, password=None, no_id=False):
    """ Read from Mongo and Store into DataFrame """

    # Connect to MongoDB
    conn = _connect_mongo(host=host, port=port, username=username, password=password, db=db)
    db = conn[db]
    # Make a query to the specific DB and Collection
    updates = []

    for _, row in df.iterrows():
        updates.append(pymongo.UpdateOne({'_id': row.get('_id')}, {'$set': {'cleanedDescription': row.get('description')}}, upsert=True))

    db[collection].bulk_write(updates)

In [4]:
query = {"site": "StackOverflow"}

In [5]:
df = read_mongo(MONGO_DATABASE,MONGO_COLLECTION,query)

In [6]:
df.head()

Unnamed: 0,_id,positionName,site,remote,postedWhen,company,url,location,description,applied,time,cleanedDescription
0,60272fa3db6b8ba99ea7e747,"Software Engineer, Backend (Remote)",StackOverflow,True,Posted 3 days ago,"Stafford, TX",https://stackoverflow.com/jobs/500477/software...,"Stafford, TX.",[['\r\nHims and Hers is seeking a Software Eng...,False,"02/12/2021, 17",[['\r\nHims and Hers is seeking a Software Eng...
1,60272fa5db6b8ba99ea7e748,"Software Engineer, Full Stack",StackOverflow,True,Posted 4 days ago,"Vancouver, BC, Canada",https://stackoverflow.com/jobs/500205/software...,"On-site by default, exceptions based on indivi...","[[""\r\nOur customers rely on Alloy for tasks r...",False,"02/12/2021, 17","[[""\r\nOur customers rely on Alloy for tasks r..."
2,60272fa6db6b8ba99ea7e749,Full-Stack Software Engineer,StackOverflow,True,Posted 14 days ago,"Hays, KS",https://stackoverflow.com/jobs/471179/full-sta...,(GMT-06:00) Central Time +/- 2 hours,[['\r\nThis position will contribute to a prod...,False,"02/12/2021, 17",[['\r\nThis position will contribute to a prod...
3,60272fa6db6b8ba99ea7e74a,Python Software Engineer,StackOverflow,True,Posted < 1 hour ago,"New York, NY",https://stackoverflow.com/jobs/502021/python-s...,"New York, NY.",[['\r\nWe are looking for a humble positive so...,False,"02/12/2021, 17",[['\r\nWe are looking for a humble positive so...
4,60272fa6db6b8ba99ea7e74b,React Native Engineer at sustainable mobility ...,StackOverflow,False,Posted 3 days ago,"München, Deutschland",https://stackoverflow.com/jobs/485085/react-na...,,"[[""\r\nJoin finn.auto to make car ownership fu...",False,"02/12/2021, 17","[[""\r\nJoin finn.auto to make car ownership fu..."


In [7]:
df['description'][0]

["['\\r\\nHims and Hers is seeking a Software Engineer to help build a fast, reliable, and frictionless telemedicine experience.\\nResponsibilities:\\xa0\\n\\nBuild backend services that power our telehealth platform.\\nBrainstorm features with product managers and designers and guide decisions based on your knowledge of the codebase.\\nReview code and have your code reviewed.\\nMentor and be mentored by other engineers.\\nActively participate in retrospectives and help drive continuous improvement in our process and culture\\n            ']",
 "['\\r\\nYou may be a good fit if you...\\xa0\\n\\nHave 2+ years as a software engineer, shipping production code.\\nAre proficient at writing distributed services using Java, Kotlin, and/or PostgreSQL.\\n\\nWe are focused on building a diverse and inclusive workforce. If you’re excited about this role, but do not meet 100% of the qualifications listed above, we encourage you to apply.\\nHims is an Equal Opportunity Employer and considers applic

In [8]:
from collections import Counter
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re

In [9]:
def htmlGarbageDeleter(text):
    """ regex to delete pesky \r \n"""
    regex =r'(\\r).*(\\n)|(\\xa0)' 
    words = text.lower().split()
    cleanText = [re.sub(regex, '', w) for w in words]
    return ' '.join(cleanText)

In [10]:
def nltkPreprocess(text):
    # yeah it's not very good because the [] gets captured but since they are later captured by string.punctuation there's no harm
    text = htmlGarbageDeleter(text)
    # lower
    words = text.lower().split()
    # remove punctiuation
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in words]
    #remove stopwords
    stop_words = set(stopwords.words('english'))
    clean = [w for w in stripped if not w in stop_words if w!='' and w.isnumeric()==False and len(w) >2] #len >2 to filter garbage
    # we concatenate all list elements
    return ' '.join(clean)

In [11]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [12]:
df['description'] = df['description'].apply(lambda x:nltkPreprocess(str(x)))

In [13]:
df['description'][2]

'position contribute product wide growing range features we’re looking someone passionate learning energized prospect working earlystage startupour codebase primarily javascript reactredux frontend nodejs api small team agile flexible we’re interested people contribute multiple platforms process tremendous amount financial data million records average workday utilize variety backend technologies accomplish thisat unhedged expect tocontribute clean scalable spa using react modern build toolchaindesign deliver fullfeatured mobile application using react nativedevelop maintain backend processes using node api well various supporting databases services interfacessolve challenging problems involving huge quantities data tremendous throughputgain exposure wide range frontend backend technologies supporting demanding particular customer base2 years professional javascript experience required knowledge react native redux kubernetes testing frameworks cypress plus'

In [17]:
update_mongo(MONGO_DATABASE,MONGO_COLLECTION,df)

In [15]:
def test(text):
    """haven't found a better name for the function yet"""
    tokens = [t for t in text.split()]
    freq = nltk.FreqDist(tokens)
    # for key,val in freq.items():
    #     print(str(key) + ':' + str(val))
    freq.plot(20, cumulative=False)