# FILE 1 of 4

In [1]:
import requests
import json
import pprint
import pandas as pd

# ---Github Jobs API Call---

In [2]:
#endpoont parameters
description = "data science"
location = "usa"
full_time = "true"

#construct endpoint url
url = f"https://jobs.github.com/positions.json?description={description}&full_time={full_time}&location={location}"

#run request and store json response as variable "postings"
postings = requests.get(url).json()

#convert json response into pandas dataframe:
df = pd.DataFrame(postings)
df.head()

Unnamed: 0,id,type,url,created_at,company,company_url,location,title,description,how_to_apply,company_logo
0,e067090a-b8b7-40ff-a06f-8b90e90c75a8,Full Time,https://jobs.github.com/positions/e067090a-b8b...,Thu Oct 22 14:04:51 UTC 2020,Contractor Nation,http://contractornation.com,"Seymour, CT",Software Developer,"<p>Contractor Nation in Seymour, Connecticut i...",<p>Please answer the following questions:</p>\...,
1,d5f0dc69-a363-4fd1-83ea-a28ee40ed811,Full Time,https://jobs.github.com/positions/d5f0dc69-a36...,Tue Oct 20 20:23:24 UTC 2020,West Nyack - THQ,http:,"West Nyack, New York",SQL Server Developer,<p>Overview\nThe Salvation Army exists to meet...,<p>Apply Online</p>\n,
2,18853f8f-4606-46c0-9aaa-f1c9f80eb21e,Full Time,https://jobs.github.com/positions/18853f8f-460...,Tue Oct 20 18:06:33 UTC 2020,The Barnes Foundation,,"Philadelphia, PA",Lead Web Developer,<p>Description: The Lead Web Developer will ar...,"<p><a href=""https://workforcenow.adp.com/mascs...",
3,8a2e3d35-65ba-411d-9952-2e1940b73a4a,Full Time,https://jobs.github.com/positions/8a2e3d35-65b...,Mon Oct 19 19:43:52 UTC 2020,"QSC, LLC",http://www.qsc.com,"Boulder, CO",Senior Software Engineer,<p>QSC is a world leader in the design and man...,<p>Please apply via our careers site: <a href=...,https://jobs.github.com/rails/active_storage/b...
4,f5c836ce-2683-450d-9cfb-5bfc9043bf30,Full Time,https://jobs.github.com/positions/f5c836ce-268...,Mon Oct 19 19:30:14 UTC 2020,"QSC, LLC",http://www.qsc.com,"Costa Mesa, CA",Systems Network Engineer,<p>Who we are:</p>\n<p>QSC thrives where innov...,<p>Please apply via our careers site: <a href=...,https://jobs.github.com/rails/active_storage/b...


# ---Data Cleanup---


## Split Location into City and State Columns

In [3]:
#create dataframe that splits the location string using comma as the delineator
location_df = df['location'].apply(lambda x: pd.Series(x.split(',')))
location_df.head()

Unnamed: 0,0,1,2
0,Seymour,CT,
1,West Nyack,New York,
2,Philadelphia,PA,
3,Boulder,CO,
4,Costa Mesa,CA,


In [4]:
#add columns for city and state to the main dataframe, using the values in the above dataframe
df["split_city"] = location_df[0]
df["split_state"] = location_df[1]
df.head()

Unnamed: 0,id,type,url,created_at,company,company_url,location,title,description,how_to_apply,company_logo,split_city,split_state
0,e067090a-b8b7-40ff-a06f-8b90e90c75a8,Full Time,https://jobs.github.com/positions/e067090a-b8b...,Thu Oct 22 14:04:51 UTC 2020,Contractor Nation,http://contractornation.com,"Seymour, CT",Software Developer,"<p>Contractor Nation in Seymour, Connecticut i...",<p>Please answer the following questions:</p>\...,,Seymour,CT
1,d5f0dc69-a363-4fd1-83ea-a28ee40ed811,Full Time,https://jobs.github.com/positions/d5f0dc69-a36...,Tue Oct 20 20:23:24 UTC 2020,West Nyack - THQ,http:,"West Nyack, New York",SQL Server Developer,<p>Overview\nThe Salvation Army exists to meet...,<p>Apply Online</p>\n,,West Nyack,New York
2,18853f8f-4606-46c0-9aaa-f1c9f80eb21e,Full Time,https://jobs.github.com/positions/18853f8f-460...,Tue Oct 20 18:06:33 UTC 2020,The Barnes Foundation,,"Philadelphia, PA",Lead Web Developer,<p>Description: The Lead Web Developer will ar...,"<p><a href=""https://workforcenow.adp.com/mascs...",,Philadelphia,PA
3,8a2e3d35-65ba-411d-9952-2e1940b73a4a,Full Time,https://jobs.github.com/positions/8a2e3d35-65b...,Mon Oct 19 19:43:52 UTC 2020,"QSC, LLC",http://www.qsc.com,"Boulder, CO",Senior Software Engineer,<p>QSC is a world leader in the design and man...,<p>Please apply via our careers site: <a href=...,https://jobs.github.com/rails/active_storage/b...,Boulder,CO
4,f5c836ce-2683-450d-9cfb-5bfc9043bf30,Full Time,https://jobs.github.com/positions/f5c836ce-268...,Mon Oct 19 19:30:14 UTC 2020,"QSC, LLC",http://www.qsc.com,"Costa Mesa, CA",Systems Network Engineer,<p>Who we are:</p>\n<p>QSC thrives where innov...,<p>Please apply via our careers site: <a href=...,https://jobs.github.com/rails/active_storage/b...,Costa Mesa,CA


## Clean up the creation time to just show the date

In [5]:
#create dataframe that splits the created_at string using space as the delineator
created_df = df['created_at'].apply(lambda x: pd.Series(x.split(' ')))
created_df.head()

Unnamed: 0,0,1,2,3,4,5
0,Thu,Oct,22,14:04:51,UTC,2020
1,Tue,Oct,20,20:23:24,UTC,2020
2,Tue,Oct,20,18:06:33,UTC,2020
3,Mon,Oct,19,19:43:52,UTC,2020
4,Mon,Oct,19,19:30:14,UTC,2020


In [6]:
#add column for creation date to the main dataframe, using the month and day values in the above dataframe
df["creation_date"] = created_df[1] + " " + created_df[2]
df.head()

Unnamed: 0,id,type,url,created_at,company,company_url,location,title,description,how_to_apply,company_logo,split_city,split_state,creation_date
0,e067090a-b8b7-40ff-a06f-8b90e90c75a8,Full Time,https://jobs.github.com/positions/e067090a-b8b...,Thu Oct 22 14:04:51 UTC 2020,Contractor Nation,http://contractornation.com,"Seymour, CT",Software Developer,"<p>Contractor Nation in Seymour, Connecticut i...",<p>Please answer the following questions:</p>\...,,Seymour,CT,Oct 22
1,d5f0dc69-a363-4fd1-83ea-a28ee40ed811,Full Time,https://jobs.github.com/positions/d5f0dc69-a36...,Tue Oct 20 20:23:24 UTC 2020,West Nyack - THQ,http:,"West Nyack, New York",SQL Server Developer,<p>Overview\nThe Salvation Army exists to meet...,<p>Apply Online</p>\n,,West Nyack,New York,Oct 20
2,18853f8f-4606-46c0-9aaa-f1c9f80eb21e,Full Time,https://jobs.github.com/positions/18853f8f-460...,Tue Oct 20 18:06:33 UTC 2020,The Barnes Foundation,,"Philadelphia, PA",Lead Web Developer,<p>Description: The Lead Web Developer will ar...,"<p><a href=""https://workforcenow.adp.com/mascs...",,Philadelphia,PA,Oct 20
3,8a2e3d35-65ba-411d-9952-2e1940b73a4a,Full Time,https://jobs.github.com/positions/8a2e3d35-65b...,Mon Oct 19 19:43:52 UTC 2020,"QSC, LLC",http://www.qsc.com,"Boulder, CO",Senior Software Engineer,<p>QSC is a world leader in the design and man...,<p>Please apply via our careers site: <a href=...,https://jobs.github.com/rails/active_storage/b...,Boulder,CO,Oct 19
4,f5c836ce-2683-450d-9cfb-5bfc9043bf30,Full Time,https://jobs.github.com/positions/f5c836ce-268...,Mon Oct 19 19:30:14 UTC 2020,"QSC, LLC",http://www.qsc.com,"Costa Mesa, CA",Systems Network Engineer,<p>Who we are:</p>\n<p>QSC thrives where innov...,<p>Please apply via our careers site: <a href=...,https://jobs.github.com/rails/active_storage/b...,Costa Mesa,CA,Oct 19


## Remove html tags from job description

In [7]:
#imported code from internet for a function that will strip html tags from a string
#=============================================
from html.parser import HTMLParser
from io import StringIO

class MLStripper(HTMLParser):
    def __init__(self):
        super().__init__()
        self.reset()
        self.strict = False
        self.convert_charrefs= True
        self.text = StringIO()
    def handle_data(self, d):
        self.text.write(d)
    def get_data(self):
        return self.text.getvalue()

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()
#=============================================

In [8]:
#loop through the dataframe rows and run the description through the strip_tags function
#also replace any \n's with blank spaces
for index, row in df.iterrows():
    row["description"] = strip_tags(row["description"])
    row["description"] = row["description"].replace("\n"," ")

## Create list of cities to be used for weather API query

In [9]:
cities_list = df["split_city"].to_list()
cities_list = list(dict.fromkeys(cities_list))
cities_list

['Seymour',
 'West Nyack',
 'Philadelphia',
 'Boulder',
 'Costa Mesa',
 'US San Francisco',
 'Davis',
 'Prairie du Sac',
 'Mankato',
 'Fresno',
 'Fresno',
 'Fort Lauderdale']

## Create clean dataframe of desired data

In [10]:
clean_df = df[["creation_date","title","company","location","split_city","description","url"]]
clean_df.head()

Unnamed: 0,creation_date,title,company,location,split_city,description,url
0,Oct 22,Software Developer,Contractor Nation,"Seymour, CT",Seymour,"Contractor Nation in Seymour, Connecticut is s...",https://jobs.github.com/positions/e067090a-b8b...
1,Oct 20,SQL Server Developer,West Nyack - THQ,"West Nyack, New York",West Nyack,Overview The Salvation Army exists to meet hum...,https://jobs.github.com/positions/d5f0dc69-a36...
2,Oct 20,Lead Web Developer,The Barnes Foundation,"Philadelphia, PA",Philadelphia,Description: The Lead Web Developer will archi...,https://jobs.github.com/positions/18853f8f-460...
3,Oct 19,Senior Software Engineer,"QSC, LLC","Boulder, CO",Boulder,QSC is a world leader in the design and manufa...,https://jobs.github.com/positions/8a2e3d35-65b...
4,Oct 19,Systems Network Engineer,"QSC, LLC","Costa Mesa, CA",Costa Mesa,Who we are: QSC thrives where innovative techn...,https://jobs.github.com/positions/f5c836ce-268...


# ---Data Storing---

## Convert data into mongo friendly dictionary

In [11]:
postings_dict = clean_df.to_dict("records")

In [12]:
postings_dict

[{'creation_date': 'Oct 22',
  'title': 'Software Developer',
  'company': 'Contractor Nation',
  'location': 'Seymour, CT',
  'split_city': 'Seymour',
  'description': 'Contractor Nation in Seymour, Connecticut is seeking a talented in-house software developer to help create revolutionary home improvement apps. As a software developer at Contractor Nation, you will be providing support for our applications, expanding upon existing tools, and creating entirely new software. Candidates will work with a small team on all parts of the development life cycle; including design, implementation, testing, and deployment. Our ideal candidate is motivated, an active learner, and enjoys trying to solve challenging problems. We currently have applications running on PHP, Java, Python, and Node.js, but a deep understanding of any other object-oriented programming language is just as valuable to us. Skills & Requirements BS in Computer Science or equivalent experience Experience with shell and other