# Data Wrangling Template

## Gather

In [1]:
import zipfile
import pandas as pd

In [2]:
with zipfile.ZipFile('armenian-online-job-postings.zip', 'r') as myzip:
    myzip.extractall()

In [3]:
#read csv
df = pd.read_csv('online-job-postings.csv')

## Assess

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19001 entries, 0 to 19000
Data columns (total 24 columns):
jobpost             19001 non-null object
date                19001 non-null object
Title               18973 non-null object
Company             18994 non-null object
AnnouncementCode    1208 non-null object
Term                7676 non-null object
Eligibility         4930 non-null object
Audience            640 non-null object
StartDate           9675 non-null object
Duration            10798 non-null object
Location            18969 non-null object
JobDescription      15109 non-null object
JobRequirment       16479 non-null object
RequiredQual        18517 non-null object
Salary              9622 non-null object
ApplicationP        18941 non-null object
OpeningDate         18295 non-null object
Deadline            18936 non-null object
Notes               2211 non-null object
AboutC              12470 non-null object
Attach              1559 non-null object
Year              

- Nondescriptive column headers (ApplicationP, AboutC, RequiredQual)

In [5]:
# Display the first five rows of the DataFrame using .head
df.head()

Unnamed: 0,jobpost,date,Title,Company,AnnouncementCode,Term,Eligibility,Audience,StartDate,Duration,...,Salary,ApplicationP,OpeningDate,Deadline,Notes,AboutC,Attach,Year,Month,IT
0,AMERIA Investment Consulting Company\r\nJOB TI...,"Jan 5, 2004",Chief Financial Officer,AMERIA Investment Consulting Company,,,,,,,...,,"To apply for this position, please submit a\r\...",,26 January 2004,,,,2004,1,False
1,International Research & Exchanges Board (IREX...,"Jan 7, 2004",Full-time Community Connections Intern (paid i...,International Research & Exchanges Board (IREX),,,,,,3 months,...,,Please submit a cover letter and resume to:\r\...,,12 January 2004,,The International Research & Exchanges Board (...,,2004,1,False
2,Caucasus Environmental NGO Network (CENN)\r\nJ...,"Jan 7, 2004",Country Coordinator,Caucasus Environmental NGO Network (CENN),,,,,,Renewable annual contract\r\nPOSITION,...,,Please send resume or CV toursula.kazarian@......,,20 January 2004\r\nSTART DATE: February 2004,,The Caucasus Environmental NGO Network is a\r\...,,2004,1,False
3,Manoff Group\r\nJOB TITLE: BCC Specialist\r\n...,"Jan 7, 2004",BCC Specialist,Manoff Group,,,,,,,...,,Please send cover letter and resume to Amy\r\n...,,23 January 2004\r\nSTART DATE: Immediate,,,,2004,1,False
4,Yerevan Brandy Company\r\nJOB TITLE: Software...,"Jan 10, 2004",Software Developer,Yerevan Brandy Company,,,,,,,...,,Successful candidates should submit\r\n- CV; \...,,"20 January 2004, 18:00",,,,2004,1,True


- Missing values(NaN)
- Making one way to express As soon as possible
- StartDate inconsistencies

In [6]:
# Display the last five rows of the DataFrame using .tail
df.tail()

Unnamed: 0,jobpost,date,Title,Company,AnnouncementCode,Term,Eligibility,Audience,StartDate,Duration,...,Salary,ApplicationP,OpeningDate,Deadline,Notes,AboutC,Attach,Year,Month,IT
18996,Technolinguistics NGO\r\n\r\n\r\nTITLE: Senio...,"Dec 28, 2015",Senior Creative UX/ UI Designer,Technolinguistics NGO,,Full-time,,,,Long-term,...,Competitive,"To apply for this position, please send your\r...",29 December 2015,28 January 2016,,As a company Technolinguistics has a mandate t...,,2015,12,False
18997,"""Coca-Cola Hellenic Bottling Company Armenia"" ...","Dec 30, 2015",Category Development Manager,"""Coca-Cola Hellenic Bottling Company Armenia"" ...",,Full-time,All interested professionals.,,ASAP,Long-term with a probation period of 3 months.,...,,All interested candidates are kindly requested...,30 December 2015,20 January 2016,,,,2015,12,False
18998,"""Coca-Cola Hellenic Bottling Company Armenia"" ...","Dec 30, 2015",Operational Marketing Manager,"""Coca-Cola Hellenic Bottling Company Armenia"" ...",,Full-time,All interested professionals.,,ASAP,Long-term with a probation period of 3 months.,...,,All interested candidates are kindly requested...,30 December 2015,20 January 2016,,,,2015,12,False
18999,San Lazzaro LLC\r\n\r\n\r\nTITLE: Head of O...,"Dec 30, 2015",Head of Online Sales Department,San Lazzaro LLC,,,,,,Long-term,...,Highly competitive,Interested candidates can send their CVs to:\r...,30 December 2015,29 January 2016,,San Lazzaro LLC works with several internation...,,2015,12,False
19000,"""Kamurj"" UCO CJSC\r\n\r\n\r\nTITLE: Lawyer in...","Dec 30, 2015",Lawyer in Legal Department,"""Kamurj"" UCO CJSC",,Full-time,,,,Indefinite,...,,All qualified applicants are encouraged to\r\n...,30 December 2015,20 January 2016,,"""Kamurj"" UCO CJSC is providing micro and small...",,2015,12,False


In [16]:
# Display the entry counts for the Year column using .value_counts
df['Year'].value_counts()

2012    2149
2015    2009
2013    2009
2014    1983
2008    1785
2011    1697
2007    1538
2010    1511
2009    1191
2005    1138
2006    1116
2004     875
Name: Year, dtype: int64

## Clean

#### Define
- Select all records in the StartDate column that have "As soon as possible", "Immediately", etc. and replace the text in those cells with "ASAP"
- nondescriptive column headers

#### Code

In [23]:
df.StartDate.value_counts()

ASAP                                                                                                                                            4754
Immediately                                                                                                                                      773
As soon as possible                                                                                                                              543
Upon hiring                                                                                                                                      261
Immediate                                                                                                                                        259
Immediate employment                                                                                                                             140
As soon as possible.                                                                                      

In [19]:
df.query('StartDate == "As soon as possible", "Immediately"')

Unnamed: 0,jobpost,date,Title,Company,AnnouncementCode,Term,Eligibility,Audience,StartDate,Duration,...,Salary,ApplicationP,OpeningDate,Deadline,Notes,AboutC,Attach,Year,Month,IT
242,"""Armenian-American Food Processing Company"" JV...","May 3, 2004",Cheif Accountant,"""Armenian-American Food Processing Company"" JV...",,,,,ASAP,Continuos,...,Competitive offer according to the skills of a...,Please contact us by sending your CVs on our\r...,,31 May 2004.,,We are 2 years old armenian-american joint ven...,,2004,5,False
317,Valetta Ltd\r\nTITLE: Fastfood and Pub Manage...,Jun 4 11:37 PM,Fastfood and Pub Manager,Valetta Ltd,,,,,ASAP,,...,,"To be considered, please e-mail your CV to:VKE...",,11 June 2004,,The company is engaged in wholesale trade of\r...,,2004,6,False
332,Cosmoplast\r\nTITLE: Senior Industrial Automa...,Jun 13 2:02 AM,Senior Industrial Automation Engineer,Cosmoplast,,,,,ASAP,,...,,Please send your CV (resume) tomanouk@...\r\nP...,,15 July 2004,,The company is part of a group of leading manu...,,2004,6,False
416,Valletta Ltd\r\nTITLE: Computer graphic desig...,"Jul 12, 2004",Computer graphic designer,Valletta Ltd,,,,,ASAP,,...,,"To be considered, please e-mail your CV to:web...",,20 July 2004,,The company is engaged in wholesale trade of\r...,,2004,7,True
431,"""Armenian-American Food Processing Company"" JV...","Jul 16, 2004",Advertising Department Manager,"""Armenian-American Food Processing Company"" JV...",,,,,ASAP,Continuous,...,Competitive offer according to the skills of a...,Please e-mail your CVs to: aafpc2002@...\r\nPl...,,20 July 2004,,We are 2 years old armenian-american company e...,,2004,7,False
432,"""Armenian-American Food Processing Company"" JV...","Jul 16, 2004",Chief Accountant,"""Armenian-American Food Processing Company"" JV...",,,,,ASAP,Continuous,...,Competitive offer according to the skills of a...,Please contact us by sending your CVs on our\r...,,30 July 2004,,,,2004,7,False
446,Rasco-Armenia cjsc\r\nTITLE: Executive Direct...,"Jul 21, 2004",Executive Director,Rasco-Armenia cjsc,,,All qualified candidates.,,ASAP,,...,,"Applicants are asked to submit cover letters,\...",,01 August 2004,,,,2004,7,False
447,Rasco-Armenia cjsc\r\nTITLE: Chief Accountant...,"Jul 21, 2004",Chief Accountant,Rasco-Armenia cjsc,,,All qualified candidates.,,ASAP,,...,Competitive remuneration depending on experien...,Applicants are asked to submit cover letters\r...,,01 August 2004,,,,2004,7,False
471,Lycos Europe\r\nTITLE: Java/C++ Developer for...,"Jul 29, 2004",Java/C++ Developer for Lycos Communities,Lycos Europe,,,,,ASAP,Permanent,...,Attractive,Please send your CV to info@....\r\nYou can al...,29 July 2004,31 August 2004,,Lycos Europe is one of the leading European In...,,2004,7,True
472,Lycos Europe\r\nTITLE: JSP/Java Developer for...,"Jul 29, 2004",JSP/Java Developer for Lycos Chat,Lycos Europe,,,,,ASAP,Permanent,...,Attractive,Please send your CV to info@....\r\nYou can al...,29 July 2004,31 August 2004,,Lycos Europe is one of the leading European In...,,2004,7,True


In [22]:
df_clean = df.copy()
df_clean = df_clean.rename(columns = {'ApplicationP' : 'ApplicationProcedure',
                                      'AboutC': 'AboutCompany',
                                      'RequiredQual':'RequiredQualifications',
                                      'JobRequirment':'JobRequirement'})

In [24]:
asap_list = ['Immediately', 'As soon as possible', 'Upon hiring',
             'Immediate', 'Immediate employment', 'As soon as possible.', 'Immediate job opportunity',
             '"Immediate employment, after passing the interview."',
             'ASAP preferred', 'Employment contract signature date',
             'Immediate employment opportunity', 'Immidiately', 'ASA',
             'Asap', '"The position is open immediately but has a flexible start date depending on the candidates earliest availability."',
             'Immediately upon agreement', '20 November 2014 or ASAP',
             'immediately', 'Immediatelly',
             '"Immediately upon selection or no later than November 15, 2009."',
             'Immediate job opening', 'Immediate hiring', 'Upon selection',
             'As soon as practical', 'Immadiate', 'As soon as posible',
             'Immediately with 2 months probation period',
             '12 November 2012 or ASAP', 'Immediate employment after passing the interview',
             'Immediately/ upon agreement', '01 September 2014 or ASAP',
             'Immediately or as per agreement', 'as soon as possible',
             'As soon as Possible', 'in the nearest future', 'immediate',
             '01 April 2014 or ASAP', 'Immidiatly', 'Urgent',
             'Immediate or earliest possible', 'Immediate hire',
             'Earliest  possible', 'ASAP with 3 months probation period.',
             'Immediate employment opportunity.', 'Immediate employment.',
             'Immidietly', 'Imminent', 'September 2014 or ASAP', 'Imediately']


In [27]:

for phrase in asap_list:
    df_clean.replace(to_replace = phrase, value = 'ASAP', inplace = True)
    

In [28]:
df_clean.StartDate.value_counts()

ASAP                                                                                                                                            6856
01 September 2012                                                                                                                                 31
March 2006                                                                                                                                        27
November 2006                                                                                                                                     22
January 2010                                                                                                                                      19
February 2014                                                                                                                                     17
01 February 2005                                                                                          

#### Test

In [29]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19001 entries, 0 to 19000
Data columns (total 24 columns):
jobpost                   19001 non-null object
date                      19001 non-null object
Title                     18973 non-null object
Company                   18994 non-null object
AnnouncementCode          1208 non-null object
Term                      7676 non-null object
Eligibility               4930 non-null object
Audience                  640 non-null object
StartDate                 9675 non-null object
Duration                  10798 non-null object
Location                  18969 non-null object
JobDescription            15109 non-null object
JobRequirement            16479 non-null object
RequiredQualifications    18517 non-null object
Salary                    9622 non-null object
ApplicationProcedure      18941 non-null object
OpeningDate               18295 non-null object
Deadline                  18936 non-null object
Notes                     2211 non

In [30]:
for phrase in asap_list:
    assert phrase not in df_clean.StartDate.values

## Reassess