# Occupation

### Introduction:

Special thanks to: https://github.com/justmarkham for sharing the dataset and materials.

### Step 1. Import the necessary libraries

In [1]:
import pandas as pd

### Step 2. Import the dataset from this [address](https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user). 

### Step 3. Assign it to a variable called users.

In [2]:
url = "https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user"
users = pd.read_csv(url, sep="|", index_col="user_id")
users.head()

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213


### Step 4. Discover what is the mean age per occupation

In [3]:
users.groupby("occupation")["age"].mean()
# OR
pd.pivot_table(users, index="occupation", values="age") # default aggregator is mean()

Unnamed: 0_level_0,age
occupation,Unnamed: 1_level_1
administrator,38.746835
artist,31.392857
doctor,43.571429
educator,42.010526
engineer,36.38806
entertainment,29.222222
executive,38.71875
healthcare,41.5625
homemaker,32.571429
lawyer,36.75


### Step 5. Discover the Male ratio per occupation and sort it from the most to the least

In [4]:
def ratio(row):
    unique_occupation = users["occupation"].value_counts()
    return round((row/unique_occupation) * 100, 2)

users[users["gender"] == "M"].groupby("occupation").count().apply(ratio)["gender"]
# OR
pd.pivot_table(users[users["gender"] == "M"], 
               index="occupation", 
               values="gender", 
               aggfunc="count").apply(ratio).sort_values(by="gender", ascending=False).rename(columns={"gender":"Male Ratio"})

Unnamed: 0,Male Ratio
doctor,100.0
engineer,97.01
technician,96.3
retired,92.86
programmer,90.91
executive,90.62
scientist,90.32
entertainment,88.89
lawyer,83.33
salesman,75.0


### Step 6. For each occupation, calculate the minimum and maximum ages

In [17]:
users.groupby("occupation")["age"].agg(["min", "max"])
# OR
pd.pivot_table(users, index="occupation", values="age", aggfunc=["min", "max"])

Unnamed: 0_level_0,min,max
Unnamed: 0_level_1,age,age
occupation,Unnamed: 1_level_2,Unnamed: 2_level_2
administrator,21,70
artist,19,48
doctor,28,64
educator,23,63
engineer,22,70
entertainment,15,50
executive,22,69
healthcare,22,62
homemaker,20,50
lawyer,21,53


### Step 7. For each combination of occupation and gender, calculate the mean age

In [7]:
users.groupby(['occupation', 'gender']).age.mean()
# OR
pd.pivot_table(users, 
               index="occupation", 
               columns="gender", 
               values="age", 
               aggfunc="mean").round(0)

gender,F,M
occupation,Unnamed: 1_level_1,Unnamed: 2_level_1
administrator,41.0,37.0
artist,30.0,32.0
doctor,,44.0
educator,39.0,43.0
engineer,30.0,37.0
entertainment,31.0,29.0
executive,44.0,38.0
healthcare,40.0,45.0
homemaker,34.0,23.0
lawyer,40.0,36.0


### Step 8.  For each occupation present the percentage of women and men

In [75]:
gender_count = users.groupby(["occupation", "gender"]).agg({"gender":"count"})
occupation_count = users.groupby("occupation").count()
occup_gender = gender_count.div(occupation_count, level="occupation") * 100
occup_gender.loc[:, "gender"]
# OR 
pd.pivot_table(users, 
               index="occupation", 
               values="age", 
               columns="gender", 
               aggfunc="count").apply(ratio)

gender,F,M
administrator,45.57,54.43
artist,46.43,53.57
doctor,,100.0
educator,27.37,72.63
engineer,2.99,97.01
entertainment,11.11,88.89
executive,9.38,90.62
healthcare,68.75,31.25
homemaker,85.71,14.29
lawyer,16.67,83.33
