# Occupation

### Introduction:

Special thanks to: https://github.com/justmarkham for sharing the dataset and materials.

### Step 1. Import the necessary libraries

In [39]:
import pandas as pd
# Cap rendered tables/series at 15 rows so long outputs are truncated with "...".
pd.options.display.max_rows = 15

### Step 2. Import the dataset from this [address](https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user). 

### Step 3. Assign it to a variable called users.

In [5]:
# The file is pipe-delimited; the user_id column is used as the row index.
url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user'
users = pd.read_csv(
    url,
    sep='|',
    index_col='user_id',
)

In [6]:
# Peek at the first 20 users to check the columns parsed as expected.
users.head(n=20)

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213
6,42,M,executive,98101
7,57,M,administrator,91344
8,36,M,administrator,5201
9,29,M,student,1002
10,53,M,lawyer,90703


### Step 4. Discover the mean age per occupation

In [7]:
# Average age within each occupation, rounded to the nearest whole year.
(
    users.groupby('occupation')['age']
         .mean()
         .round()
         .astype('int64')
)

occupation
administrator    39
artist           31
doctor           44
educator         42
engineer         36
entertainment    29
executive        39
healthcare       42
homemaker        33
lawyer           37
librarian        40
marketing        38
none             27
other            35
programmer       33
retired          63
salesman         36
scientist        36
student          22
technician       33
writer           36
Name: age, dtype: int64

### Step 5. Discover the male ratio per occupation and sort it from highest to lowest

In [54]:
# Solution 1 -- much faster: 4.96 ms ± 221 µs
# Count users per (occupation, gender). 'age' is the first column reported by
# .count() on this frame, so it is selected by name rather than by position.
gen_occ = (
    users.groupby(['occupation', 'gender'])['age']
         .count()
         .unstack()
         .fillna(0)  # an occupation with no users of one gender gets a count of 0
)
# Percentage of men in each occupation, highest first.
gen_occ['M'].mul(100).div(gen_occ.sum(axis=1)).round(1).sort_values(ascending=False)

occupation
doctor           100.0
engineer          97.0
technician        96.3
retired           92.9
programmer        90.9
executive         90.6
scientist         90.3
                 ...  
writer            57.8
none              55.6
administrator     54.4
artist            53.6
librarian         43.1
healthcare        31.2
homemaker         14.3
Length: 21, dtype: float64

In [56]:
# Solution 2: 12.8 ms ± 253 µs
# Same occupation x gender count table, built with a pivot table instead of
# groupby/unstack.
gen_occ = users.pivot_table(
    values='age',
    index='occupation',
    columns='gender',
    aggfunc='count',
).fillna(0)  # missing occupation/gender combinations count as 0
# Percentage of men in each occupation, highest first.
(gen_occ['M'] * 100 / gen_occ.sum(axis=1)).round(1).sort_values(ascending=False)

occupation
doctor           100.0
engineer          97.0
technician        96.3
retired           92.9
programmer        90.9
executive         90.6
scientist         90.3
                 ...  
writer            57.8
none              55.6
administrator     54.4
artist            53.6
librarian         43.1
healthcare        31.2
homemaker         14.3
Length: 21, dtype: float64

In [40]:
# Solution 3
# One-hot encode gender into 0/1 indicator columns (gender_F, gender_M) and add
# them to `users` right after the 'gender' column.
# - A descriptive name is used instead of `_`, which IPython reserves for the
#   last cell output.
# - dtype=int keeps the indicators as 0/1 regardless of the pandas default.
gender_dummies = pd.get_dummies(users['gender'], prefix='gender', dtype=int)
for dummy_col in gender_dummies.columns:
    # DataFrame.insert raises ValueError if the column already exists, so skip
    # columns added by a previous run; this keeps the cell safe to re-execute.
    if dummy_col not in users.columns:
        users.insert(2, dummy_col, gender_dummies[dummy_col].values)
users

Unnamed: 0_level_0,age,gender,gender_M,gender_F,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,24,M,1,0,technician,85711
2,53,F,0,1,other,94043
3,23,M,1,0,writer,32067
4,24,M,1,0,technician,43537
5,33,F,0,1,other,15213
6,42,M,1,0,executive,98101
7,57,M,1,0,administrator,91344
...,...,...,...,...,...,...
937,48,M,1,0,educator,98072
938,38,F,0,1,technician,55038


In [55]:
# Male percentage per occupation from the indicator column: the sum of gender_M
# is the number of men, the count of gender is the number of users.
# Requires the gender_M column added to `users` by the Solution 3 cell.
(
    users.groupby('occupation')
         .agg({'gender_M': 'sum', 'gender': 'count'})
         .pipe(lambda counts: (counts['gender_M'] * 100 / counts['gender']).round(1))
         .sort_values(ascending=False)
)


occupation
doctor           100.0
engineer          97.0
technician        96.3
retired           92.9
programmer        90.9
executive         90.6
scientist         90.3
                 ...  
writer            57.8
none              55.6
administrator     54.4
artist            53.6
librarian         43.1
healthcare        31.2
homemaker         14.3
Length: 21, dtype: float64

### Step 6. For each occupation, calculate the minimum and maximum ages

In [35]:
# Youngest and oldest user in each occupation.
# The aggregations are given by name ('min'/'max'): passing the Python builtins
# min/max to .agg is deprecated in recent pandas and emits a FutureWarning.
users.groupby('occupation').agg({'age': ['min', 'max']})

Unnamed: 0_level_0,age,age
Unnamed: 0_level_1,min,max
occupation,Unnamed: 1_level_2,Unnamed: 2_level_2
administrator,21,70
artist,19,48
doctor,28,64
educator,23,63
engineer,22,70
entertainment,15,50
executive,22,69
healthcare,22,62
homemaker,20,50
lawyer,21,53


### Step 7. For each combination of occupation and gender, calculate the mean age

In [84]:
# Mean age for every occupation/gender pair, with gender spread across columns.
# NOTE: pairs with no users come out of unstack() as NaN and are replaced by 0
# here, so a 0.0 in the table means "no such users", not an average age of zero.
occ_gen_av_age = (
    users.groupby(['occupation', 'gender'])[['age']]
         .mean()
         .unstack('gender')
         .fillna(0)
)
occ_gen_av_age

Unnamed: 0_level_0,age,age
gender,F,M
occupation,Unnamed: 1_level_2,Unnamed: 2_level_2
administrator,40.638889,37.162791
artist,30.307692,32.333333
doctor,0.0,43.571429
educator,39.115385,43.101449
engineer,29.5,36.6
entertainment,31.0,29.0
executive,44.0,38.172414
healthcare,39.818182,45.4
homemaker,34.166667,23.0
lawyer,39.5,36.2


### Step 8. For each occupation, present the percentage of women and men

In [145]:
# Turn the occupation x gender counts into row-wise percentages (each row sums
# to ~100). Uses the `gen_occ` count table built in Step 5.
gen_occ.mul(100).div(gen_occ.sum(axis=1), axis=0).round(1)

gender,F,M
occupation,Unnamed: 1_level_1,Unnamed: 2_level_1
administrator,45.6,54.4
artist,46.4,53.6
doctor,0.0,100.0
educator,27.4,72.6
engineer,3.0,97.0
entertainment,11.1,88.9
executive,9.4,90.6
healthcare,68.8,31.2
homemaker,85.7,14.3
lawyer,16.7,83.3
