# Occupation

### Introduction:

Special thanks to: https://github.com/justmarkham for sharing the dataset and materials.

### Step 1. Import the necessary libraries

In [1]:
import pandas as pd

### Step 2. Import the dataset from this [address](https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user).

### Step 3. Assign it to a variable called users.

In [2]:
users = pd.read_csv("https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user", sep="|")
users.head()

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


### Step 4. Discover what is the mean age per occupation

In [7]:
users.groupby(by=["occupation"])["age"] \
      .mean() \
      .astype(int)

Unnamed: 0_level_0,age
occupation,Unnamed: 1_level_1
administrator,38
artist,31
doctor,43
educator,42
engineer,36
entertainment,29
executive,38
healthcare,41
homemaker,32
lawyer,36


### Step 5. Discover the Male ratio per occupation and sort it from the most to the least

In [10]:
sub = users.groupby(by="occupation") \
           .agg(
               hombres=("gender", lambda x: (x == "M").sum()),
               total=("user_id", "count")
           )
sub

Unnamed: 0_level_0,hombres,total
occupation,Unnamed: 1_level_1,Unnamed: 2_level_1
administrator,43,79
artist,15,28
doctor,7,7
educator,69,95
engineer,65,67
entertainment,16,18
executive,29,32
healthcare,5,16
homemaker,1,7
lawyer,10,12


In [14]:
sub["ratio_hombres"] = round(sub["hombres"] / sub["total"], 4) * 100
sub

Unnamed: 0_level_0,hombres,total,ratio_hombres
occupation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
administrator,43,79,54.43
artist,15,28,53.57
doctor,7,7,100.0
educator,69,95,72.63
engineer,65,67,97.01
entertainment,16,18,88.89
executive,29,32,90.62
healthcare,5,16,31.25
homemaker,1,7,14.29
lawyer,10,12,83.33


### Step 6. For each occupation, calculate the minimum and maximum ages

In [15]:
ages = users.groupby(by="occupation") \
     .agg(
         min_age=("age", "min"),
         max_age=("age", "max")
     ).reset_index()
ages

Unnamed: 0,occupation,min_age,max_age
0,administrator,21,70
1,artist,19,48
2,doctor,28,64
3,educator,23,63
4,engineer,22,70
5,entertainment,15,50
6,executive,22,69
7,healthcare,22,62
8,homemaker,20,50
9,lawyer,21,53


### Step 7. For each combination of occupation and gender, calculate the mean age

In [18]:
users.groupby(by=["occupation", "gender"])["age"].mean().astype(int)

Unnamed: 0_level_0,Unnamed: 1_level_0,age
occupation,gender,Unnamed: 2_level_1
administrator,F,40
administrator,M,37
artist,F,30
artist,M,32
doctor,M,43
educator,F,39
educator,M,43
engineer,F,29
engineer,M,36
entertainment,F,31


### Step 8.  For each occupation present the percentage of women and men

In [24]:
gender_groups = users.groupby(by="occupation") \
     .agg(
         total=("gender", "count"),
         women=("gender", lambda x: (x == "F").sum()),
         men=("gender", lambda x: (x == "M").sum())
     )
gender_groups["women_percentage"] = round(gender_groups["women"] / gender_groups["total"], 4) * 100
gender_groups["men_percentage"] = round(gender_groups["men"] / gender_groups["total"], 4) * 100
gender_groups

Unnamed: 0_level_0,total,women,men,women_percentage,men_percentage
occupation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
administrator,79,36,43,45.57,54.43
artist,28,13,15,46.43,53.57
doctor,7,0,7,0.0,100.0
educator,95,26,69,27.37,72.63
engineer,67,2,65,2.99,97.01
entertainment,18,2,16,11.11,88.89
executive,32,3,29,9.38,90.62
healthcare,16,11,5,68.75,31.25
homemaker,7,6,1,85.71,14.29
lawyer,12,2,10,16.67,83.33
