# Occupation

### Introduction:

Special thanks to: https://github.com/justmarkham for sharing the dataset and materials.

### Step 1. Import the necessary libraries

In [1]:
import pandas as pd

### Step 2. Import the dataset from this [address](https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user). 

### Step 3. Assign it to a variable called users.

In [2]:
# The file is pipe-delimited; the user_id column is used as the row index.
users = pd.read_csv(
    'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user',
    sep='|',
    index_col='user_id',
)

In [6]:
# Preview the first five rows of the table.
users.head(n=5)

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213


### Step 4. Find the mean age per occupation

In [3]:
# List the column labels (user_id was loaded as the index, so it is not among them).
users.columns

Index(['age', 'gender', 'occupation', 'zip_code'], dtype='object')

In [4]:
# Average age within each occupation.
users.groupby('occupation')['age'].mean()

occupation
administrator    38.746835
artist           31.392857
doctor           43.571429
educator         42.010526
engineer         36.388060
entertainment    29.222222
executive        38.718750
healthcare       41.562500
homemaker        32.571429
lawyer           36.750000
librarian        40.000000
marketing        37.615385
none             26.555556
other            34.523810
programmer       33.121212
retired          63.071429
salesman         35.666667
scientist        35.548387
student          22.081633
technician       33.148148
writer           36.311111
Name: age, dtype: float64

### Step 5. Compute the male ratio per occupation and sort it from highest to lowest

In [60]:
# Flag male users as 1 and everyone else as 0, using a vectorised comparison
# instead of a row-wise lambda.
users['male_dummy'] = (users['gender'] == 'M').astype('int64')

# Share of men per occupation = number of men / number of users in that occupation.
# The groupby is built once and reused for both aggregations.
by_occupation = users.groupby('occupation')
male_ratio = by_occupation['male_dummy'].sum() / by_occupation['age'].count()

# Sort from the most to the least male-dominated occupation.
# male_df itself keeps the default column label 0 for the ratio; the rename on the
# last line only affects the copy that is displayed as this cell's output.
male_df = male_ratio.sort_values(ascending=False).to_frame().reset_index()
male_df.rename(columns={0: 'ratio'})




Unnamed: 0,occupation,ratio
0,doctor,1.0
1,engineer,0.970149
2,technician,0.962963
3,retired,0.928571
4,programmer,0.909091
5,executive,0.90625
6,scientist,0.903226
7,entertainment,0.888889
8,lawyer,0.833333
9,salesman,0.75


### Step 6. For each occupation, calculate the minimum and maximum ages

In [61]:
# Youngest user in each occupation.
users.groupby('occupation')['age'].min()

occupation
administrator    21
artist           19
doctor           28
educator         23
engineer         22
entertainment    15
executive        22
healthcare       22
homemaker        20
lawyer           21
librarian        23
marketing        24
none             11
other            13
programmer       20
retired          51
salesman         18
scientist        23
student           7
technician       21
writer           18
Name: age, dtype: int64

In [62]:
# Oldest user in each occupation.
users.groupby('occupation')['age'].max()

occupation
administrator    70
artist           48
doctor           64
educator         63
engineer         70
entertainment    50
executive        69
healthcare       62
homemaker        50
lawyer           53
librarian        69
marketing        55
none             55
other            64
programmer       63
retired          73
salesman         66
scientist        55
student          42
technician       55
writer           60
Name: age, dtype: int64

### Step 7. For each combination of occupation and gender, calculate the mean age

In [64]:
# Average age for every (occupation, gender) pair.
users.groupby(['occupation', 'gender'])['age'].mean()

occupation     gender
administrator  F         40.638889
               M         37.162791
artist         F         30.307692
               M         32.333333
doctor         M         43.571429
educator       F         39.115385
               M         43.101449
engineer       F         29.500000
               M         36.600000
entertainment  F         31.000000
               M         29.000000
executive      F         44.000000
               M         38.172414
healthcare     F         39.818182
               M         45.400000
homemaker      F         34.166667
               M         23.000000
lawyer         F         39.500000
               M         36.200000
librarian      F         40.000000
               M         40.000000
marketing      F         37.200000
               M         37.875000
none           F         36.500000
               M         18.600000
other          F         35.472222
               M         34.028986
programmer     F         32.16666

### Step 8. For each occupation, present the percentage of women and men

In [87]:
# Flag female users as 1 and everyone else as 0, using a vectorised comparison
# instead of a row-wise lambda.
users['female_dummy'] = (users['gender'] == 'F').astype('int64')

# Share of women per occupation = number of women / number of users in that occupation.
# The groupby is built once and reused for both aggregations.
occupation_groups = users.groupby('occupation')
female_ratio = occupation_groups['female_dummy'].sum() / occupation_groups['age'].count()

# Sorted from the highest to the lowest share of women.
# The ratio column still carries the default label 0 at this point.
female_df = female_ratio.sort_values(ascending=False).to_frame().reset_index()


In [88]:
# Give the ratio column (default label 0) a descriptive name.
female_df = female_df.rename({0: 'f_ratio'}, axis='columns')

In [89]:
# Leave the frame as the cell's last expression so the notebook renders it as a
# rich table instead of plain printed text.
female_df

       occupation   f_ratio
0       homemaker  0.857143
1      healthcare  0.687500
2       librarian  0.568627
3          artist  0.464286
4   administrator  0.455696
5            none  0.444444
6          writer  0.422222
7       marketing  0.384615
8           other  0.342857
9         student  0.306122
10       educator  0.273684
11       salesman  0.250000
12         lawyer  0.166667
13  entertainment  0.111111
14      scientist  0.096774
15      executive  0.093750
16     programmer  0.090909
17        retired  0.071429
18     technician  0.037037
19       engineer  0.029851
20         doctor  0.000000


In [100]:
# Binds a second name to the same DataFrame object as male_df (no copy is made).
combined_df = male_df

In [101]:
# Build the combined table straight from male_df so this cell gives the same result
# every time it is run, and name the join key explicitly instead of relying on
# whichever column names the two frames happen to share.
combined_df = (
    male_df
    .rename(columns={0: 'm_ratio'})
    .merge(female_df, on='occupation')
)

In [107]:
# Express the ratios as percentages.
# Each column is recomputed from the per-occupation male_ratio / female_ratio Series
# (looked up by occupation) rather than by multiplying combined_df's own columns,
# so running this cell a second time does not scale the values by 100 again.
combined_df['m_ratio'] = combined_df['occupation'].map(male_ratio) * 100
combined_df['f_ratio'] = combined_df['occupation'].map(female_ratio) * 100

In [108]:
# Leave the frame as the cell's last expression so the notebook renders it as a
# rich table instead of plain printed text.
combined_df

       occupation     m_ratio    f_ratio
0          doctor  100.000000   0.000000
1        engineer   97.014925   2.985075
2      technician   96.296296   3.703704
3         retired   92.857143   7.142857
4      programmer   90.909091   9.090909
5       executive   90.625000   9.375000
6       scientist   90.322581   9.677419
7   entertainment   88.888889  11.111111
8          lawyer   83.333333  16.666667
9        salesman   75.000000  25.000000
10       educator   72.631579  27.368421
11        student   69.387755  30.612245
12          other   65.714286  34.285714
13      marketing   61.538462  38.461538
14         writer   57.777778  42.222222
15           none   55.555556  44.444444
16  administrator   54.430380  45.569620
17         artist   53.571429  46.428571
18      librarian   43.137255  56.862745
19     healthcare   31.250000  68.750000
20      homemaker   14.285714  85.714286
