In [1]:
import pandas as pd

# Define column names as per UCI Adult dataset description
column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status',
    'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss',
    'hours_per_week', 'native_country', 'income']

# Load the dataset from a URL
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
df = pd.read_csv(url, names=column_names, na_values=' ?', skipinitialspace=True)

print("UCI Adult Dataset:")
print(df.head())

UCI Adult Dataset:
   age         workclass  fnlwgt  education  education_num  \
0   39         State-gov   77516  Bachelors             13   
1   50  Self-emp-not-inc   83311  Bachelors             13   
2   38           Private  215646    HS-grad              9   
3   53           Private  234721       11th              7   
4   28           Private  338409  Bachelors             13   

       marital_status         occupation   relationship   race     sex  \
0       Never-married       Adm-clerical  Not-in-family  White    Male   
1  Married-civ-spouse    Exec-managerial        Husband  White    Male   
2            Divorced  Handlers-cleaners  Not-in-family  White    Male   
3  Married-civ-spouse  Handlers-cleaners        Husband  Black    Male   
4  Married-civ-spouse     Prof-specialty           Wife  Black  Female   

   capital_gain  capital_loss  hours_per_week native_country income  
0          2174             0              40  United-States  <=50K  
1             0        

In [3]:
#### 2 - Using groupby, transform, apply functions, and pivot tables:
#### 2.1 - Simple grouping: Group by 'education' and calculate the mean age and mean hours per week. Use groupby and agg to calculate the mean values.
simple_grouped = df.groupby(['education'], observed=True).agg({
    'age': ['mean'],
    'hours_per_week': ['mean']
})
print(simple_grouped)

                    age hours_per_week
                   mean           mean
education                             
10th          37.429796      37.052519
11th          32.355745      33.925957
12th          32.000000      35.780600
1st-4th       46.142857      38.255952
5th-6th       42.885886      38.897898
7th-8th       48.445820      39.366873
9th           41.060311      38.044747
Assoc-acdm    37.381443      40.504217
Assoc-voc     38.553546      41.610709
Bachelors     38.904949      42.614006
Doctorate     47.702179      46.973366
HS-grad       38.974479      40.575374
Masters       44.049913      43.836332
Preschool     42.764706      36.647059
Prof-school   44.746528      47.425347
Some-college  35.756275      38.852284


In [4]:
### 2.2 - Multiple grouping: Group by 'education' and 'sex', and calculate sum, count, and mean for 'hours_per_week'.
####      Use groupby and agg with multiple aggregation functions.
multiple_grouped = df.groupby(['education','sex'], observed=True).agg({
    'hours_per_week': ['sum','count','mean']
})
print(multiple_grouped)

                    hours_per_week                 
                               sum count       mean
education    sex                                   
10th         Female           9473   295  32.111864
             Male            25097   638  39.336991
11th         Female          12883   432  29.821759
             Male            26980   743  36.312248
12th         Female           4578   144  31.791667
             Male            10915   289  37.768166
1st-4th      Female           1471    46  31.978261
             Male             4956   122  40.622951
5th-6th      Female           3028    84  36.047619
             Male             9925   249  39.859438
7th-8th      Female           5792   160  36.200000
             Male            19639   486  40.409465
9th          Female           4884   144  33.916667
             Male            14671   370  39.651351
Assoc-acdm   Female          15728   421  37.358670
             Male            27490   646  42.554180
Assoc-voc   

In [5]:
### 2.3 - Applying functions: Group by 'marital_status' and apply a custom function to find the range of 'age'.
###       Use groupby and apply with a custom function.
# Custom function to calculate the range
def calc_range(x):
    return x.max() - x.min()

# Group by 'marital_status', then apply the custom function to 'age'
custom_grouped = df.groupby(['marital_status'], observed=True)['age'].apply(calc_range)

print(custom_grouped)

marital_status
Divorced                 72
Married-AF-spouse        56
Married-civ-spouse       73
Married-spouse-absent    62
Never-married            73
Separated                72
Widowed                  72
Name: age, dtype: int64


In [6]:
### 2.4 - Group transforms: Group by 'occupation' and transform to find the mean age within each group.
###       Use groupby and transform to add a new column with transformed data.

### 1.4 - Group Transforms: Group by 'sex' and 'class', then calculate the z-score of 'fare' within each group.
from scipy.stats import zscore

# Calculate the z-score of 'fare' within each group
df['age_mean'] = df.groupby(['occupation'], observed=True)['age'].transform('mean')

print(df.head())

   age         workclass  fnlwgt  education  education_num  \
0   39         State-gov   77516  Bachelors             13   
1   50  Self-emp-not-inc   83311  Bachelors             13   
2   38           Private  215646    HS-grad              9   
3   53           Private  234721       11th              7   
4   28           Private  338409  Bachelors             13   

       marital_status         occupation   relationship   race     sex  \
0       Never-married       Adm-clerical  Not-in-family  White    Male   
1  Married-civ-spouse    Exec-managerial        Husband  White    Male   
2            Divorced  Handlers-cleaners  Not-in-family  White    Male   
3  Married-civ-spouse  Handlers-cleaners        Husband  Black    Male   
4  Married-civ-spouse     Prof-specialty           Wife  Black  Female   

   capital_gain  capital_loss  hours_per_week native_country income   age_mean  
0          2174             0              40  United-States  <=50K  36.964456  
1             0     

In [8]:
#### 2.5 - Creating a pivot table: Create a pivot table to show the mean 'hours_per_week' for each 'occupation' and 'sex'.
####       Use pivot_table to create a pivot table with specified aggregation.
pivot_table = df.pivot_table(values='hours_per_week', index='occupation', columns='sex', aggfunc='mean')

print(pivot_table)

sex                   Female       Male
occupation                             
?                  29.976219  33.525948
Adm-clerical       36.741033  39.240065
Armed-Forces             NaN  40.666667
Craft-repair       39.869369  42.443642
Exec-managerial    41.517688  46.371173
Farming-fishing    37.784615  47.634015
Handlers-cleaners  36.103659  38.198176
Machine-op-inspct  38.929091  41.447658
Other-service      33.437778  36.223411
Priv-house-serv    32.489362  39.875000
Prof-specialty     39.423762  44.096762
Protective-serv    38.526316  43.446771
Sales              34.274743  44.223712
Tech-support       37.295977  40.713793
Transport-moving   36.711111  45.130723
