In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [14]:
# Load the wide-format fruit table; the first CSV column becomes the row index.
state_fruit = pd.read_csv(
    "data/state_fruit.csv",
    index_col=0,
)
state_fruit

Unnamed: 0,Apple,Orange,Banana
Texas,12,10,40
Arizona,9,7,12
Florida,0,14,190


In [15]:
# Stack the columns into an inner index level, then flatten both index levels
# back into ordinary columns.
# NOTE: this cell rebinds `state_fruit` to its own result, so running it a
# second time without re-reading the CSV stacks the already-tidy frame.
state_fruit = (
    state_fruit
    .stack()
    .reset_index()
)
state_fruit.columns = ['State', 'Fruit', 'Weight']
state_fruit

Unnamed: 0,State,Fruit,Weight
0,Texas,Apple,12
1,Texas,Orange,10
2,Texas,Banana,40
3,Arizona,Apple,9
4,Arizona,Orange,7
5,Arizona,Banana,12
6,Florida,Apple,0
7,Florida,Orange,14
8,Florida,Banana,190


In [24]:
# Re-read the wide table, then tidy it in a single chain: name the two index
# levels produced by stack() and let reset_index() name the value column.
state_fruit = pd.read_csv("data/state_fruit.csv", index_col=0)
(
    state_fruit
    .stack()
    .rename_axis(['State', 'Fruit'])
    .reset_index(name='Weight')
)

Unnamed: 0,State,Fruit,Weight
0,Texas,Apple,12
1,Texas,Orange,10
2,Texas,Banana,40
3,Arizona,Apple,9
4,Arizona,Orange,7
5,Arizona,Banana,12
6,Florida,Apple,0
7,Florida,Orange,14
8,Florida,Banana,190


# Tidying variable values as column names with melt

In [25]:
# Same data as state_fruit, but here the state names are a regular column
# instead of the index.
state_fruit2 = pd.read_csv(
    "data/state_fruit2.csv",
)
state_fruit2

Unnamed: 0,State,Apple,Orange,Banana
0,Texas,12,10,40
1,Arizona,9,7,12
2,Florida,0,14,190


In [26]:
# Melt the three fruit columns into (Fruit, Weight) pairs, keeping State as
# the identifier column.
state_fruit2.melt(
    id_vars='State',
    value_vars=['Apple', 'Orange', 'Banana'],
    var_name='Fruit',
    value_name='Weight',
)

Unnamed: 0,State,Fruit,Weight
0,Texas,Apple,12
1,Arizona,Apple,9
2,Florida,Apple,0
3,Texas,Orange,10
4,Arizona,Orange,7
5,Florida,Orange,14
6,Texas,Banana,40
7,Arizona,Banana,12
8,Florida,Banana,190


In [10]:
# Same reshaping through the top-level pd.melt function, with id_vars given
# as a list.
pd.melt(
    state_fruit2,
    id_vars=['State'],
    value_vars=['Apple', 'Orange', 'Banana'],
    var_name='Fruit',
    value_name='Weight',
)

Unnamed: 0,State,Fruit,Weight
0,Texas,Apple,12
1,Arizona,Apple,9
2,Florida,Apple,0
3,Texas,Orange,10
4,Arizona,Orange,7
5,Florida,Orange,14
6,Texas,Banana,40
7,Arizona,Banana,12
8,Florida,Banana,190


In [11]:
# With no arguments every column (State included) is melted into generic
# 'variable' / 'value' columns.
pd.melt(state_fruit2)

Unnamed: 0,variable,value
0,State,Texas
1,State,Arizona
2,State,Florida
3,Apple,12
4,Apple,9
5,Apple,0
6,Orange,10
7,Orange,7
8,Orange,14
9,Banana,40


In [12]:
# Only the identifier is given: all remaining columns are melted and the
# output columns keep the default names 'variable' and 'value'.
pd.melt(state_fruit2, id_vars='State')

Unnamed: 0,State,variable,value
0,Texas,Apple,12
1,Arizona,Apple,9
2,Florida,Apple,0
3,Texas,Orange,10
4,Arizona,Orange,7
5,Florida,Orange,14
6,Texas,Banana,40
7,Arizona,Banana,12
8,Florida,Banana,190


# Stacking multiple groups of variables simultaneously

In [27]:
# Load the movie dataset and list its columns to locate the actor fields.
movie = pd.read_csv(
    'data/movie.csv',
)
movie.columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

In [33]:
# Keep the movie title plus the name / facebook-likes pair for each of the
# three billed actors.
actor = movie.loc[
    :,
    [
        'movie_title',
        'actor_1_name',
        'actor_1_facebook_likes',
        'actor_2_name',
        'actor_2_facebook_likes',
        'actor_3_name',
        'actor_3_facebook_likes',
    ],
]
actor.head()

Unnamed: 0,movie_title,actor_1_name,actor_1_facebook_likes,actor_2_name,actor_2_facebook_likes,actor_3_name,actor_3_facebook_likes
0,Avatar,CCH Pounder,1000.0,Joel David Moore,936.0,Wes Studi,855.0
1,Pirates of the Caribbean: At World's End,Johnny Depp,40000.0,Orlando Bloom,5000.0,Jack Davenport,1000.0
2,Spectre,Christoph Waltz,11000.0,Rory Kinnear,393.0,Stephanie Sigman,161.0
3,The Dark Knight Rises,Tom Hardy,27000.0,Christian Bale,23000.0,Joseph Gordon-Levitt,23000.0
4,Star Wars: Episode VII - The Force Awakens,Doug Walker,131.0,Rob Walker,12.0,,


In [36]:
def change_col_name(col_name):
    """Rewrite a column name so its numeric group id becomes a trailing suffix.

    Two rewrites are applied, in order:

    1. Every ``'_name'`` substring is removed
       (``'actor_1_name'`` -> ``'actor_1'``).
    2. If the name has the form ``<stub>_<digits>_facebook<rest>``, the digits
       are moved to the end
       (``'actor_1_facebook_likes'`` -> ``'actor_facebook_likes_1'``).

    Names that do not match the second pattern (for example
    ``'director_facebook_likes'`` or ``'movie_title'``) are returned without
    any reordering, so the function is safe to apply to every column.

    Parameters
    ----------
    col_name : str
        Original column label.

    Returns
    -------
    str
        The rewritten column label.
    """
    col_name = col_name.replace('_name', '')

    # Split around the '_facebook' marker instead of slicing at a fixed
    # offset, so the stub may have any length.
    prefix, marker, rest = col_name.partition('_facebook')
    if marker:
        stub, sep, group_id = prefix.rpartition('_')
        # Only reorder when a purely numeric token sits right before the marker.
        if sep and group_id.isdigit():
            col_name = stub + marker + rest + '_' + group_id
    return col_name

In [37]:
# Apply change_col_name to every column label so the actor number ends up as
# a suffix of each name.
actor2 = actor.rename(change_col_name, axis='columns')
actor2.head()

Unnamed: 0,movie_title,actor_1,actor_facebook_likes_1,actor_2,actor_facebook_likes_2,actor_3,actor_facebook_likes_3
0,Avatar,CCH Pounder,1000.0,Joel David Moore,936.0,Wes Studi,855.0
1,Pirates of the Caribbean: At World's End,Johnny Depp,40000.0,Orlando Bloom,5000.0,Jack Davenport,1000.0
2,Spectre,Christoph Waltz,11000.0,Rory Kinnear,393.0,Stephanie Sigman,161.0
3,The Dark Knight Rises,Tom Hardy,27000.0,Christian Bale,23000.0,Joseph Gordon-Levitt,23000.0
4,Star Wars: Episode VII - The Force Awakens,Doug Walker,131.0,Rob Walker,12.0,,


In [37]:
# Stack both column groups at once: columns named '<stub>_<n>' are collected
# under their stub, and <n> goes into the new 'actor_num' index level.
stubs = ['actor', 'actor_facebook_likes']
actor2_tidy = pd.wide_to_long(
    actor2,
    stubnames=stubs,
    i=['movie_title'],
    j='actor_num',
    sep='_',
)
actor2_tidy.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,actor,actor_facebook_likes
movie_title,actor_num,Unnamed: 2_level_1,Unnamed: 3_level_1
Avatar,1,CCH Pounder,1000.0
Pirates of the Caribbean: At World's End,1,Johnny Depp,40000.0
Spectre,1,Christoph Waltz,11000.0
The Dark Knight Rises,1,Tom Hardy,27000.0
Star Wars: Episode VII - The Force Awakens,1,Doug Walker,131.0


In [39]:
# Load a small wide table whose value columns (a1, b2, d, e) share no stub.
df = pd.read_csv(
    'data/stackme.csv',
)
df

Unnamed: 0,State,Country,a1,b2,Test,d,e
0,TX,US,0.45,0.3,Test1,2,6
1,MA,US,0.03,1.2,Test2,9,7
2,ON,CAN,0.7,4.2,Test3,4,2


In [47]:
# Give the value columns a '<stub>_<label>' shape so wide_to_long can split
# them; suffix='.+' lets the label part be any non-empty text, not only digits.
df2 = df.rename(
    columns={
        'a1': 'group1_a1',
        'b2': 'group1_b2',
        'd': 'group2_a1',
        'e': 'group2_b2',
    }
)
pd.wide_to_long(
    df2,
    stubnames=['group1', 'group2'],
    i=['State', 'Country', 'Test'],
    j='Label',
    sep='_',
    suffix='.+',
).reset_index()

Unnamed: 0,State,Country,Test,Label,group1,group2
0,TX,US,Test1,a1,0.45,2
1,TX,US,Test1,b2,0.3,6
2,MA,US,Test2,a1,0.03,9
3,MA,US,Test2,b2,1.2,7
4,ON,CAN,Test3,a1,0.7,4
5,ON,CAN,Test3,b2,4.2,2


# Inverting stacked data

In [39]:
def usecol_func(x):
    """Return True for the institution-name column and every 'UGDS_' column."""
    return x == 'INSTNM' or 'UGDS_' in x


# Read only the selected columns and index the frame by institution name.
college = pd.read_csv(
    'data/college.csv',
    usecols=usecol_func,
    index_col='INSTNM',
)
college.head()

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.0333,0.9353,0.0055,0.0019,0.0024,0.0019,0.0,0.0059,0.0138
University of Alabama at Birmingham,0.5922,0.26,0.0283,0.0518,0.0022,0.0007,0.0368,0.0179,0.01
Amridge University,0.299,0.4192,0.0069,0.0034,0.0,0.0,0.0,0.0,0.2715
University of Alabama in Huntsville,0.6988,0.1255,0.0382,0.0376,0.0143,0.0002,0.0172,0.0332,0.035
Alabama State University,0.0158,0.9208,0.0121,0.0019,0.001,0.0006,0.0098,0.0243,0.0137


In [54]:
# Stack the race columns under the institution index, name both index levels,
# and flatten everything into a three-column frame.
(
    college
    .stack()
    .rename_axis(['InstituteName', 'Race'])
    .reset_index(name='Percentage')
)


Unnamed: 0,InstituteName,Race,Percentage
0,Alabama A & M University,UGDS_WHITE,0.0333
1,Alabama A & M University,UGDS_BLACK,0.9353
2,Alabama A & M University,UGDS_HISP,0.0055
3,Alabama A & M University,UGDS_ASIAN,0.0019
4,Alabama A & M University,UGDS_AIAN,0.0024
...,...,...,...
61861,Coastal Pines Technical College,UGDS_AIAN,0.0034
61862,Coastal Pines Technical College,UGDS_NHPI,0.0017
61863,Coastal Pines Technical College,UGDS_2MOR,0.0191
61864,Coastal Pines Technical College,UGDS_NRA,0.0028


In [57]:
# Same reshaping via melt: INSTNM has to be a column first, and rows with a
# missing value are dropped afterwards.
(
    college
    .reset_index()
    .melt(id_vars='INSTNM', var_name="Race", value_name="Percentage")
    .dropna(how='any')
)


Unnamed: 0,INSTNM,Race,Percentage
0,Alabama A & M University,UGDS_WHITE,0.0333
1,University of Alabama at Birmingham,UGDS_WHITE,0.5922
2,Amridge University,UGDS_WHITE,0.2990
3,University of Alabama in Huntsville,UGDS_WHITE,0.6988
4,Alabama State University,UGDS_WHITE,0.0158
...,...,...,...
67439,Hollywood Institute of Beauty Careers-West Pal...,UGDS_UNKN,0.0909
67440,Hollywood Institute of Beauty Careers-Casselberry,UGDS_UNKN,0.0667
67441,Coachella Valley Beauty College-Beaumont,UGDS_UNKN,0.0000
67442,Dewey University-Mayaguez,UGDS_UNKN,0.0000


In [63]:
# Round trip: melt to long form, then pivot back to one column per race.
# `college` is indexed by INSTNM (it was read with index_col='INSTNM'), so the
# institution name is moved back into a column with reset_index() before
# melting; melt only accepts existing columns as id_vars and would otherwise
# raise a KeyError on a fresh kernel.
(
    college
    .reset_index()
    .melt(id_vars='INSTNM', value_name="Percentage", var_name="Race")
    .pivot(index='INSTNM', columns='Race', values='Percentage')
)

Race,UGDS_2MOR,UGDS_AIAN,UGDS_ASIAN,UGDS_BLACK,UGDS_HISP,UGDS_NHPI,UGDS_NRA,UGDS_UNKN,UGDS_WHITE
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
A & W Healthcare Educators,0.0000,0.0000,0.0000,0.9750,0.0250,0.0000,0.0000,0.0000,0.0000
A T Still University of Health Sciences,,,,,,,,,
ABC Beauty Academy,0.0000,0.0000,0.9333,0.0333,0.0333,0.0000,0.0000,0.0000,0.0000
ABC Beauty College Inc,0.0000,0.0000,0.0000,0.6579,0.0526,0.0000,0.0000,0.0000,0.2895
AI Miami International University of Art and Design,0.0018,0.0000,0.0018,0.0198,0.4773,0.0000,0.0025,0.4644,0.0324
...,...,...,...,...,...,...,...,...,...
Yukon Beauty College Inc,0.0000,0.1200,0.0000,0.0400,0.0000,0.0400,0.0000,0.0000,0.8000
Z Hair Academy,0.0211,0.0000,0.0000,0.0000,0.0211,0.0105,0.0000,0.0105,0.9368
Zane State College,0.0218,0.0029,0.0029,0.0296,0.0029,0.0005,0.0000,0.2399,0.6995
duCret School of Arts,0.0976,0.0000,0.0732,0.1951,0.1463,0.0000,0.0000,0.0244,0.4634


# Unstacking after a groupby aggregation

In [60]:
# Load the employee table; the first CSV column is used as the row index.
employee = pd.read_csv(
    "data/employee.csv",
    index_col=0,
)
employee.head()

Unnamed: 0_level_0,POSITION_TITLE,DEPARTMENT,BASE_SALARY,RACE,EMPLOYMENT_TYPE,GENDER,EMPLOYMENT_STATUS,HIRE_DATE,JOB_DATE
UNIQUE_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,ASSISTANT DIRECTOR (EX LVL),Municipal Courts Department,121862.0,Hispanic/Latino,Full Time,Female,Active,2006-06-12,2012-10-13
1,LIBRARY ASSISTANT,Library,26125.0,Hispanic/Latino,Full Time,Female,Active,2000-07-19,2010-09-18
2,POLICE OFFICER,Houston Police Department-HPD,45279.0,White,Full Time,Male,Active,2015-02-03,2015-02-03
3,ENGINEER/OPERATOR,Houston Fire Department (HFD),63166.0,White,Full Time,Male,Active,1982-02-08,1991-05-25
4,ELECTRICIAN,General Services Department,56347.0,White,Full Time,Male,Active,1989-06-19,1994-10-22


In [63]:
# Mean base salary per race as a one-column frame; casting to int32 discards
# the fractional part.
(
    employee
    .groupby('RACE')['BASE_SALARY']
    .agg(['mean'])
    .astype(np.int32)
)

Unnamed: 0_level_0,mean
RACE,Unnamed: 1_level_1
American Indian or Alaskan Native,60272
Asian/Pacific Islander,61660
Black or African American,50137
Hispanic/Latino,52345
Others,51278
White,64419


In [68]:
# Mean base salary for every (race, gender) pair; the result is a Series with
# a two-level index.
agg = (
    employee
    .groupby(['RACE', 'GENDER'])['BASE_SALARY']
    .mean()
    .astype('int32')
)
agg

RACE                               GENDER
American Indian or Alaskan Native  Female    60238
                                   Male      60305
Asian/Pacific Islander             Female    63226
                                   Male      61033
Black or African American          Female    48915
                                   Male      51082
Hispanic/Latino                    Female    46503
                                   Male      54782
Others                             Female    63785
                                   Male      38771
White                              Female    66793
                                   Male      63940
Name: BASE_SALARY, dtype: int32

In [73]:
# Pivot the GENDER index level into columns and highlight the larger of the
# two values in each row.
(
    agg
    .unstack('GENDER')
    .style
    .highlight_max(axis='columns')
)

GENDER,Female,Male
RACE,Unnamed: 1_level_1,Unnamed: 2_level_1
American Indian or Alaskan Native,60238,60305
Asian/Pacific Islander,63226,61033
Black or African American,48915,51082
Hispanic/Latino,46503,54782
Others,63785,38771
White,66793,63940
