# transformations

Overview of all transformations supported by AVATAR.

In [1]:
%reload_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np

Load some datasets.

In [2]:
nba = pd.read_csv("../../data/raw/demo/nba.csv")
display(nba.head())

automobile = pd.read_csv("../../data/raw/demo/automobile.csv", header=None)
display(automobile.head())

titanic = pd.read_csv("../../data/raw/demo/titanic.csv")
display(titanic.head())

Unnamed: 0,SEASON_ID,TEAM_ABBREVIATION,PLAYER_ID,PLAYER,NUM,POSITION,HEIGHT,WEIGHT,BIRTH_DATE,PLAYER_AGE,...,FG3_PCT,FTM,FTA,FT_PCT,REB,AST,STL,BLK,PTS,a_lbl
0,2018-19,LAL,1628398,Kyle Kuzma,0,F,6-9,220,"JUL 24, 1995",23,...,0.303,188,250,0.752,382,178,41,26,1308,0
1,2018-19,LAL,203484,Kentavious Caldwell-Pope,1,G,6-5,205,"FEB 18, 1993",26,...,0.347,137,158,0.867,238,110,73,13,938,0
2,2018-19,LAL,1628366,Lonzo Ball,2,G,6-6,190,"OCT 27, 1997",21,...,0.329,20,48,0.417,251,255,69,19,465,0
3,2018-19,LAL,1628404,Josh Hart,3,G,6-5,215,"MAR 06, 1995",24,...,0.336,55,80,0.688,248,93,64,40,525,0
4,2018-19,LAL,1627936,Alex Caruso,4,G,6-11,186,"FEB 28, 1994",25,...,0.48,51,64,0.797,67,77,24,9,229,0


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
from avatar.language import WranglingLanguage
from avatar.filter import *
from avatar.expand import Expander
from avatar.settings import Settings

Settings.verbose = False


def demo(transformation, column):
    expander = Expander(WranglingLanguage([transformation]))
    expander.prune_transformation = MissingFilter(0.9)
    display(expander.expand(column.to_frame()).head())


# demo(ExtractNumberK, nba.SEASON_ID)

## String

String to string transformations.

In [16]:
from avatar.transformations.string import Split


Split.arguments(nba.BIRTH_DATE)

{(' ',), (', ',)}

In [18]:
from avatar.transformations.string import SplitAlign

In [19]:
SplitAlign.arguments(nba.POSITION)

[('-',)]

In [4]:
from avatar.transformations.string import ExtractWord

demo(ExtractWord, titanic.Name)

Unnamed: 0,Name,"ExtractWord([Dr, Master, Miss, Mr, Mrs, Rev])(Name)_0"
0,"Braund, Mr. Owen Harris",Mr
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",Mrs
2,"Heikkinen, Miss. Laina",Miss
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",Mrs
4,"Allen, Mr. William Henry",Mr


In [5]:
from avatar.transformations.string import ExtractBoolean

demo(ExtractBoolean, titanic.Name)

Unnamed: 0,Name,ExtractBoolean(Miss)(Name)_Name,ExtractBoolean(Mrs)(Name)_Name,ExtractBoolean(Mr)(Name)_Name,ExtractBoolean(William)(Name)_Name
0,"Braund, Mr. Owen Harris",False,False,True,False
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",False,True,False,False
2,"Heikkinen, Miss. Laina",True,False,False,False
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",False,True,False,False
4,"Allen, Mr. William Henry",False,False,True,True


In [54]:
ExtractBoolean("Miss")(titanic.Name)

Unnamed: 0,Name
0,False
1,False
2,True
3,False
4,False
...,...
886,False
887,True
888,True
889,False


In [53]:
ExtractWord.arguments(titanic.Name)

[({'Dr', 'Master', 'Miss', 'Mr', 'Mrs', 'Rev'},)]

In [33]:
ew = ExtractWord(ewa[0])

In [87]:
titanic.Cabin.drop_duplicates()

0       NaN
1       C85
3      C123
6       E46
10       G6
       ... 
857     E17
867     A24
879     C50
887     B42
889    C148
Name: Cabin, Length: 148, dtype: object

In [34]:
ew(titanic.Name)

Unnamed: 0,0
0,Mr
1,Mr
2,Miss
3,Mr
4,Mr
...,...
882,Rev
883,Miss
884,Miss
885,Mr


### ExtractNumber

Extract numbers from text.

In [26]:
from avatar.transformations.string import ExtractNumberPattern


demo(ExtractNumberPattern, nba["SEASON_ID"])

Unnamed: 0,SEASON_ID,ExtractNumberPattern((\d{4}))(SEASON_ID)_0
0,2018-19,2018.0
1,2018-19,2018.0
2,2018-19,2018.0
3,2018-19,2018.0
4,2018-19,2018.0


In [28]:
demo(ExtractNumberK, nba["BIRTH_DATE"])

Unnamed: 0,BIRTH_DATE,ExtractNumberK(0)(BIRTH_DATE)_0,ExtractNumberK(1)(BIRTH_DATE)_1
0,"JUL 24, 1995",24.0,1995.0
1,"FEB 18, 1993",18.0,1993.0
2,"OCT 27, 1997",27.0,1997.0
3,"MAR 06, 1995",6.0,1995.0
4,"FEB 28, 1994",28.0,1994.0


## semantic

In [19]:
from avatar.transformations.semantic import WordToNumber

In [57]:
import itertools

list(itertools.product(*words))

[('alfa',
  'audi',
  'bmw',
  'chevrolet',
  'dodge',
  'honda',
  'isuzu',
  'jaguar',
  'mazda',
  'benz',
  'mercury',
  'mitsubishi',
  'nissan',
  'peugot',
  'plymouth',
  'porsche',
  'renault',
  'saab',
  'subaru',
  'toyota',
  'volkswagen',
  'volvo'),
 ('alfa',
  'audi',
  'bmw',
  'chevrolet',
  'dodge',
  'honda',
  'isuzu',
  'jaguar',
  'mazda',
  'mercedes',
  'mercury',
  'mitsubishi',
  'nissan',
  'peugot',
  'plymouth',
  'porsche',
  'renault',
  'saab',
  'subaru',
  'toyota',
  'volkswagen',
  'volvo'),
 ('romero',
  'audi',
  'bmw',
  'chevrolet',
  'dodge',
  'honda',
  'isuzu',
  'jaguar',
  'mazda',
  'benz',
  'mercury',
  'mitsubishi',
  'nissan',
  'peugot',
  'plymouth',
  'porsche',
  'renault',
  'saab',
  'subaru',
  'toyota',
  'volkswagen',
  'volvo'),
 ('romero',
  'audi',
  'bmw',
  'chevrolet',
  'dodge',
  'honda',
  'isuzu',
  'jaguar',
  'mazda',
  'mercedes',
  'mercury',
  'mitsubishi',
  'nissan',
  'peugot',
  'plymouth',
  'porsche',
  '

In [59]:
display(WordToNumber.arguments(automobile[5]))
display(WordToNumber.arguments(automobile[4]))

w = WordToNumber()
w(automobile[5])

[()]

[]

Unnamed: 0,5
0,2.0
1,2.0
2,2.0
3,4.0
4,4.0
...,...
200,4.0
201,4.0
202,4.0
203,4.0


In [21]:
WordToNumber()(merch['merchant_info_subtitle'])

Unnamed: 0,merchant_info_subtitle
0,
1,
2,
3,
4,
...,...
153,
154,
155,
156,


## type

### numerical

Can only apply if column contains numerical data.

In [54]:
from avatar.transformations.type import Numerical

Numerical.arguments(nba.HEIGHT)

[]

In [55]:
Numerical.arguments(nba.WEIGHT)

AttributeError: Can only use .str accessor with string values!

In [158]:
nba.WEIGHT.dtype

dtype('int64')

In [159]:
nba.FG3_PCT.dtype

dtype('float64')

In [163]:
def p(v: str):
    return v + "_"

automobile.brand.map(p)

0      alfa-romero_
1      alfa-romero_
2      alfa-romero_
3             audi_
4             audi_
           ...     
200          volvo_
201          volvo_
202          volvo_
203          volvo_
204          volvo_
Name: brand, Length: 205, dtype: category
Categories (22, object): [alfa-romero_, audi_, bmw_, chevrolet_, ..., subaru_, toyota_, volkswagen_, volvo_]

In [164]:
automobile.brand.str.split("-")

0      [alfa, romero]
1      [alfa, romero]
2      [alfa, romero]
3              [audi]
4              [audi]
            ...      
200           [volvo]
201           [volvo]
202           [volvo]
203           [volvo]
204           [volvo]
Name: brand, Length: 205, dtype: object

## encoding

Change encoding of variables.

In [6]:
from avatar.transformations.encoding import OneHot, NaN

In [64]:
OneHot.arguments(automobile[2])

True


[()]

In [8]:
NaN.arguments(automobile[2])

toyota 32
nissan 18
mazda 17
mitsubishi 13
honda 13
subaru 12
volkswagen 12
volvo 11
peugot 11
dodge 9
mercedes-benz 8
bmw 8
audi 7
plymouth 7
saab 6
porsche 5
isuzu 4
alfa-romero 3
chevrolet 3
jaguar 3
renault 2
mercury 1


[('alfa-romero',),
 ('audi',),
 ('bmw',),
 ('chevrolet',),
 ('dodge',),
 ('honda',),
 ('isuzu',),
 ('jaguar',),
 ('mazda',),
 ('mercedes-benz',),
 ('mercury',),
 ('mitsubishi',),
 ('nissan',),
 ('peugot',),
 ('plymouth',),
 ('porsche',),
 ('renault',),
 ('saab',),
 ('subaru',),
 ('toyota',),
 ('volkswagen',),
 ('volvo',)]

In [10]:
NaN.arguments(automobile[1])

[('?',),
 ('161',),
 ('91',),
 ('150',),
 ('128',),
 ('104',),
 ('134',),
 ('74',),
 ('65',),
 ('102',),
 ('103',),
 ('168',),
 ('95',),
 ('85',),
 ('94',),
 ('148',),
 ('118',),
 ('122',),
 ('106',),
 ('93',),
 ('83',),
 ('125',),
 ('115',),
 ('137',),
 ('154',),
 ('101',),
 ('119',),
 ('129',),
 ('192',),
 ('87',),
 ('194',),
 ('164',),
 ('89',),
 ('81',),
 ('158',),
 ('153',),
 ('113',),
 ('110',),
 ('145',),
 ('188',),
 ('197',),
 ('108',)]

### drop

In [3]:
from avatar.transformations.special import Drop
from avatar.language import WranglingTransformation

In [18]:
wt = WranglingTransformation("PassengerId", Drop(), replace=True)
wt(titanic)

True


Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C
