# transformations

Overview of all transformations supported by AVATAR.

In [1]:
%reload_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np

Load some datasets.

In [2]:
nba = pd.read_csv("../../data/raw/demo/nba.csv")
display(nba.head())

automobile = pd.read_csv("../../data/raw/demo/automobile.csv", header=None)
display(automobile.head())

titanic = pd.read_csv("../../data/raw/demo/titanic.csv")
display(titanic.head())

Unnamed: 0,SEASON_ID,TEAM_ABBREVIATION,PLAYER_ID,PLAYER,NUM,POSITION,HEIGHT,WEIGHT,BIRTH_DATE,PLAYER_AGE,...,FG3_PCT,FTM,FTA,FT_PCT,REB,AST,STL,BLK,PTS,a_lbl
0,2018-19,LAL,1628398,Kyle Kuzma,0,F,6-9,220,"JUL 24, 1995",23,...,0.303,188,250,0.752,382,178,41,26,1308,0
1,2018-19,LAL,203484,Kentavious Caldwell-Pope,1,G,6-5,205,"FEB 18, 1993",26,...,0.347,137,158,0.867,238,110,73,13,938,0
2,2018-19,LAL,1628366,Lonzo Ball,2,G,6-6,190,"OCT 27, 1997",21,...,0.329,20,48,0.417,251,255,69,19,465,0
3,2018-19,LAL,1628404,Josh Hart,3,G,6-5,215,"MAR 06, 1995",24,...,0.336,55,80,0.688,248,93,64,40,525,0
4,2018-19,LAL,1627936,Alex Caruso,4,G,6-11,186,"FEB 28, 1994",25,...,0.48,51,64,0.797,67,77,24,9,229,0


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
from avatar.language import WranglingLanguage
from avatar.filter import *
from avatar.expand import Expander
from avatar.settings import Settings

Settings.verbose = False


def demo(transformation, column):
    expander = Expander(WranglingLanguage([transformation]))
    expander.prune_transformation = MissingFilter(0.9)
    display(expander.expand(column.to_frame()).head())


# demo(ExtractNumberK, nba.SEASON_ID)

## String

String to string transformations.

In [16]:
from avatar.transformations.string import Split


Split.arguments(nba.BIRTH_DATE)

{(' ',), (', ',)}

In [18]:
from avatar.transformations.string import SplitAlign

In [19]:
SplitAlign.arguments(nba.POSITION)

[('-',)]

In [4]:
from avatar.transformations.string import ExtractWord

demo(ExtractWord, titanic.Name)

Unnamed: 0,Name,"ExtractWord([Dr, Master, Miss, Mr, Mrs, Rev])(Name)_0"
0,"Braund, Mr. Owen Harris",Mr
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",Mrs
2,"Heikkinen, Miss. Laina",Miss
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",Mrs
4,"Allen, Mr. William Henry",Mr


In [5]:
from avatar.transformations.string import ExtractBoolean

demo(ExtractBoolean, titanic.Name)

Unnamed: 0,Name,ExtractBoolean(Miss)(Name)_Name,ExtractBoolean(Mrs)(Name)_Name,ExtractBoolean(Mr)(Name)_Name,ExtractBoolean(William)(Name)_Name
0,"Braund, Mr. Owen Harris",False,False,True,False
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",False,True,False,False
2,"Heikkinen, Miss. Laina",True,False,False,False
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",False,True,False,False
4,"Allen, Mr. William Henry",False,False,True,True


In [54]:
ExtractBoolean("Miss")(titanic.Name)

Unnamed: 0,Name
0,False
1,False
2,True
3,False
4,False
...,...
886,False
887,True
888,True
889,False


In [53]:
ExtractWord.arguments(titanic.Name)

[({'Dr', 'Master', 'Miss', 'Mr', 'Mrs', 'Rev'},)]

In [33]:
ew = ExtractWord(ewa[0])

In [87]:
titanic.Cabin.drop_duplicates()

0       NaN
1       C85
3      C123
6       E46
10       G6
       ... 
857     E17
867     A24
879     C50
887     B42
889    C148
Name: Cabin, Length: 148, dtype: object

In [34]:
ew(titanic.Name)

Unnamed: 0,0
0,Mr
1,Mr
2,Miss
3,Mr
4,Mr
...,...
882,Rev
883,Miss
884,Miss
885,Mr


### ExtractNumber

Extract numbers from text.

In [26]:
from avatar.transformations.string import ExtractNumberPattern


demo(ExtractNumberPattern, nba["SEASON_ID"])

Unnamed: 0,SEASON_ID,ExtractNumberPattern((\d{4}))(SEASON_ID)_0
0,2018-19,2018.0
1,2018-19,2018.0
2,2018-19,2018.0
3,2018-19,2018.0
4,2018-19,2018.0


In [28]:
demo(ExtractNumberK, nba["BIRTH_DATE"])

Unnamed: 0,BIRTH_DATE,ExtractNumberK(0)(BIRTH_DATE)_0,ExtractNumberK(1)(BIRTH_DATE)_1
0,"JUL 24, 1995",24.0,1995.0
1,"FEB 18, 1993",18.0,1993.0
2,"OCT 27, 1997",27.0,1997.0
3,"MAR 06, 1995",6.0,1995.0
4,"FEB 28, 1994",28.0,1994.0


## semantic

In [32]:
from avatar.transformations.semantic import TimeFeatures

In [31]:
TimeFeatures()(nba.BIRTH_DATE)

Unnamed: 0,A,w,-d,B,-m,-y,Y,-H,-I,p,-M,-S,f,z,Z,-j,U,W
0,Monday,1,24,July,7,95,1995,0,12,AM,0,0,0,,,205,30,30
1,Thursday,4,18,February,2,93,1993,0,12,AM,0,0,0,,,49,7,7
2,Monday,1,27,October,10,97,1997,0,12,AM,0,0,0,,,300,43,43
3,Monday,1,6,March,3,95,1995,0,12,AM,0,0,0,,,65,10,10
4,Monday,1,28,February,2,94,1994,0,12,AM,0,0,0,,,59,9,9
5,Saturday,6,2,October,10,82,1982,0,12,AM,0,0,0,,,275,39,39
6,Wednesday,3,5,September,9,90,1990,0,12,AM,0,0,0,,,248,35,36
7,Tuesday,2,19,January,1,88,1988,0,12,AM,0,0,0,,,19,3,3
8,Saturday,6,22,February,2,86,1986,0,12,AM,0,0,0,,,53,7,7
9,Sunday,0,9,April,4,95,1995,0,12,AM,0,0,0,,,99,15,14


In [18]:
import dateparser

In [20]:
dateparser.parse('0')

In [22]:
dateparser.parse('test') == None

True

In [24]:
nba.BIRTH_DATE.head(100)

0     JUL 24, 1995
1     FEB 18, 1993
2     OCT 27, 1997
3     MAR 06, 1995
4     FEB 28, 1994
5     OCT 02, 1982
6     SEP 05, 1990
7     JAN 19, 1988
8     FEB 22, 1986
9     APR 09, 1995
10    SEP 02, 1997
11    APR 26, 1997
12    NOV 08, 1999
13    MAY 22, 1995
14    DEC 30, 1984
15    JUL 01, 1991
16    MAR 16, 1991
Name: BIRTH_DATE, dtype: object