## Imports
<p>pandas is the only external library used; random comes from the Python standard library.  cmlmaker is the library used to build the CML structure.</p>

In [1]:
from cmlmaker import structure,inobj,outobj, oneHotEncoding,normalize, replaceValue,ops
import pandas as pd
import random

## Creating fake structured data

In [2]:
# Number of synthetic rows to generate.
l=20
# Column "two" is drawn uniformly from [msize, tsize).  The same bounds are
# reused further down for min-max scaling, so the generator must use msize
# rather than a separate literal.
size=75;msize=25;tsize=msize+size
# Column "five" is drawn uniformly from [0, size2).
size2=123.45
df=pd.DataFrame(
    data={
        # two-valued categorical column
        "one":[random.choice(['married','not married']) for _ in range(l)],
        # continuous column bounded by msize/tsize
        "two":[msize+random.random()*size for _ in range(l)],
        # four-valued categorical column
        "three":[random.choice(["red","blue","green","yellow"]) for _ in range(l)],
        # binary column, already 0/1
        "four":[random.randint(0,1) for _ in range(l)],
        # continuous column bounded by size2
        "five":[random.random()*size2 for _ in range(l)]
    }
)
df

Unnamed: 0,one,two,three,four,five
0,not married,75.927883,red,1,46.088701
1,married,28.272705,blue,0,26.328188
2,married,82.223517,blue,0,83.711239
3,married,66.269656,yellow,0,115.771886
4,not married,58.25386,yellow,0,16.887458
5,married,29.890392,yellow,0,21.27416
6,married,59.646717,red,1,50.465497
7,not married,48.952579,blue,1,94.156436
8,not married,54.401495,green,0,99.842336
9,married,74.825914,blue,1,17.035407


### Initializing a CML structure

In [3]:
# Start a CML structure, giving it a name and a human-readable description.
cml = structure(
    name="structuredClean",
    description="Cleaning some basic structured data",
)

<p>The next step is to define the data that is coming in (the fake data created above).  Since we are inputting a dataframe, the type will be "map", and we can label the data however we like; let's call it "df".</p>

<p>To add an input to your initialized structure, use the method addInput.  Using the Python command `help` on cml.addInput, we can see that the input is an inobj object.  Using `help` on inobj then shows that the type and label are required to initialize it.</p>

In [4]:
# addInput takes an inobj instance, so inspect both the method and the class
# to see which arguments are required.
help(cml.addInput)
help(inobj)

Help on method addInput in module cmlmaker:

addInput(inobj) method of cmlmaker.structure instance

Help on class inobj in module cmlmaker:

class inobj(builtins.object)
 |  the structure of an input object has type,label, dim, and shape
 |  type (1st) and label(2nd) are required with dim and shape optional
 |  
 |  Methods defined here:
 |  
 |  __init__(self, typ, label, dim=None, shape=None)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  make_map(self)
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors defined here:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)



<p>Therefore we can add the input with the following commands.</p>

In [5]:
# The dataframe is declared as a "map" input and labelled "df".
typ = "map"
label = "df"
# addInput returns the structure, so rebind cml to the result.
cml = cml.addInput(inobj(typ=typ, label=label))

### Operations
<p>Now we need to add the operations that will transform the data.</p>

#### oneHotEncoding

In [None]:
# Performing a one-hot encoding for 5 classes (only 4 occur in the input data)
colors=["red","blue","green","yellow","purple"]
for color in colors:
    # 1 where column 'three' holds this color, 0 everywhere else
    df[color]=(df['three']==color).astype('int64')
# The categorical source column is no longer needed once it has been encoded.
df=df.drop(columns='three')

In [15]:
# Show which keys oneHotEncoding accepts in its `inputs` and `params` objects,
# which are needed to build the operation below.
help(oneHotEncoding)

Help on class oneHotEncoding in module cmlmaker:

class oneHotEncoding(operation)
 |  convert categorical vector into a set of vectors for each category with a 0/1
 |  
 |  Method resolution order:
 |      oneHotEncoding
 |      operation
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, inputs=None, params=None, output=None)
 |      Initialize oneHotEncoding operation and define inputs, parameters, and outputs
 |  
 |  ----------------------------------------------------------------------
 |  Data and other attributes defined here:
 |  
 |  inputs = <class 'cmlmaker.oneHotEncoding.inputs'>
 |      inputs's possible keys:
 |              data - 2D table to be converted to map
 |  
 |  params = <class 'cmlmaker.oneHotEncoding.params'>
 |      params's possible keys:
 |              inputColumns - the columns to which one Hot Encodding should be applied
 |              outputColumns - list of keys for map that correspond to 0 to n columns in table
 |         

In [6]:
# Add the one-hot-encoding operation to the CML structure.  "$df" refers to
# the input labelled "df"; the result is written back under the same label.
# NOTE(review): `keepOrig` is not among the params keys shown in the
# help(oneHotEncoding) output (inputColumns, outputColumns) -- confirm it is
# supported.  The pandas version of this step drops column 'three', whereas
# keepOrig=True presumably keeps it -- TODO confirm the two are meant to match.
cml=cml.addOps(
    ops.cleaning.oneHotEncoding(                                                        # operation being added
        oneHotEncoding.inputs("$df"),                                                   # inputs object: the data to encode
        oneHotEncoding.params(inputColumns="three",outputColumns=colors,keepOrig=True), # params object: source column and output column names
        output="df"                                                                     # label under which the result is stored
    )
)

#### Replacing values
<p>Column 'one' has two categories: married and not married.  Since this is effectively a boolean categorization, we can substitute 1 for married and 0 for not married.</p>

In [16]:
# Encode marital status as a flag: 1 for "married", 0 for anything else.
df['one']=(df['one']=="married").astype('int64')

In [8]:
# Min-max scale column 'two' using the msize/tsize bounds defined with the fake data.
df.loc[:,'two']=df['two'].sub(msize).div(tsize-msize)

# Scale column 'five' by size2, the upper bound it was generated with.
df.loc[:,'five']=df['five'].div(size2)

df

Unnamed: 0,one,two,four,five,red,blue,green,yellow,purple
0,0,0.679038,1,0.373339,1,0,0,0,0
1,1,0.043636,0,0.21327,0,1,0,0,0
2,1,0.76298,0,0.678098,0,1,0,0,0
3,1,0.550262,0,0.937804,0,0,0,1,0
4,0,0.443385,0,0.136796,0,0,0,1,0
5,1,0.065205,0,0.17233,0,0,0,1,0
6,1,0.461956,1,0.408793,1,0,0,0,0
7,0,0.319368,1,0.762709,0,1,0,0,0
8,0,0.39202,0,0.808767,0,0,1,0,0
9,1,0.664346,1,0.137994,0,1,0,0,0


In [9]:
# List the operation categories grouped under cmlmaker's `ops` class.
help(ops)

Help on class ops in module cmlmaker:

class ops(builtins.object)
 |  Data descriptors defined here:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)
 |  
 |  ----------------------------------------------------------------------
 |  Data and other attributes defined here:
 |  
 |  cleaning = <class 'cmlmaker.ops.cleaning'>
 |      Operations that fall best under data cleaning
 |  
 |  image_processing = <class 'cmlmaker.ops.image_processing'>
 |      Image processing related operations
 |  
 |  math = <class 'cmlmaker.ops.math'>
 |      Math based operations
 |  
 |  nlp = <class 'cmlmaker.ops.nlp'>
 |      Natural Language Processing(NLP) related operations
 |  
 |  restructuring = <class 'cmlmaker.ops.restructuring'>
 |      Operations that restructure data (pivot, join, etc.)
 |  
 |  string_processing = <class 'cmlmaker.ops.string_processing'>
 |      Operations that act on stri

In [10]:
# List the operations available in the data-cleaning category.
help(ops.cleaning)

Help on class cleaning in module cmlmaker:

class cleaning(builtins.object)
 |  Operations that fall best under data cleaning
 |  
 |  Data descriptors defined here:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)
 |  
 |  ----------------------------------------------------------------------
 |  Data and other attributes defined here:
 |  
 |  apply = <class 'cmlmaker.ops.cleaning.apply'>
 |      apply a function to every value in a vector or key in a map
 |  
 |  concatMap = <class 'cmlmaker.ops.cleaning.concatMap'>
 |      takes an array of maps and combines them into one.
 |  
 |  ifin = <class 'cmlmaker.ops.cleaning.ifin'>
 |      Given 2 arrays returns the new array with the elements of the first array only if they appear in the second array as well.
 |  
 |  ifnotin = <class 'cmlmaker.ops.cleaning.ifnotin'>
 |      Given 2 arrays returns the new array with the elements of the

In [11]:
# Show the oneHotEncoding operation as exposed through ops.cleaning, including
# the inputs/params keys it inherits.
help(ops.cleaning.oneHotEncoding)

Help on class oneHotEncoding in module cmlmaker:

class oneHotEncoding(oneHotEncoding)
 |  convert categorical vector into a set of vectors for each category with a 0/1
 |  
 |  Method resolution order:
 |      oneHotEncoding
 |      oneHotEncoding
 |      operation
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, inputs=None, params=None, output=None)
 |      Initialize oneHotEncoding operation and define inputs, parameters, and outputs
 |  
 |  ----------------------------------------------------------------------
 |  Data and other attributes inherited from oneHotEncoding:
 |  
 |  inputs = <class 'cmlmaker.oneHotEncoding.inputs'>
 |      inputs's possible keys:
 |              data - 2D table to be converted to map
 |  
 |  params = <class 'cmlmaker.oneHotEncoding.params'>
 |      params's possible keys:
 |              inputColumns - the columns to which one Hot Encodding should be applied
 |              outputColumns - list of keys for map that corr

In [12]:
#creating a base structure with name/description
#cml=structure("test","testing the structure class")

#cml=cml.addInput(inobj("string","paragraph"))
#cml=cml.addOps(ops.math.norm(norm.inputs("$paragraph"),norm.params(1),output="n"))
#cml=cml.addOps(ops.math.norm(norm.inputs("$n"),output="n2"))
#cml=cml.addOps(ops.math.normalize(normalize.inputs("$n",7.6),output="normalize3"))
#cml=cml.addOps(ops.math.scale(scale.inputs("$paragraph",7.6),output="scaler4"))
#cml=cml.addOps(ops.math.mean(mean.inputs("$normalize3"),mean.params(1),output="mean5"))
#help(mean.inputs)
#help(mean.params)
#cml=cml.addOutput(outobj("map",{"Out":"$n2"}))
#
##printing json and list of vars used
#print(cml)
#print("\n")
#print(cml.listVars())

In [13]:
# Show the constructor arguments and the methods available on a CML structure.
help(structure)

Help on class structure in module cmlmaker:

class structure(builtins.object)
 |  Object that holds (and builds) a cml class
 |  
 |  Methods defined here:
 |  
 |  __init__(self, name='name', description='description', version='0.0.0', createdDate=datetime.datetime(2019, 11, 5, 12, 34, 8, 339563))
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  __repr__(self)
 |      Return repr(self).
 |  
 |  addInput(self, inobj)
 |  
 |  addOps(self, ops, order=-1)
 |  
 |  addOutput(self, outobj)
 |  
 |  listVars(self)
 |  
 |  make_map(self)
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors defined here:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)



In [14]:
# NOTE(review): `reshape` is not among the names imported from cmlmaker at the
# top of the notebook, so this call raises NameError (see the recorded output).
# It presumably needs to be imported or reached through `ops` -- TODO confirm
# where cmlmaker exposes it, or remove this cell.
help(reshape)

NameError: name 'reshape' is not defined

In [None]:
# Show the documentation for the normalize operation in the math category.
help(ops.math.normalize)

In [None]:
# Build a second, minimal structure with one string input and one string output.
# NOTE: this rebinds `cml`, so the "structuredClean" structure built earlier in
# the notebook is no longer reachable through that name.
cml=structure(name="test2",description="blah")
# inobj is the input-object class imported from cmlmaker; arguments are (type, label).
cml=cml.addInput(inobj("string","knife"))
cml=cml.addOutput(outobj("string","$out"))

In [None]:
# Print the structure; the text comes from structure.__repr__
# (presumably the JSON form of the CML -- TODO confirm).
print(cml)

In [None]:
# Show the documentation for the output-object class that addOutput expects.
help(outobj)