## Required imports

In [0]:
import pyspark.sql.functions as F
import pyspark.sql.types as T
import pandas as pd

## Sparkora initialization

In [0]:
from Sparkora import Sparkora

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 3.2.0
      /_/

Using Python version 3.8.10 (default, Sep 28 2021 16:10:42)
Spark context Web UI available at http://10.172.203.167:48830
Spark context available as 'sc' (master = local[8], app id = local-1636888245786).
SparkSession available as 'spark'.


## Creating the spark sample dataframe

In [0]:
# Load the sample CSV with pandas, then convert it to a Spark DataFrame.
# The URL intentionally carries no `?token=...` query parameter: access tokens
# are credentials and must not be committed inside a notebook.
# NOTE(review): if the repository is private, pass credentials through the
# environment rather than hard-coding them in the URL.
sample_pdf = pd.read_csv("https://raw.githubusercontent.com/Spratiher9/Sparkora/0.0.1/Sparkora/test_data.csv")
# `df` is only ever bound to the Spark DataFrame; the pandas frame keeps its own name.
df = spark.createDataFrame(sample_pdf)

In [0]:
# Render the sample Spark DataFrame as a table (see the output below).
# NOTE(review): `.display()` looks like a Databricks notebook helper rather than
# a stock PySpark DataFrame method — confirm the target runtime.
df.display()

A,B,C,D,useless_feature
1,2.0,0,left,1
4,,1,right,1
7,8.0,2,left,1


# Demos from README

## Configuring `Sparkora` with the sample data

In [0]:
# Two-step setup: build an empty Sparkora instance, then attach the sample
# frame and declare column 'A' as the output column.
sparkora = Sparkora()
sparkora.configure(data=df, output='A')

In [0]:
# `sparkora.data` holds the configured frame; the output below shows it is
# unchanged from the input at this point.
display(sparkora.data)

A,B,C,D,useless_feature
1,2.0,0,left,1
4,,1,right,1
7,8.0,2,left,1


## Cleaning

In [0]:
# Three-row toy frame for the cleaning demo; the None entries are the gaps
# that the imputation step below fills in.
rows = [(1, 2, 100), (2, None, 200), (1, 6, None)]
df = spark.createDataFrame(rows, "_c0 int, _c1 int, _c2 int")

In [0]:
# One-step setup: the constructor takes the frame and the output column directly.
sparkora = Sparkora(data=df, output='_c0')
sparkora.data.display()

_c0,_c1,_c2
1,2.0,100.0
2,,200.0
1,6.0,


### Imputation

In [0]:
# Fill the missing cells in place on `sparkora.data`. Called without arguments,
# the output below matches a column-mean fill (_c1: 4, _c2: 150).
sparkora.impute_missing_values() # Imputation strategy can be passed here: mean, median, mode
sparkora.data.display()

_c0,_c1,_c2
1,2,100
2,4,200
1,6,150


### Scaling the dataset

In [0]:
# Rescale the input columns; in the output below _c1 and _c2 land in [0, 1]
# while the output column _c0 keeps its original values.
sparkora.scale_input_values()
sparkora.data.display()

_c0,_c1,_c2
1,0.0,0.0
2,0.5,1.0
1,1.0,0.5


## Feature Selection & Extraction

In [0]:
# Reload the sample CSV for the feature-engineering demos.
# The URL intentionally carries no `?token=...` query parameter: access tokens
# are credentials and must not be committed inside a notebook.
# NOTE(review): if the repository is private, pass credentials through the
# environment rather than hard-coding them in the URL.
sample_pdf = pd.read_csv("https://raw.githubusercontent.com/Spratiher9/Sparkora/0.0.1/Sparkora/test_data.csv")
# `df` is only ever bound to the Spark DataFrame; the pandas frame keeps its own name.
df = spark.createDataFrame(sample_pdf)
df.display()

A,B,C,D,useless_feature
1,2.0,0,left,1
4,,1,right,1
7,8.0,2,left,1


In [0]:
# Fresh Sparkora instance over the reloaded sample frame, with 'A' as the output column.
sparkora = Sparkora()
sparkora.configure(data=df, output='A')
sparkora.data.display()

A,B,C,D,useless_feature
1,2.0,0,left,1
4,,1,right,1
7,8.0,2,left,1


### Removing feature

In [0]:
# Drop the 'useless_feature' column from `sparkora.data` (it is absent from the output below).
sparkora.remove_feature('useless_feature')
sparkora.data.display()

A,B,C,D
1,2.0,0,left
4,,1,right
7,8.0,2,left


### Extracting an ordinal feature through one-hot encoding

In [0]:
# Replace the string column 'D' with one indicator column per distinct value
# ('D=left', 'D=right' in the output below).
sparkora.extract_ordinal_feature('D')
sparkora.data.display()

A,B,C,D=left,D=right
1,2.0,0,1,0
4,,1,0,1
7,8.0,2,1,0


### Feature creation from existing feature(s)

#### single feature input

In [0]:
# Integer-typed UDF that doubles its argument; applied to column 'C' it
# produces the new column 'twoC'.
f_udf = F.udf(lambda value: value * 2, returnType=T.IntegerType())
sparkora.extract_feature('C', 'twoC', f_udf)
sparkora.data.display()

A,B,C,D=left,D=right,twoC
1,2.0,0,1,0,0
4,,1,0,1,2
7,8.0,2,1,0,4


#### multi feature input

In [0]:
# Integer-typed UDF over two inputs; here it multiplies columns 'C' and 'A'
# to produce 'newC'.
# NOTE(review): arguments are presumably passed in the order of the column list — confirm.
f_udf = F.udf(lambda left, right: left * right, returnType=T.IntegerType())
sparkora.extract_feature(['C', 'A'], 'newC', f_udf)
sparkora.data.display()

A,B,C,D=left,D=right,twoC,newC
1,2.0,0,1,0,0,0
4,,1,0,1,2,4
7,8.0,2,1,0,4,14


## Visualizing a feature against the output feature

In [0]:
# Show the 'newC' feature alongside the output column 'A' (rendered below as a two-column table).
sparkora.plot_feature('newC')

newC,A
0,1
4,4
14,7


## Model Validation

In [0]:
# Populate `sparkora.training_data` and `sparkora.validating_data`.
# NOTE(review): 0.8 is presumably the training fraction — confirm against the Sparkora docs.
# With this three-row sample, the cells below show every row in the training split.
sparkora.set_training_and_validation(0.8)

In [0]:
# Training split: X holds the input columns, y holds only the output column ('A').
X = sparkora.training_data.select(sparkora.input_columns())
y = sparkora.training_data.select(sparkora.output)
X.display()
y.display()

B,C,D=left,D=right,twoC,newC
2.0,0,1,0,0,0
,1,0,1,2,4
8.0,2,1,0,4,14


A
1
4
7


In [0]:
# Validation split, bound to its own names so the training X / y from the
# previous cell are not overwritten and stay available for later use.
X_val = sparkora.validating_data.select(sparkora.input_columns())
y_val = sparkora.validating_data.select(sparkora.output)
# Both frames render empty here: on the three-row sample no rows end up in the
# validation split.
X_val.display()
y_val.display()

B,C,D=left,D=right,twoC,newC


A


## Versioning transformations and states, and logging the steps in each state

In [0]:
# Reload the sample CSV for the snapshot / logging demo.
# The URL intentionally carries no `?token=...` query parameter: access tokens
# are credentials and must not be committed inside a notebook.
# NOTE(review): if the repository is private, pass credentials through the
# environment rather than hard-coding them in the URL.
sample_pdf = pd.read_csv("https://raw.githubusercontent.com/Spratiher9/Sparkora/0.0.1/Sparkora/test_data.csv")
# `df` is only ever bound to the Spark DataFrame; the pandas frame keeps its own name.
df = spark.createDataFrame(sample_pdf)
df.display()

A,B,C,D,useless_feature
1,2.0,0,left,1
4,,1,right,1
7,8.0,2,left,1


In [0]:
# Fresh Sparkora instance for the snapshot demo, again with 'A' as the output column.
sparkora = Sparkora()
sparkora.configure(data=df, output='A')
sparkora.data.display()

A,B,C,D,useless_feature
1,2.0,0,left,1
4,,1,right,1
7,8.0,2,left,1


In [0]:
# Save the current, untransformed state under the name 'initial_data' so it can be restored later.
sparkora.snapshot('initial_data')

In [0]:
# Apply four transformations in sequence; each call mutates `sparkora.data`
# and is appended to `sparkora.logs` (printed in the next cell).
sparkora.remove_feature('useless_feature')
sparkora.extract_ordinal_feature('D')
sparkora.impute_missing_values()  # logged as impute_missing_values(mean)
sparkora.scale_input_values()
sparkora.data.display()

A,B,C,D=left,D=right
1,0.0,0,1.0,0.0
4,0.5,1,0.0,1.0
7,1.0,2,1.0,0.0


In [0]:
# Inspect the recorded transformation steps for the current state.
sparkora.logs

Out[39]: ["self.remove_feature('useless_feature')",
 "self.extract_ordinal_feature('D')",
 'self.impute_missing_values(mean)',
 'self.scale_input_values()']

In [0]:
# Save the transformed state under a second name, 'transform1'.
sparkora.snapshot('transform1')

In [0]:
# Switch back to the 'initial_data' snapshot; the output below shows the
# original columns, including 'useless_feature', restored.
sparkora.use_snapshot('initial_data')
sparkora.data.display()

A,B,C,D,useless_feature
1,2.0,0,left,1
4,,1,right,1
7,8.0,2,left,1


In [0]:
# The logs follow the active snapshot: after restoring 'initial_data', only the
# snapshot call itself is listed (see output).
sparkora.logs

Out[42]: ["self.snapshot('initial_data')"]

In [0]:
# Switch forward to the 'transform1' snapshot; the transformed columns reappear in the output below.
sparkora.use_snapshot('transform1')
sparkora.data.display()

A,B,C,D=left,D=right
1,0.0,0,1.0,0.0
4,0.5,1,0.0,1.0
7,1.0,2,1.0,0.0


In [0]:
# Logs for 'transform1': the four transformation steps plus the snapshot call that saved them.
sparkora.logs

Out[44]: ["self.remove_feature('useless_feature')",
 "self.extract_ordinal_feature('D')",
 'self.impute_missing_values(mean)',
 'self.scale_input_values()',
 "self.snapshot('transform1')"]