In [None]:
!pip install tensorflow-transform

Collecting tensorflow-transform
  Downloading tensorflow_transform-1.17.0-py3-none-any.whl.metadata (13 kB)
Collecting apache-beam<3,>=2.53 (from apache-beam[gcp]<3,>=2.53; python_version >= "3.11"->tensorflow-transform)
  Downloading apache_beam-2.66.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting pyarrow<11,>=10 (from tensorflow-transform)
  Downloading pyarrow-10.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydot<2,>=1.2 (from tensorflow-transform)
  Downloading pydot-1.4.2-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting tensorflow<2.18,>=2.17 (from tensorflow-transform)
  Downloading tensorflow-2.17.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
INFO: pip is looking at multiple versions of tensorflow-transform to determine which version is compatible with other requirements. This could take a while.
Collecting tensorflow-transform
  Downloading tensorflow_transf

In [None]:
import tempfile
import pandas as pd
import tensorflow as tf
import tensorflow_transform as tft
import tensorflow_transform.beam.impl as tft_beam

from tensorflow_transform.tf_metadata import schema_utils
from tensorflow_transform.tf_metadata import dataset_metadata

In [None]:
# Load the raw pollution measurements from the CSV next to the notebook.
dataset = pd.read_csv(filepath_or_buffer="pollution_small.csv")

In [None]:
# Preview the first rows of the raw frame to check the columns that were loaded.
dataset.head()

Unnamed: 0,Date,pm10,no2,so2,soot
0,1/1/2009,98.67,14.1,44.38,34.81
1,1/2/2009,52.33,14.1,29.75,33.06
2,1/3/2009,74.67,20.5,36.25,39.25
3,1/4/2009,72.0,17.3,46.44,34.38
4,1/5/2009,81.0,25.64,56.56,45.59


In [None]:
# Keep only the numeric pollutant columns; the Date column is not transformed.
features = dataset.drop(columns=["Date"])

In [None]:
# Confirm that the Date column is gone and only the pollutant columns remain.
features.head()

Unnamed: 0,pm10,no2,so2,soot
0,98.67,14.1,44.38,34.81
1,52.33,14.1,29.75,33.06
2,74.67,20.5,36.25,39.25
3,72.0,17.3,46.44,34.38
4,81.0,25.64,56.56,45.59


In [None]:
# Convert the frame to one {column: value} dict per row, the in-memory format
# passed to AnalyzeAndTransformDataset below. orient="records" yields that list
# directly, without building (and discarding) an index-keyed dict first, and it
# does not depend on the index being unique.
dict_features = features.to_dict(orient="records")

In [None]:
# Peek at the first two records to verify the per-row dict layout.
dict_features[:2]

[{'pm10': 98.67, 'no2': 14.1, 'so2': 44.38, 'soot': 34.81},
 {'pm10': 52.33, 'no2': 14.1, 'so2': 29.75, 'soot': 33.06}]

In [None]:
# Describe the raw input for tf.Transform: each pollutant column is declared
# as a fixed-length scalar float32 feature.
data_metadata = dataset_metadata.DatasetMetadata(
    schema_utils.schema_from_feature_spec({
        feature_name: tf.io.FixedLenFeature([], tf.float32)
        for feature_name in ("pm10", "no2", "so2", "soot")
    })
)

In [None]:
# Display the metadata object to inspect the schema generated from the feature spec.
data_metadata

{'_schema': feature {
  name: "no2"
  presence {
    min_fraction: 1
  }
  shape {
  }
  type: FLOAT
}
feature {
  name: "pm10"
  presence {
    min_fraction: 1
  }
  shape {
  }
  type: FLOAT
}
feature {
  name: "so2"
  presence {
    min_fraction: 1
  }
  shape {
  }
  type: FLOAT
}
feature {
  name: "soot"
  presence {
    min_fraction: 1
  }
  shape {
  }
  type: FLOAT
}
, '_output_record_batches': True}

### Preprocessing Function

In [None]:
def preprocessing_fn(inputs):
  """Preprocess the raw pollution features for tf.Transform.

  NO2 and SO2 are mean-centered; PM10 and soot are min-max scaled.

  Args:
    inputs: dict mapping the raw feature names ('pm10', 'no2', 'so2',
      'soot') to their input tensors.

  Returns:
    Dict with the keys 'no2_normalized', 'so2_normalized',
    'pm10_normalized' and 'soot_normalized' holding the transformed tensors.
  """
  # Each feature must be read from its own key. PM10 was previously read from
  # 'no2', which made 'pm10_normalized' a rescaled copy of the NO2 column.
  pm10 = inputs['pm10']
  no2 = inputs['no2']
  so2 = inputs['so2']
  soot = inputs['soot']

  # Center on the mean returned by the tft.mean analyzer.
  no2_normalized = no2 - tft.mean(no2)
  so2_normalized = so2 - tft.mean(so2)

  # Min-max scaling; scale_by_min_max is called with its default output range.
  pm10_normalized = tft.scale_to_0_1(pm10)
  soot_normalized = tft.scale_by_min_max(soot)

  return {
      "no2_normalized": no2_normalized,
      "so2_normalized": so2_normalized,
      "pm10_normalized": pm10_normalized,
      "soot_normalized": soot_normalized,
  }

In [None]:
def data_transform():
  """Run the tf.Transform pipeline on the in-memory rows and print the result.

  Applies `preprocessing_fn` to the module-level `dict_features` (described by
  `data_metadata`) via AnalyzeAndTransformDataset, then prints every raw row
  next to its transformed counterpart. Returns nothing.
  """
  # TemporaryDirectory removes the scratch directory when the block exits;
  # tempfile.mkdtemp() left a new directory behind on every call.
  with tempfile.TemporaryDirectory() as temp_dir:
    with tft_beam.Context(temp_dir=temp_dir):
      # The second element (the transform function) is not used here.
      transformed_dataset, _ = (
          (dict_features, data_metadata)
          | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn)
      )

  # The transformed dataset is a (rows, metadata) pair; only the rows are printed.
  transformed_data, _ = transformed_dataset

  for raw_row, transformed_row in zip(dict_features, transformed_data):
    print("Raw:", raw_row)
    print("Transformed:", transformed_row)

In [None]:
# Run the pipeline; prints each raw row followed by its transformed values.
data_transform()





Raw: {'pm10': 98.67, 'no2': 14.1, 'so2': 44.38, 'soot': 34.81}
Transformed: {'no2_normalized': -18.57798194885254, 'pm10_normalized': 0.033990807831287384, 'so2_normalized': 28.85540771484375, 'soot_normalized': 0.283423513174057}
Raw: {'pm10': 52.33, 'no2': 14.1, 'so2': 29.75, 'soot': 33.06}
Transformed: {'no2_normalized': -18.57798194885254, 'pm10_normalized': 0.033990807831287384, 'so2_normalized': 14.225406646728516, 'soot_normalized': 0.26620757579803467}
Raw: {'pm10': 74.67, 'no2': 20.5, 'so2': 36.25, 'soot': 39.25}
Transformed: {'no2_normalized': -12.177982330322266, 'pm10_normalized': 0.0838855654001236, 'so2_normalized': 20.725406646728516, 'soot_normalized': 0.32710281014442444}
Raw: {'pm10': 72.0, 'no2': 17.3, 'so2': 46.44, 'soot': 34.38}
Transformed: {'no2_normalized': -15.377983093261719, 'pm10_normalized': 0.05893817916512489, 'so2_normalized': 30.9154052734375, 'soot_normalized': 0.27919331192970276}
Raw: {'pm10': 81.0, 'no2': 25.64, 'so2': 56.56, 'soot': 45.59}
Transfor