# Feature Extraction and Transformation

### TF-IDF 

In [4]:
from pyspark.sql import SparkSession 
from pyspark.ml.feature import HashingTF, IDF, Tokenizer 
# Obtain the Spark session used by the rest of the notebook.
spark = SparkSession.builder.appName("TF-IDF Example").getOrCreate()

# Five short documents, each paired with an integer id.
data = [
    (0, "This is the first document"),
    (1, "This document is the second document"),
    (2, "And this is the third one"),
    (3, "Is this the first document?"),
    (4, "The last document is the fifth one"),
]
df = spark.createDataFrame(data, schema=["id", "text"])
df.show(truncate=False)

+---+------------------------------------+
|id |text                                |
+---+------------------------------------+
|0  |This is the first document          |
|1  |This document is the second document|
|2  |And this is the third one           |
|3  |Is this the first document?         |
|4  |The last document is the fifth one  |
+---+------------------------------------+



In [7]:
# Step 1: split each document's text into a list of word tokens.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
words_df = tokenizer.transform(df)

# Step 2: hash the tokens into a 15-slot term-frequency vector.
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=15)
featurized_df = hashingTF.transform(words_df)

# Step 3: fit IDF weights on the corpus and rescale the raw term frequencies.
idf = IDF(inputCol="rawFeatures", outputCol="features")
idf_model = idf.fit(featurized_df)
tfidf_df = idf_model.transform(featurized_df)

result_df = tfidf_df.select("id", "text", "features")
# show() prints the table itself and returns None, so wrapping it in print()
# would only append a stray "None" line after the table.
result_df.show(truncate=False, vertical=False)

+---+------------------------------------+--------------------------------------------------------------------------------------------------------+
|id |text                                |features                                                                                                |
+---+------------------------------------+--------------------------------------------------------------------------------------------------------+
|0  |This is the first document          |(15,[2,3,4,10,13],[0.0,0.1823215567939546,0.0,0.4054651081081644,0.6931471805599453])                   |
|1  |This document is the second document|(15,[2,3,4,6,10],[0.0,0.1823215567939546,0.0,0.6931471805599453,0.8109302162163288])                    |
|2  |And this is the third one           |(15,[0,1,2,3,4,6],[0.6931471805599453,0.6931471805599453,0.0,0.1823215567939546,0.0,0.6931471805599453])|
|3  |Is this the first document?         |(15,[1,2,3,4,13],[0.6931471805599453,0.0,0.1823215567939546,0.0,0.6931

### Word2Vec 

In [11]:
from pyspark.ml.feature import Word2Vec, Tokenizer 

# Pre-tokenized sample documents: each row pairs an id with a list of words.
data = [
    (0, ["apple", "banana", "orange", "grape"]),
    (1, ["apple", "banana", "cherry", "pear"]),
    (2, ["banana", "cherry", "grape", "kiwi"]),
    (3, ["apple", "pear", "kiwi", "orange"]),
    (4, ["cherry", "grape", "kiwi", "orange"]),
]

# Build the DataFrame that the Word2Vec estimator is fitted on.
df = spark.createDataFrame(data, schema=["id", "words"])
df.show(truncate=False)

+---+------------------------------+
|id |words                         |
+---+------------------------------+
|0  |[apple, banana, orange, grape]|
|1  |[apple, banana, cherry, pear] |
|2  |[banana, cherry, grape, kiwi] |
|3  |[apple, pear, kiwi, orange]   |
|4  |[cherry, grape, kiwi, orange] |
+---+------------------------------+



In [12]:
# Learn 3-dimensional word embeddings from the `words` column; the transform
# then writes one vector per document into `features`.
# minCount=0 keeps every word in the vocabulary, however rare.
# Word2Vec training is randomized, so the seed is pinned to make the vectors
# repeatable across notebook re-runs.
word2vec = Word2Vec(
    vectorSize=3,
    minCount=0,
    seed=42,
    inputCol="words",
    outputCol="features",
)
model = word2vec.fit(df)
result = model.transform(df)
result.select("id", "features").show(truncate=False)

+---+-----------------------------------------------------------------+
|id |features                                                         |
+---+-----------------------------------------------------------------+
|0  |[0.06720347004011273,0.051968781277537346,0.0366170909255743]    |
|1  |[-8.117086254060268E-4,0.10535624250769615,-0.015570234507322311]|
|2  |[-0.014742250088602304,0.021969905123114586,0.06884750723838806] |
|3  |[0.06535391230136156,0.12924792431294918,-0.022874346002936363]  |
|4  |[0.02541335765272379,0.046014076098799706,0.07840530946850777]   |
+---+-----------------------------------------------------------------+



### CountVectorizer 

In [16]:
# Pre-tokenized sample documents: each row pairs an id with a list of words.
data = [
    (0, ["apple", "banana", "orange", "grape"]),
    (1, ["apple", "banana", "cherry", "pear"]),
    (2, ["banana", "cherry", "grape", "kiwi"]),
    (3, ["apple", "pear", "kiwi", "orange"]),
    (4, ["cherry", "grape", "kiwi", "orange"]),
]
df = spark.createDataFrame(data, ["id", "words"])
# show() prints the table and returns None; calling it directly avoids the
# stray "None" line that print(df.show(...)) produced.
df.show(truncate=False)

+---+------------------------------+
|id |words                         |
+---+------------------------------+
|0  |[apple, banana, orange, grape]|
|1  |[apple, banana, cherry, pear] |
|2  |[banana, cherry, grape, kiwi] |
|3  |[apple, pear, kiwi, orange]   |
|4  |[cherry, grape, kiwi, orange] |
+---+------------------------------+

None


In [17]:
from pyspark.ml.feature import CountVectorizer, Tokenizer 
# Fit a vocabulary capped at 7 terms on `words`, then encode every row as a
# sparse vector of term counts.
cv = CountVectorizer(vocabSize=7, inputCol="words", outputCol="features")
model = cv.fit(df)
result = model.transform(df)

result.select("id", "features").show(truncate=False)

+---+-------------------------------+
|id |features                       |
+---+-------------------------------+
|0  |(7,[0,1,4,5],[1.0,1.0,1.0,1.0])|
|1  |(7,[0,2,5,6],[1.0,1.0,1.0,1.0])|
|2  |(7,[0,1,2,3],[1.0,1.0,1.0,1.0])|
|3  |(7,[3,4,5,6],[1.0,1.0,1.0,1.0])|
|4  |(7,[1,2,3,4],[1.0,1.0,1.0,1.0])|
+---+-------------------------------+



### FeatureHasher 

In [14]:
from pyspark.sql import SparkSession 
from pyspark.ml.feature import FeatureHasher, Tokenizer 
# Obtain the Spark session for this example.
spark = SparkSession.builder.appName("FeatureHasher Example").getOrCreate()

# Each row has an id followed by three categorical string columns.
data = [
    (0, "apple", "banana", "orange"),
    (1, "apple", "banana", "cherry"),
    (2, "banana", "cherry", "grape"),
    (3, "apple", "pear", "kiwi"),
    (4, "cherry", "grape", "kiwi"),
]
df = spark.createDataFrame(data, ["id", "feature1", "feature2", "feature3"])
# show() prints the table and returns None; calling it directly avoids the
# stray "None" line that print(df.show(...)) produced.
df.show(truncate=False)

+---+--------+--------+--------+
|id |feature1|feature2|feature3|
+---+--------+--------+--------+
|0  |apple   |banana  |orange  |
|1  |apple   |banana  |cherry  |
|2  |banana  |cherry  |grape   |
|3  |apple   |pear    |kiwi    |
|4  |cherry  |grape   |kiwi    |
+---+--------+--------+--------+

None


In [15]:
# Hash the three categorical columns into a single 10-slot sparse vector.
hasher = FeatureHasher(
    inputCols=["feature1", "feature2", "feature3"],
    outputCol="features",
    numFeatures=10,
)
result = hasher.transform(df)
# show() prints the table and returns None; calling it directly avoids the
# stray "None" line that print(result.show(...)) produced.
result.show(truncate=False)

+---+--------+--------+--------+--------------------------+
|id |feature1|feature2|feature3|features                  |
+---+--------+--------+--------+--------------------------+
|0  |apple   |banana  |orange  |(10,[3,7,9],[1.0,1.0,1.0])|
|1  |apple   |banana  |cherry  |(10,[3,6,7],[1.0,1.0,1.0])|
|2  |banana  |cherry  |grape   |(10,[0,4,7],[1.0,1.0,1.0])|
|3  |apple   |pear    |kiwi    |(10,[0,7,8],[1.0,1.0,1.0])|
|4  |cherry  |grape   |kiwi    |(10,[0,2,6],[1.0,1.0,1.0])|
+---+--------+--------+--------+--------------------------+

None


### VectorAssembler 

In [18]:
from pyspark.ml.feature import VectorAssembler 
# Three rows of four integer columns. `columns` is read again by the
# VectorAssembler cell, so it stays a named variable.
data = [
    (1, 2, 3, 4),
    (5, 6, 7, 8),
    (9, 10, 11, 12),
]
columns = ["feature1", "feature2", "feature3", "feature4"]
df = spark.createDataFrame(data, schema=columns)
df.show()

+--------+--------+--------+--------+
|feature1|feature2|feature3|feature4|
+--------+--------+--------+--------+
|       1|       2|       3|       4|
|       5|       6|       7|       8|
|       9|      10|      11|      12|
+--------+--------+--------+--------+



In [19]:
# Concatenate all of the input columns into one vector column, `features`.
assembler = VectorAssembler(outputCol="features", inputCols=columns)
output_df = assembler.transform(df)

output_df.show()

+--------+--------+--------+--------+--------------------+
|feature1|feature2|feature3|feature4|            features|
+--------+--------+--------+--------+--------------------+
|       1|       2|       3|       4|   [1.0,2.0,3.0,4.0]|
|       5|       6|       7|       8|   [5.0,6.0,7.0,8.0]|
|       9|      10|      11|      12|[9.0,10.0,11.0,12.0]|
+--------+--------+--------+--------+--------------------+



### StringIndexer 

In [20]:
from pyspark.ml.feature import StringIndexer 
# (category, value) pairs; category B occurs three times, A and C twice, D once.
data = [
    ("A", 10),
    ("A", 20),
    ("B", 30),
    ("B", 20),
    ("B", 30),
    ("C", 40),
    ("C", 10),
    ("D", 10),
]
columns = ["Categories", "Value"]
df = spark.createDataFrame(data, schema=columns)
df.show()

+----------+-----+
|Categories|Value|
+----------+-----+
|         A|   10|
|         A|   20|
|         B|   30|
|         B|   20|
|         B|   30|
|         C|   40|
|         C|   10|
|         D|   10|
+----------+-----+



In [22]:
# Map each string category to a numeric index. In the recorded output the
# most frequent label (B) receives index 0.0.
indexer = StringIndexer(outputCol="Categories_Indexed", inputCol="Categories")
indexerModel = indexer.fit(df)
indexed_df = indexerModel.transform(df)

indexed_df.show()

+----------+-----+------------------+
|Categories|Value|Categories_Indexed|
+----------+-----+------------------+
|         A|   10|               1.0|
|         A|   20|               1.0|
|         B|   30|               0.0|
|         B|   20|               0.0|
|         B|   30|               0.0|
|         C|   40|               2.0|
|         C|   10|               2.0|
|         D|   10|               3.0|
+----------+-----+------------------+



### OneHotEncoder 

In [23]:
from pyspark.ml.feature import OneHotEncoder 
# Two columns of category indices, already numeric and ready for encoding.
data = [
    (0.0, 1.0),
    (1.0, 0.0),
    (2.0, 1.0),
]
columns = ["input1", "input2"]
df = spark.createDataFrame(data, schema=columns)
df.show()

+------+------+
|input1|input2|
+------+------+
|   0.0|   1.0|
|   1.0|   0.0|
|   2.0|   1.0|
+------+------+



In [24]:
# One-hot encode both index columns in a single pass.
encoder = OneHotEncoder(
    inputCols=["input1", "input2"],
    outputCols=["output1", "output2"],
)
# fit() returns a fitted model, not a DataFrame. It gets its own name rather
# than temporarily living in `encoded_df` and being overwritten on the next line.
encoder_model = encoder.fit(df)
encoded_df = encoder_model.transform(df)
# In the recorded output the highest index of each column is encoded as an
# all-zero vector, e.g. 2.0 -> (2,[],[]).
encoded_df.select("output1", "output2").show(truncate=False)

+-------------+-------------+
|output1      |output2      |
+-------------+-------------+
|(2,[0],[1.0])|(1,[],[])    |
|(2,[1],[1.0])|(1,[0],[1.0])|
|(2,[],[])    |(1,[],[])    |
+-------------+-------------+



### Tokenizer and RegexTokenizer 

In [25]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer 
# Obtain the Spark session for this example.
spark = SparkSession.builder.appName("TokenizerExample").getOrCreate()

# A single sentence with '+' characters embedded inside some of its words.
data = [("Th+is is a sam+ple sent+ence.",)]
columns = ["text"]
df = spark.createDataFrame(data, schema=columns)
df.show()

+--------------------+
|                text|
+--------------------+
|Th+is is a sam+pl...|
+--------------------+



In [26]:
# Plain Tokenizer: the recorded output shows lower-cased tokens split on spaces.
tokenizer = Tokenizer(outputCol="tokens", inputCol="text")
tokenized_df = tokenizer.transform(df)

# RegexTokenizer with an escaped '+' pattern: the recorded output shows the
# text being split at each literal '+' instead of at spaces.
regex_tokenizer = RegexTokenizer(
    inputCol="text",
    outputCol="regex_tokens",
    pattern="\\+",
)
regex_tokenized_df = regex_tokenizer.transform(df)

# Display both tokenizations, one after the other, for comparison.
tokenized_df.select("tokens").show(truncate=False)
regex_tokenized_df.select("regex_tokens").show(truncate=False)

+-----------------------------------+
|tokens                             |
+-----------------------------------+
|[th+is, is, a, sam+ple, sent+ence.]|
+-----------------------------------+

+----------------------------------+
|regex_tokens                      |
+----------------------------------+
|[th, is is a sam, ple sent, ence.]|
+----------------------------------+



### StopWordsRemover 

In [28]:
from pyspark.sql import SparkSession 
from pyspark.ml.feature import StopWordsRemover, Tokenizer 
# Three one-column rows of raw text.
data = [
    ("This is the first sentence.",),
    ("And here's another sentence.",),
    ("A third sentence for the DataFrame.",),
]
columns = ["text"]
df = spark.createDataFrame(data, schema=columns)

# Add a `words` column; the StopWordsRemover cell reads it from `df`.
tokenizer = Tokenizer(outputCol="words", inputCol="text")
df = tokenizer.transform(df)
df.show(truncate=False)

+-----------------------------------+------------------------------------------+
|text                               |words                                     |
+-----------------------------------+------------------------------------------+
|This is the first sentence.        |[this, is, the, first, sentence.]         |
|And here's another sentence.       |[and, here's, another, sentence.]         |
|A third sentence for the DataFrame.|[a, third, sentence, for, the, dataframe.]|
+-----------------------------------+------------------------------------------+



In [29]:
# Drop stop words from `words`; the recorded output shows tokens such as
# "this", "is", "the" and "and" being removed.
stopwords_remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered_words",
)
filtered_df = stopwords_remover.transform(df)

filtered_df.select("filtered_words").show(truncate=False)

+-----------------------------+
|filtered_words               |
+-----------------------------+
|[first, sentence.]           |
|[another, sentence.]         |
|[third, sentence, dataframe.]|
+-----------------------------+



### Bucketizer 

In [30]:
from pyspark.sql import SparkSession 
from pyspark.ml.feature import Bucketizer 
from pyspark.sql.functions import col 
# (id, value) rows with continuous values to be binned.
data = [
    (0, 1.5),
    (1, 2.5),
    (2, 3.5),
    (3, 4.5),
    (4, 5.5),
]
columns = ["id", "value"]
df = spark.createDataFrame(data, schema=columns)
df.show()

+---+-----+
| id|value|
+---+-----+
|  0|  1.5|
|  1|  2.5|
|  2|  3.5|
|  3|  4.5|
|  4|  5.5|
+---+-----+



In [31]:
# Bucket boundaries: [0, 2) -> 0.0, [2, 4) -> 1.0, [4, +inf) -> 2.0,
# which matches the bucket ids in the recorded output.
splits = [0.0, 2.0, 4.0, float("inf")]
bucketizer = Bucketizer(inputCol="value", outputCol="bucket", splits=splits)
bucketized_df = bucketizer.transform(df)

bucketized_df.select("id", "value", "bucket").show()

+---+-----+------+
| id|value|bucket|
+---+-----+------+
|  0|  1.5|   0.0|
|  1|  2.5|   1.0|
|  2|  3.5|   1.0|
|  3|  4.5|   2.0|
|  4|  5.5|   2.0|
+---+-----+------+



### StandardScaler 

In [32]:
from pyspark.sql import SparkSession 
from pyspark.ml.feature import StandardScaler 
from pyspark.ml.linalg import Vectors 
# (id, dense 3-component feature vector) rows.
data = [
    (0, Vectors.dense([1.0, 0.1, -1.0])),
    (1, Vectors.dense([2.0, 1.1, 1.0])),
    (2, Vectors.dense([3.0, 10.1, 3.0])),
]
columns = ["id", "features"]
df = spark.createDataFrame(data, schema=columns)
df.show()

+---+--------------+
| id|      features|
+---+--------------+
|  0|[1.0,0.1,-1.0]|
|  1| [2.0,1.1,1.0]|
|  2|[3.0,10.1,3.0]|
+---+--------------+



In [33]:
# Standardize every feature: withMean centers it, withStd scales it.
# In the recorded output the first feature (1, 2, 3) becomes (-1, 0, 1).
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaled_features",
    withMean=True,
    withStd=True,
)
scaler_model = scaler.fit(df)
scaled_df = scaler_model.transform(df)

scaled_df.select("id", "features", "scaled_features").show(truncate=False)

+---+--------------+-------------------------------+
|id |features      |scaled_features                |
+---+--------------+-------------------------------+
|0  |[1.0,0.1,-1.0]|[-1.0,-0.6657502859356826,-1.0]|
|1  |[2.0,1.1,1.0] |[0.0,-0.4841820261350419,0.0]  |
|2  |[3.0,10.1,3.0]|[1.0,1.1499323120707245,1.0]   |
+---+--------------+-------------------------------+



### MinMaxScaler 

In [34]:
from pyspark.sql import SparkSession 
from pyspark.ml.feature import MinMaxScaler 
from pyspark.ml.linalg import Vectors 
# (id, dense 3-component feature vector) rows.
data = [
    (0, Vectors.dense([1.0, 0.1, -1.0])),
    (1, Vectors.dense([2.0, 1.1, 1.0])),
    (2, Vectors.dense([3.0, 10.1, 3.0])),
]
columns = ["id", "features"]
df = spark.createDataFrame(data, schema=columns)
df.show()

+---+--------------+
| id|      features|
+---+--------------+
|  0|[1.0,0.1,-1.0]|
|  1| [2.0,1.1,1.0]|
|  2|[3.0,10.1,3.0]|
+---+--------------+



In [35]:
# Rescale each feature using its column-wise minimum and maximum. In the
# recorded output the row holding every minimum is all zeros, shown as the
# sparse vector (3,[],[]), and the row holding every maximum is all ones.
scaler = MinMaxScaler(outputCol="scaled_features", inputCol="features")
scaler_model = scaler.fit(df)
scaled_df = scaler_model.transform(df)

scaled_df.select("id", "features", "scaled_features").show(truncate=False)

+---+--------------+---------------+
|id |features      |scaled_features|
+---+--------------+---------------+
|0  |[1.0,0.1,-1.0]|(3,[],[])      |
|1  |[2.0,1.1,1.0] |[0.5,0.1,0.5]  |
|2  |[3.0,10.1,3.0]|[1.0,1.0,1.0]  |
+---+--------------+---------------+



### Normalizer 

In [36]:
from pyspark.sql import SparkSession 
from pyspark.ml.feature import Normalizer 
from pyspark.ml.linalg import Vectors 
# (id, dense 3-component feature vector) rows.
data = [
    (0, Vectors.dense([1.0, 0.1, -1.0])),
    (1, Vectors.dense([2.0, 1.1, 1.0])),
    (2, Vectors.dense([3.0, 10.1, 3.0])),
]
columns = ["id", "features"]
df = spark.createDataFrame(data, schema=columns)
df.show()

+---+--------------+
| id|      features|
+---+--------------+
|  0|[1.0,0.1,-1.0]|
|  1| [2.0,1.1,1.0]|
|  2|[3.0,10.1,3.0]|
+---+--------------+



In [37]:
# Normalize each row vector with p=2.0 (the L2 norm). Normalizer is a pure
# transformer, so no fit step is needed.
normalizer = Normalizer(
    p=2.0,
    inputCol="features",
    outputCol="normalized_features",
)
normalized_df = normalizer.transform(df)

normalized_df.select("id", "features", "normalized_features").show(truncate=False)

+---+--------------+------------------------------------------------------------+
|id |features      |normalized_features                                         |
+---+--------------+------------------------------------------------------------+
|0  |[1.0,0.1,-1.0]|[0.7053456158585983,0.07053456158585983,-0.7053456158585983]|
|1  |[2.0,1.1,1.0] |[0.8025723539051279,0.4414147946478204,0.40128617695256397] |
|2  |[3.0,10.1,3.0]|[0.27384986857909926,0.9219612242163009,0.27384986857909926]|
+---+--------------+------------------------------------------------------------+



### PCA (Principal Component Analysis) 

In [38]:
from pyspark.sql import SparkSession 
from pyspark.ml.feature import PCA 
from pyspark.ml.linalg import Vectors 
# (id, dense 3-component feature vector) rows.
data = [
    (0, Vectors.dense([1.0, 0.1, -1.0])),
    (1, Vectors.dense([2.0, 1.1, 1.0])),
    (2, Vectors.dense([3.0, 10.1, 3.0])),
]
columns = ["id", "features"]
df = spark.createDataFrame(data, schema=columns)
df.show()

+---+--------------+
| id|      features|
+---+--------------+
|  0|[1.0,0.1,-1.0]|
|  1| [2.0,1.1,1.0]|
|  2|[3.0,10.1,3.0]|
+---+--------------+



In [39]:
# Reduce the 3-component feature vectors to k=2 principal components.
pca = PCA(inputCol="features", outputCol="pca_features", k=2)
pca_model = pca.fit(df)
pca_df = pca_model.transform(df)

pca_df.select("id", "pca_features").show(truncate=False)

+---+------------------------------------------+
|id |pca_features                              |
+---+------------------------------------------+
|0  |[0.06466700238304013,-0.45367188451874657]|
|1  |[-1.6616789696362084,1.284065030233573]   |
|2  |[-10.870750062210382,0.19181523649833387] |
+---+------------------------------------------+



### PolynomialExpansion 

In [40]:
from pyspark.sql import SparkSession 
from pyspark.ml.feature import PolynomialExpansion 
from pyspark.ml.linalg import Vectors 
# (id, dense 2-component feature vector) rows.
data = [
    (0, Vectors.dense([1.0, 2.0])),
    (1, Vectors.dense([2.0, 3.0])),
    (2, Vectors.dense([3.0, 4.0])),
]
columns = ["id", "features"]
df = spark.createDataFrame(data, schema=columns)
df.show()

+---+---------+
| id| features|
+---+---------+
|  0|[1.0,2.0]|
|  1|[2.0,3.0]|
|  2|[3.0,4.0]|
+---+---------+



In [41]:
# Degree-2 polynomial expansion. For an input [x, y] the recorded output is
# laid out as [x, x^2, y, x*y, y^2].
poly_expansion = PolynomialExpansion(
    degree=2,
    inputCol="features",
    outputCol="expanded_features",
)
expanded_df = poly_expansion.transform(df)

expanded_df.select("id", "expanded_features").show(truncate=False)

+---+-----------------------+
|id |expanded_features      |
+---+-----------------------+
|0  |[1.0,1.0,2.0,2.0,4.0]  |
|1  |[2.0,4.0,3.0,6.0,9.0]  |
|2  |[3.0,9.0,4.0,12.0,16.0]|
+---+-----------------------+



### Chi-Squared Selector 

In [42]:
from pyspark.sql import SparkSession 
from pyspark.ml.feature import ChiSqSelector 
from pyspark.ml.linalg import Vectors 
from pyspark.sql.functions import col 
# (id, feature vector, label) rows; the label takes the values 1.0 and 0.0.
data = [
    (0, Vectors.dense([1.0, 0.1, -1.0]), 1.0),
    (1, Vectors.dense([2.0, 1.1, 1.0]), 0.0),
    (2, Vectors.dense([3.0, 10.1, 3.0]), 0.0),
]
columns = ["id", "features", "label"]
df = spark.createDataFrame(data, schema=columns)
df.show()

+---+--------------+-----+
| id|      features|label|
+---+--------------+-----+
|  0|[1.0,0.1,-1.0]|  1.0|
|  1| [2.0,1.1,1.0]|  0.0|
|  2|[3.0,10.1,3.0]|  0.0|
+---+--------------+-----+



In [43]:
# Keep only the single top-ranked feature (numTopFeatures=1) with respect to
# `label`; in the recorded output the first component is the one retained.
selector = ChiSqSelector(
    featuresCol="features",
    labelCol="label",
    outputCol="selected_features",
    numTopFeatures=1,
)
selector_model = selector.fit(df)
selected_df = selector_model.transform(df)

selected_df.select("id", "selected_features").show(truncate=False)

+---+-----------------+
|id |selected_features|
+---+-----------------+
|0  |[1.0]            |
|1  |[2.0]            |
|2  |[3.0]            |
+---+-----------------+



### Vector Slicer 

In [44]:
from pyspark.sql import SparkSession 
from pyspark.ml.feature import VectorSlicer 
from pyspark.ml.linalg import Vectors 
# (id, dense 5-component feature vector) rows.
data = [
    (0, Vectors.dense([1.0, 2.0, 3.0, 4.0, 5.0])),
    (1, Vectors.dense([2.0, 3.0, 4.0, 5.0, 6.0])),
    (2, Vectors.dense([3.0, 4.0, 5.0, 6.0, 7.0])),
]
columns = ["id", "features"]
df = spark.createDataFrame(data, schema=columns)
df.show()

+---+--------------------+
| id|            features|
+---+--------------------+
|  0|[1.0,2.0,3.0,4.0,...|
|  1|[2.0,3.0,4.0,5.0,...|
|  2|[3.0,4.0,5.0,6.0,...|
+---+--------------------+



In [45]:
# Keep only the components at (0-based) positions 1, 3 and 4 of each vector.
slicer = VectorSlicer(
    indices=[1, 3, 4],
    inputCol="features",
    outputCol="selected_features",
)
sliced_df = slicer.transform(df)

sliced_df.select("id", "selected_features").show(truncate=False)

+---+-----------------+
|id |selected_features|
+---+-----------------+
|0  |[2.0,4.0,5.0]    |
|1  |[3.0,5.0,6.0]    |
|2  |[4.0,6.0,7.0]    |
+---+-----------------+



### RFormula 

In [46]:
from pyspark.sql import SparkSession 
from pyspark.ml.feature import RFormula 
# Obtain the Spark session for this example.
spark = SparkSession.builder.appName("RFormulaExample").getOrCreate()

# y: numeric response, x: numeric predictor, s: string predictor.
data = [
    (1.0, 1.0, "a"),
    (0.0, 2.0, "b"),
    (0.0, 0.0, "a"),
]
columns = ["y", "x", "s"]
df = spark.createDataFrame(data, schema=columns)
df.show()

+---+---+---+
|  y|  x|  s|
+---+---+---+
|1.0|1.0|  a|
|0.0|2.0|  b|
|0.0|0.0|  a|
+---+---+---+



In [47]:
# "y ~ x + s": y becomes `label`, while x and s are combined into `features`.
# In the recorded output the string column s is encoded as a 0/1 indicator.
rf = RFormula(formula="y ~ x + s")
model = rf.fit(df)
transformed_df = model.transform(df)

transformed_df.select("y", "x", "s", "features", "label").show(truncate=False)

+---+---+---+---------+-----+
|y  |x  |s  |features |label|
+---+---+---+---------+-----+
|1.0|1.0|a  |[1.0,1.0]|1.0  |
|0.0|2.0|b  |[2.0,0.0]|0.0  |
|0.0|0.0|a  |[0.0,1.0]|0.0  |
+---+---+---+---------+-----+



### UnivariateFeatureSelector 

In [48]:
from pyspark.sql import SparkSession 
from pyspark.ml.feature import UnivariateFeatureSelector 
from pyspark.ml.linalg import Vectors 
# (label, dense 3-component feature vector) rows.
data = [
    (1.0, Vectors.dense([1.0, 0.1, -1.0])),
    (0.0, Vectors.dense([2.0, 1.1, 1.0])),
    (0.0, Vectors.dense([3.0, 10.1, 3.0])),
]
columns = ["label", "features"]
df = spark.createDataFrame(data, schema=columns)
df.show()

+-----+--------------+
|label|      features|
+-----+--------------+
|  1.0|[1.0,0.1,-1.0]|
|  0.0| [2.0,1.1,1.0]|
|  0.0|[3.0,10.1,3.0]|
+-----+--------------+



In [49]:
# Configure the selector for continuous features against a categorical label.
# With a selection threshold of 1, the recorded output retains one feature.
selector = UnivariateFeatureSelector(
    outputCol="selected_features",
    featuresCol="features",
)
selector.setFeatureType("continuous")
selector.setLabelType("categorical")
selector.setSelectionThreshold(1)

# Fit and apply in one expression; the fitted model is not kept.
selected_df = selector.fit(df).transform(df)
selected_df.select("label", "selected_features").show(truncate=False)

+-----+-----------------+
|label|selected_features|
+-----+-----------------+
|1.0  |[1.0]            |
|0.0  |[2.0]            |
|0.0  |[3.0]            |
+-----+-----------------+

