In [1]:
# findspark has to run before the first pyspark import: it locates the Spark
# installation and makes the pyspark package importable.
import findspark

findspark.init()

from pyspark.sql import SparkSession

# Local session with two worker threads; getOrCreate() returns the already
# running session when the cell is executed again.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("Spark MLlib")
    .getOrCreate()
)
spark.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/25 08:24:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
from pyspark.ml.feature import Binarizer
from pyspark.ml import Transformer

# Sample single-column frame of doubles to binarize.
df = spark.createDataFrame([(0.5,), (0.1,), (0.6,), (0.9,), (0.0,), (0.7,), (1.2,)], ["values"])
binarizer = Binarizer(threshold=0.5, inputCol="values", outputCol="features")

df.show()
print(binarizer.explainParams())
# Binarizer is a Transformer. The check is printed explicitly because a bare
# expression in the middle of a cell is evaluated and then silently discarded.
print(isinstance(binarizer, Transformer))

# Finally, run the transformation: values greater than the threshold become
# 1.0, values equal to or below it become 0.0.
df_transformed = binarizer.transform(df)
df_transformed.show()

                                                                                

+------+
|values|
+------+
|   0.5|
|   0.1|
|   0.6|
|   0.9|
|   0.0|
|   0.7|
|   1.2|
+------+

inputCol: input column name. (current: values)
inputCols: input column names. (undefined)
outputCol: output column name. (default: Binarizer_8602502eb794__output, current: features)
outputCols: output column names. (undefined)
threshold: Param for threshold used to binarize continuous features. The features greater than the threshold will be binarized to 1.0. The features equal to or less than the threshold will be binarized to 0.0 (default: 0.0, current: 0.5)
thresholds: Param for array of threshold used to binarize continuous features. This is for multiple columns input. If transforming multiple columns and thresholds is not set, but threshold is set, then threshold will be applied across all columns. (undefined)
+------+--------+
|values|features|
+------+--------+
|   0.5|     0.0|
|   0.1|     0.0|
|   0.6|     1.0|
|   0.9|     1.0|
|   0.0|     0.0|
|   0.7|     1.0|
|   1.2|     

In [3]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer

# Example sentences (Polish), one per row, stored in a single string column.
sentences = [
    'Ala ma kota.',
    'Polacy nie gęsi i swój język mają.',
    'Co ma być to będzie...',
    'Pan Tadeusz, czyli ostatni zajazd na Litwie.',
]
df1 = spark.createDataFrame([(sentence,) for sentence in sentences], ['zdania'])

# truncate=False prints whole sentences instead of cutting long cells.
df1.show(truncate=False)

+--------------------------------------------+
|zdania                                      |
+--------------------------------------------+
|Ala ma kota.                                |
|Polacy nie gęsi i swój język mają.          |
|Co ma być to będzie...                      |
|Pan Tadeusz, czyli ostatni zajazd na Litwie.|
+--------------------------------------------+



In [4]:
# Tokenizer splits a string into tokens on whitespace. As the displayed result
# shows, tokens are also lower-cased and punctuation stays attached to words.
tokenizer = Tokenizer().setInputCol('zdania').setOutputCol('tokeny')
df1_transformed = tokenizer.transform(df1)
df1_transformed.show(truncate=False)

+--------------------------------------------+----------------------------------------------------+
|zdania                                      |tokeny                                              |
+--------------------------------------------+----------------------------------------------------+
|Ala ma kota.                                |[ala, ma, kota.]                                    |
|Polacy nie gęsi i swój język mają.          |[polacy, nie, gęsi, i, swój, język, mają.]          |
|Co ma być to będzie...                      |[co, ma, być, to, będzie...]                        |
|Pan Tadeusz, czyli ostatni zajazd na Litwie.|[pan, tadeusz,, czyli, ostatni, zajazd, na, litwie.]|
+--------------------------------------------+----------------------------------------------------+



In [5]:
# pobieramy przykładowy plik
!wget https://raw.githubusercontent.com/logpai/loghub/refs/heads/master/Apache/Apache_2k.log

--2024-12-25 08:26:32--  https://raw.githubusercontent.com/logpai/loghub/refs/heads/master/Apache/Apache_2k.log
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...
connected. to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... 
200 OKequest sent, awaiting response... 
Length: 171239 (167K) [text/plain]
Saving to: ‘Apache_2k.log’


2024-12-25 08:26:33 (3.20 MB/s) - ‘Apache_2k.log’ saved [171239/171239]



In [6]:
!head -5 Apache_2k.log

[Sun Dec 04 04:47:44 2005] [notice] workerEnv.init() ok /etc/httpd/conf/workers2.properties
[Sun Dec 04 04:47:44 2005] [error] mod_jk child workerEnv in error state 6
[Sun Dec 04 04:51:08 2005] [notice] jk2_init() Found child 6725 in scoreboard slot 10
[Sun Dec 04 04:51:09 2005] [notice] jk2_init() Found child 6726 in scoreboard slot 8
[Sun Dec 04 04:51:09 2005] [notice] jk2_init() Found child 6728 in scoreboard slot 6


In [7]:
# Each log line becomes one row in a single string column named 'value'.
df2 = spark.read.text('Apache_2k.log')
# Split every line on "] [", on "] " and on the leading "[", which leaves the
# timestamp, the level and the message as separate tokens.
# The pattern is a raw string so that \s and \[ reach the regex engine as
# written instead of being invalid Python string escapes (which newer Python
# versions warn about).
reg_tokenizer = RegexTokenizer(pattern=r']\s\[|]\s|^\[', inputCol='value', outputCol='tokens')
df_transformed = reg_tokenizer.transform(df2)
df_transformed.select(df_transformed.tokens).show(5, truncate=False)

+-------------------------------------------------------------------------------------------+
|tokens                                                                                     |
+-------------------------------------------------------------------------------------------+
|[sun dec 04 04:47:44 2005, notice, workerenv.init() ok /etc/httpd/conf/workers2.properties]|
|[sun dec 04 04:47:44 2005, error, mod_jk child workerenv in error state 6]                 |
|[sun dec 04 04:51:08 2005, notice, jk2_init() found child 6725 in scoreboard slot 10]      |
|[sun dec 04 04:51:09 2005, notice, jk2_init() found child 6726 in scoreboard slot 8]       |
|[sun dec 04 04:51:09 2005, notice, jk2_init() found child 6728 in scoreboard slot 6]       |
+-------------------------------------------------------------------------------------------+
only showing top 5 rows



In [8]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark import keyword_only
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param, Params, TypeConverters
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable

class CustomBinarizer(
    Transformer,
    HasInputCol,               # Sets up an inputCol parameter
    HasOutputCol,              # Sets up an outputCol parameter
    DefaultParamsReadable,     # Makes parameters readable from file
    DefaultParamsWritable      # Makes parameters writable from file
                     ):
    """Transformer that flags rows whose input value belongs to a given list.

    For every row, the value of ``inputCol`` is converted to its string form
    and compared with the entries of ``true_val``. The output column holds the
    integer 1 when the value is listed and 0 otherwise.

    Params:
        inputCol: name of the column to inspect (required).
        outputCol: name of the column that receives the 0/1 flag.
        true_val: list of values to be mapped to 1 (required).
    """

    true_val = Param(
        Params._dummy(),
        "true_val",
        "List of values to be mapped as logical 1.",
        typeConverter=TypeConverters.toList,
    )

    @keyword_only
    def __init__(self, inputCol=None, outputCol=None, true_val: list = None):
        """Create the transformer; only explicitly passed keywords are set."""
        super().__init__()
        self._setDefault(true_val=None)
        # keyword_only stores the keywords the caller actually passed.
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, inputCol=None, outputCol=None, true_val=None):
        """Set any subset of the params by keyword and return ``self``."""
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def setTrueVal(self, new_true_val):
        """Set the list of values that map to 1 and return ``self``."""
        return self.setParams(true_val=new_true_val)

    # Required if you use Spark >= 3.0
    def setInputCol(self, new_inputCol):
        """Set the input column name and return ``self``."""
        return self.setParams(inputCol=new_inputCol)

    # Required if you use Spark >= 3.0
    def setOutputCol(self, new_outputCol):
        """Set the output column name and return ``self``."""
        return self.setParams(outputCol=new_outputCol)

    def getTrueVal(self):
        """Return the configured ``true_val`` list (``None`` when not set)."""
        return self.getOrDefault(self.true_val)

    def _transform(self, df):
        """
        Add the output column: 1 when the input value is one of ``true_val``,
        0 otherwise.

        Raises:
            ValueError: if ``inputCol`` is not set, or ``true_val`` is not set
                (or was explicitly set to ``None``).
        """
        # Imported locally so the class does not depend on an import that the
        # surrounding cell does not provide.
        from pyspark.sql.types import IntegerType

        if not self.isSet("inputCol"):
            raise ValueError(
                "No input column set for the "
                "CustomBinarizer transformer."
            )
        # The default is None, so this covers both "never set" and an explicit
        # true_val=None, which would otherwise only fail inside the UDF.
        true_values = self.getTrueVal()
        if true_values is None:
            raise ValueError(
                "You must provide list of values to map as logical True."
            )

        # Resolve the lookup set once on the driver. The UDF then closes over a
        # small immutable set instead of the whole transformer, and does not
        # re-read the param for every row. Entries are compared in string form
        # because the row value is stringified as well.
        true_set = frozenset(str(value) for value in true_values)

        def binarize(val):
            """Return 1 when the stringified value is listed, else 0."""
            return 1 if str(val) in true_set else 0

        # The flag is an integer, so the UDF is declared with IntegerType;
        # declaring StringType would produce a string column of "1"/"0".
        binarize_udf = udf(binarize, IntegerType())
        return df.withColumn(self.getOutputCol(), binarize_udf(df[self.getInputCol()]))

In [9]:
# Sample of log levels, one per row, in a single string column 'log'.
log_levels = ['error', 'warning', 'notice', 'info', 'error']
df3 = spark.createDataFrame([(level,) for level in log_levels], ['log'])
df3.show(truncate=False)

+-------+
|log    |
+-------+
|error  |
|notice |
|info   |
|error  |
+-------+



In [10]:
# Flag 'error' and 'warning' rows in the new 'important' column; every other
# log level gets 0.
cust_binarizer = CustomBinarizer(
    inputCol='log',
    outputCol='important',
    true_val=['error', 'warning'],
)
df3_transformed = cust_binarizer.transform(df3)
df3_transformed.show()

                                                                                

+-------+---------+
|    log|important|
+-------+---------+
|  error|        1|
| notice|        0|
|   info|        0|
|  error|        1|
+-------+---------+



In [11]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.ml.linalg import Vectors


df3 = spark.createDataFrame([('error',), ('warning',), ('notice',), ('info',), ('error',)], ['log'])

# StringIndexer is an estimator: it replaces each categorical value with the
# index of that value in the list of labels built from the given column.
stringIndexer = StringIndexer(inputCol="log", outputCol="log_idx")

# fit() in turn returns a transformer (the fitted model). It gets its own name
# so the indexer model is not overwritten by the encoder model below.
indexer_model = stringIndexer.fit(df3)

indexed_df3 = indexer_model.transform(df3)
indexed_df3.show()

# now one-hot encoding can be applied
ohe = OneHotEncoder()
# params can also be set through dedicated setters instead of being passed to
# the constructor as kwargs
ohe.setInputCols(["log_idx"])
ohe.setOutputCols(["onehot_log"])

model = ohe.fit(indexed_df3)
transformed_df3 = model.transform(indexed_df3)
transformed_df3.show()

# The column type is a sparse vector (SparseVector).
# Example vector: (3,[0],[1.0])
# It reads: the vector has length 3, the second element lists the indices that
# hold values, and the last element lists the values stored at those indices.
# Only the last expression of a cell is displayed, so dtypes is printed
# explicitly instead of being evaluated and dropped.
print(transformed_df3.dtypes)
transformed_df3.select(transformed_df3.onehot_log).head()[0]

                                                                                

+-------+-------+
|    log|log_idx|
+-------+-------+
|  error|    0.0|
| notice|    2.0|
|   info|    1.0|
|  error|    0.0|
+-------+-------+

+-------+-------+-------------+
|    log|log_idx|   onehot_log|
+-------+-------+-------------+
|  error|    0.0|(3,[0],[1.0])|
| notice|    2.0|(3,[2],[1.0])|
|   info|    1.0|(3,[1],[1.0])|
|  error|    0.0|(3,[0],[1.0])|
+-------+-------+-------------+



SparseVector(3, {0: 1.0})

In [12]:
# pobieramy dane
!wget https://raw.githubusercontent.com/shorya1996/ML_Sklearn/refs/heads/master/Multiple%20Linear%20Regression/50_Startups.csv

--2024-12-25 08:28:44--  https://raw.githubusercontent.com/shorya1996/ML_Sklearn/refs/heads/master/Multiple%20Linear%20Regression/50_Startups.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...
connected. to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... 
200 OKequest sent, awaiting response... 
Length: 2386 (2.3K) [text/plain]
Saving to: ‘50_Startups.csv’


2024-12-25 08:28:44 (568 KB/s) - ‘50_Startups.csv’ saved [2386/2386]



In [13]:
!ls | grep *.csv

50_Startups.csv


In [14]:
# Read the csv using its first line as the header and letting Spark infer the
# column types from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('50_Startups.csv')
)

In [15]:
# Preview the loaded data (show() defaults spelled out: 20 rows, long cells truncated).
df.show(n=20, truncate=True)

+---------+--------------+---------------+----------+---------+
|R&D Spend|Administration|Marketing Spend|     State|   Profit|
+---------+--------------+---------------+----------+---------+
| 165349.2|      136897.8|       471784.1|  New York|192261.83|
| 162597.7|     151377.59|      443898.53|California|191792.06|
|153441.51|     101145.55|      407934.54|   Florida|191050.39|
|144372.41|     118671.85|      383199.62|  New York|182901.99|
|142107.34|      91391.77|      366168.42|   Florida|166187.94|
| 131876.9|      99814.71|      362861.36|  New York|156991.12|
|134615.46|     147198.87|      127716.82|California|156122.51|
|130298.13|     145530.06|      323876.68|   Florida| 155752.6|
|120542.52|     148718.95|      311613.29|  New York|152211.77|
|123334.88|     108679.17|      304981.62|California|149759.96|
|101913.08|     110594.11|      229160.95|   Florida|146121.95|
|100671.96|      91790.61|      249744.55|California| 144259.4|
| 93863.75|     127320.38|      249839.4

In [16]:
# Print the schema as a tree: column names, inferred types and nullability.
df.printSchema()

root
 |-- R&D Spend: double (nullable = true)
 |-- Administration: double (nullable = true)
 |-- Marketing Spend: double (nullable = true)
 |-- State: string (nullable = true)
 |-- Profit: double (nullable = true)



In [17]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder

# Replace the categorical State values with numeric indices.
indexer = StringIndexer().setInputCol('State').setOutputCol('State_numeric')
indexer_fitted = indexer.fit(df)
df_indexed = indexer_fitted.transform(df)

# One-hot encode the numeric index into a vector column.
encoder = OneHotEncoder().setInputCols(['State_numeric']).setOutputCols(['State_onehot'])
encoder_fitted = encoder.fit(df_indexed)
df_onehot = encoder_fitted.transform(df_indexed)

df_onehot.printSchema()

root
 |-- R&D Spend: double (nullable = true)
 |-- Administration: double (nullable = true)
 |-- Marketing Spend: double (nullable = true)
 |-- State: string (nullable = true)
 |-- Profit: double (nullable = true)
 |-- State_numeric: double (nullable = false)
 |-- State_onehot: vector (nullable = true)



In [18]:
from pyspark.ml.functions import vector_to_array

# vector_to_array turns the sparse vector into a plain array column.
# The column is referenced by its exact name, 'State_onehot': the lower-case
# spelling only resolves while Spark's column resolution is case-insensitive
# (spark.sql.caseSensitive=false, the default).
df_col_onehot = df_onehot.select('*', vector_to_array('State_onehot').alias('col_onehot'))
df_col_onehot.show()

+---------+--------------+---------------+----------+---------+-------------+-------------+----------+
|R&D Spend|Administration|Marketing Spend|     State|   Profit|State_numeric| State_onehot|col_onehot|
+---------+--------------+---------------+----------+---------+-------------+-------------+----------+
| 165349.2|      136897.8|       471784.1|  New York|192261.83|          1.0|(2,[1],[1.0])|[0.0, 1.0]|
| 162597.7|     151377.59|      443898.53|California|191792.06|          0.0|(2,[0],[1.0])|[1.0, 0.0]|
|153441.51|     101145.55|      407934.54|   Florida|191050.39|          2.0|    (2,[],[])|[0.0, 0.0]|
|144372.41|     118671.85|      383199.62|  New York|182901.99|          1.0|(2,[1],[1.0])|[0.0, 1.0]|
|142107.34|      91391.77|      366168.42|   Florida|166187.94|          2.0|    (2,[],[])|[0.0, 0.0]|
| 131876.9|      99814.71|      362861.36|  New York|156991.12|          1.0|(2,[1],[1.0])|[0.0, 1.0]|
|134615.46|     147198.87|      127716.82|California|156122.51|          

In [19]:
# Expand the one-hot array into so-called dummy columns: one new column per
# category, holding 1 when the row belongs to that category and 0 otherwise.

import pyspark.sql.functions as f

# The length of the array in the first row gives the number of dummy columns.
num_categories = len(df_col_onehot.first()['col_onehot'])

cols_expanded = []
for i in range(num_categories):
    # Each dummy column is named after the StringIndexer label at the same index.
    dummy_col = f.col('col_onehot')[i].alias(f'{indexer_fitted.labels[i]}')
    cols_expanded.append(dummy_col)

df_cols_onehot = df_col_onehot.select('*', *cols_expanded)
df_cols_onehot.show()

+---------+--------------+---------------+----------+---------+-------------+-------------+----------+----------+--------+
|R&D Spend|Administration|Marketing Spend|     State|   Profit|State_numeric| State_onehot|col_onehot|California|New York|
+---------+--------------+---------------+----------+---------+-------------+-------------+----------+----------+--------+
| 165349.2|      136897.8|       471784.1|  New York|192261.83|          1.0|(2,[1],[1.0])|[0.0, 1.0]|       0.0|     1.0|
| 162597.7|     151377.59|      443898.53|California|191792.06|          0.0|(2,[0],[1.0])|[1.0, 0.0]|       1.0|     0.0|
|153441.51|     101145.55|      407934.54|   Florida|191050.39|          2.0|    (2,[],[])|[0.0, 0.0]|       0.0|     0.0|
|144372.41|     118671.85|      383199.62|  New York|182901.99|          1.0|(2,[1],[1.0])|[0.0, 1.0]|       0.0|     1.0|
|142107.34|      91391.77|      366168.42|   Florida|166187.94|          2.0|    (2,[],[])|[0.0, 0.0]|       0.0|     0.0|
| 131876.9|     

In [20]:
# Keep only the columns needed in the final frame: the numeric inputs, the
# dummy columns and the target, which comes last.
final_columns = ["R&D Spend", "Administration", "Marketing Spend", "California", "New York", "profit"]
df_final = df_cols_onehot.select(*final_columns)

In [21]:
# Preview the first ten rows of the final frame.
df_final.show(n=10)

+---------+--------------+---------------+----------+--------+---------+
|R&D Spend|Administration|Marketing Spend|California|New York|   profit|
+---------+--------------+---------------+----------+--------+---------+
| 165349.2|      136897.8|       471784.1|       0.0|     1.0|192261.83|
| 162597.7|     151377.59|      443898.53|       1.0|     0.0|191792.06|
|153441.51|     101145.55|      407934.54|       0.0|     0.0|191050.39|
|144372.41|     118671.85|      383199.62|       0.0|     1.0|182901.99|
|142107.34|      91391.77|      366168.42|       0.0|     0.0|166187.94|
| 131876.9|      99814.71|      362861.36|       0.0|     1.0|156991.12|
|134615.46|     147198.87|      127716.82|       1.0|     0.0|156122.51|
|130298.13|     145530.06|      323876.68|       0.0|     0.0| 155752.6|
|120542.52|     148718.95|      311613.29|       0.0|     1.0|152211.77|
|123334.88|     108679.17|      304981.62|       1.0|     0.0|149759.96|
+---------+--------------+---------------+---------

In [22]:
from pyspark.ml.feature import VectorAssembler

# VectorAssembler is a transformer that packs the given feature columns into a
# single vector column. Every column except the last one is used as a feature.
feature_columns = df_final.columns[:-1]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

# Keep only the assembled features and the target.
data_set = assembler.transform(df_final).select('features', 'profit')
data_set.show()

+--------------------+---------+
|            features|   profit|
+--------------------+---------+
|[165349.2,136897....|192261.83|
|[162597.7,151377....|191792.06|
|[153441.51,101145...|191050.39|
|[144372.41,118671...|182901.99|
|[142107.34,91391....|166187.94|
|[131876.9,99814.7...|156991.12|
|[134615.46,147198...|156122.51|
|[130298.13,145530...| 155752.6|
|[120542.52,148718...|152211.77|
|[123334.88,108679...|149759.96|
|[101913.08,110594...|146121.95|
|[100671.96,91790....| 144259.4|
|[93863.75,127320....|141585.52|
|[91992.39,135495....|134307.35|
|[119943.24,156547...|132602.65|
|[114523.61,122616...|129917.04|
|[78013.11,121597....|126992.93|
|[94657.16,145077....|125370.37|
|[91749.16,114175....| 124266.9|
|[86419.7,153514.1...|122776.86|
+--------------------+---------+
only showing top 20 rows



In [23]:
from pyspark.ml.regression import LinearRegression

# Train/test split. The fixed seed makes the split, and with it the metrics
# printed below, repeatable from run to run; without it every execution
# evaluates on a different handful of rows.
train_data, test_data = data_set.randomSplit([0.8, 0.2], seed=42)

# Initialise the regression model, fit it on the training part and evaluate
# it on the held-out part.
lr = LinearRegression(featuresCol="features", labelCol='profit', regParam=0.1)
lrModel = lr.fit(train_data)
test_stats = lrModel.evaluate(test_data)

# Print the results.
print(f"RMSE: {test_stats.rootMeanSquaredError}")
print(f"R2: {test_stats.r2}")
print(f"MSE: {test_stats.meanSquaredError}")

24/12/25 08:30:46 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/12/25 08:30:46 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
24/12/25 08:30:46 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


RMSE: 13137.552791165203
R2: 0.9099494822056451
MSE: 172595293.34065259
