In [1]:
pip list

Package                       Version
----------------------------- ---------
alembic                       1.8.0
altair                        4.2.0
anyio                         3.6.1
argon2-cffi                   21.3.0
argon2-cffi-bindings          21.2.0
asttokens                     2.0.5
async-generator               1.10
attrs                         21.4.0
Babel                         2.10.3
backcall                      0.2.0
backports.functools-lru-cache 1.6.4
beautifulsoup4                4.11.1
bleach                        5.0.1
blinker                       1.4
bokeh                         2.4.3
Bottleneck                    1.3.5
brotlipy                      0.7.0
cached-property               1.5.2
certifi                       2022.6.15
certipy                       0.1.3
cffi                          1.15.1
charset-normalizer            2.1.0
click                         8.1.3
cloudpickle                   2.1.0
colorama                      0.4.5
conda          

In [1]:
from pyspark.sql import SparkSession

# Spark session & context
spark = SparkSession \
  .builder \
  .appName("multi key test") \
  .master("local[2]") \
  .getOrCreate()

sc= spark.sparkContext

In [2]:
# Sum of the first 100 Whole numbers
rdd = sc.parallelize(range(100000000 +1))
rdd.sum()

5000000050000000

In [3]:
import numpy as np
import pandas as pd

# load the boston data set
from sklearn.datasets import load_boston
boston = load_boston()

# convert to a Pandas Data Frame
boston_pd = pd.DataFrame(data= np.c_[boston['data'],boston['target']], 
              columns= np.append(boston['feature_names'], 'target')).sample(frac=1)
print(boston_pd.shape)
boston_pd.head(5)

(506, 14)



    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np

        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_ho

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
128,0.32543,0.0,21.89,0.0,0.624,6.431,98.8,1.8125,4.0,437.0,21.2,396.9,15.39,18.0
320,0.1676,0.0,7.38,0.0,0.493,6.426,52.3,4.5404,5.0,287.0,19.6,396.9,7.2,23.8
187,0.07875,45.0,3.44,0.0,0.437,6.782,41.1,3.7886,5.0,398.0,15.2,393.87,6.68,32.0
197,0.04666,80.0,1.52,0.0,0.404,7.107,36.6,7.309,2.0,329.0,12.6,354.31,8.61,30.3
315,0.25356,0.0,9.9,0.0,0.544,5.705,77.7,3.945,4.0,304.0,18.4,396.42,11.5,16.2


In [4]:
from sklearn.linear_model import LinearRegression
from scipy.stats import pearsonr

# split into data and label arrays 
y = boston_pd['target']
X = boston_pd.drop(['target'], axis=1)

# create training (~80%) and test data sets
X_train = X[:400]
X_test = X[400:]
y_train = y[:400]
y_test = y[400:]

# train a classifier 
lr = LinearRegression()
model = lr.fit(X_train, y_train)

# make predictions
y_pred = model.predict(X_test)

# error metrics
r = pearsonr(y_pred, y_test)
mae = sum(abs(y_pred - y_test))/len(y_test)
print("R-sqaured: " + str(r[0]**2))
print("MAE: " + str(mae))

R-sqaured: 0.7600288746579493
MAE: 3.073334194582059


In [5]:
from pyspark.ml.feature import VectorAssembler

# convert to a Spark data frame
boston_sp = spark.createDataFrame(boston_pd)
display(boston_sp.take(5))

# split into training and test spark data frames
boston_train = spark.createDataFrame(boston_pd[:400])
boston_test = spark.createDataFrame(boston_pd[400:])

# convert to vector representation for MLlib
assembler = VectorAssembler(inputCols= boston_train.schema.names[:(boston_pd.shape[1] - 1)],  
                                                                        outputCol="features" )
boston_train = assembler.transform(boston_train).select('features', 'target') 
boston_test = assembler.transform(boston_test).select('features', 'target') 

display(boston_train.take(5))

[Row(CRIM=0.32543, ZN=0.0, INDUS=21.89, CHAS=0.0, NOX=0.624, RM=6.431, AGE=98.8, DIS=1.8125, RAD=4.0, TAX=437.0, PTRATIO=21.2, B=396.9, LSTAT=15.39, target=18.0),
 Row(CRIM=0.1676, ZN=0.0, INDUS=7.38, CHAS=0.0, NOX=0.493, RM=6.426, AGE=52.3, DIS=4.5404, RAD=5.0, TAX=287.0, PTRATIO=19.6, B=396.9, LSTAT=7.2, target=23.8),
 Row(CRIM=0.07875, ZN=45.0, INDUS=3.44, CHAS=0.0, NOX=0.437, RM=6.782, AGE=41.1, DIS=3.7886, RAD=5.0, TAX=398.0, PTRATIO=15.2, B=393.87, LSTAT=6.68, target=32.0),
 Row(CRIM=0.04666, ZN=80.0, INDUS=1.52, CHAS=0.0, NOX=0.404, RM=7.107, AGE=36.6, DIS=7.309, RAD=2.0, TAX=329.0, PTRATIO=12.6, B=354.31, LSTAT=8.61, target=30.3),
 Row(CRIM=0.25356, ZN=0.0, INDUS=9.9, CHAS=0.0, NOX=0.544, RM=5.705, AGE=77.7, DIS=3.945, RAD=4.0, TAX=304.0, PTRATIO=18.4, B=396.42, LSTAT=11.5, target=16.2)]

[Row(features=DenseVector([0.3254, 0.0, 21.89, 0.0, 0.624, 6.431, 98.8, 1.8125, 4.0, 437.0, 21.2, 396.9, 15.39]), target=18.0),
 Row(features=DenseVector([0.1676, 0.0, 7.38, 0.0, 0.493, 6.426, 52.3, 4.5404, 5.0, 287.0, 19.6, 396.9, 7.2]), target=23.8),
 Row(features=DenseVector([0.0788, 45.0, 3.44, 0.0, 0.437, 6.782, 41.1, 3.7886, 5.0, 398.0, 15.2, 393.87, 6.68]), target=32.0),
 Row(features=DenseVector([0.0467, 80.0, 1.52, 0.0, 0.404, 7.107, 36.6, 7.309, 2.0, 329.0, 12.6, 354.31, 8.61]), target=30.3),
 Row(features=DenseVector([0.2536, 0.0, 9.9, 0.0, 0.544, 5.705, 77.7, 3.945, 4.0, 304.0, 18.4, 396.42, 11.5]), target=16.2)]

In [6]:
# linear regresion with Spark
from pyspark.ml.regression import LinearRegression

# linear regression 
lr = LinearRegression(maxIter=10, regParam=0.1, 
                      elasticNetParam=0.5, labelCol="target")

# Fit the model
model = lr.fit(boston_train)
boston_pred = model.transform(boston_test)

# calculate results 
r = boston_pred.stat.corr("prediction", "target")
print("R-sqaured: " + str(r**2))

R-sqaured: 0.7461414801327885


In [7]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator

crossval = CrossValidator(estimator=LinearRegression(labelCol = "target"),  
                         estimatorParamMaps=ParamGridBuilder().addGrid(
                           LinearRegression.elasticNetParam, [0, 0.5, 1.0]).build(),
                         evaluator=RegressionEvaluator(
                           labelCol = "target", metricName = "r2"),
                         numFolds=10)

# cross validate the model and select the best fit
cvModel = crossval.fit(boston_train) 
model = cvModel.bestModel

# calculate results 
boston_pred = model.transform(boston_test)
r = boston_pred.stat.corr("prediction", "target")
print("R-sqaured: " + str(r**2))

R-sqaured: 0.7600288746579517


In [8]:
# sklearn version 
from sklearn.ensemble import RandomForestRegressor as RFR
from multiprocessing.pool import ThreadPool

# allow up to 5 concurrent threads
pool = ThreadPool(5)

# hyperparameters to test out (n_trees)
parameters = [ 10, 20, 50]

# define a function to train a RF model and return metrics 
def sklearn_random_forest(trees, X_train, X_test, y_train, y_test):

    # train a random forest regressor with the specified number of trees
    rf= RFR(n_estimators = trees)
    model = rf.fit(X_train, y_train)

    # make predictions
    y_pred = model.predict(X_test)
    r = pearsonr(y_pred, y_test)

    # return the number of trees, and the R value 
    return [trees, r[0]**2]  

# run the tasks 
pool.map(lambda trees: sklearn_random_forest(trees, X_train,
                                           X_test, y_train, y_test), parameters)

[[10, 0.786279363847181], [20, 0.8170691225800597], [50, 0.7997018115080141]]

In [9]:
# spark version
from pyspark.ml.regression import RandomForestRegressor

# define a function to train a RF model and return metrics 
def mllib_random_forest(trees, boston_train, boston_test):

    # train a random forest regressor with the specified number of trees
    rf = RandomForestRegressor(numTrees = trees, labelCol="target")
    model = rf.fit(boston_train)

    # make predictions
    boston_pred = model.transform(boston_test)
    r = boston_pred.stat.corr("prediction", "target")

    # return the number of trees, and the R value 
    return [trees, r**2]
  
# run the tasks 
pool.map(lambda trees: mllib_random_forest(trees, boston_train, boston_test), parameters)

[[10, 0.842436670938509], [20, 0.8993035953885388], [50, 0.8723947471442912]]

In [10]:
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import *

# setup the spark data frame as a table
boston_sp.createOrReplaceTempView("boston")

# add train/test label and expand the data set by 3x (each num trees parameter)
full_df = spark.sql("""
  select *
  from (
    select *, case when rand() < 0.8 then 1 else 0 end as training 
    from boston
  ) b
  cross join (
      select 11 as trees union all select 20 as trees union all select 50 as trees)
""")

schema = StructType([StructField('trees', LongType(), True),
                     StructField('r_squared', DoubleType(), True)])  

#@pandas_udf(schema, PandasUDFType.GROUPED_MAP)
def train_RF(boston_pd):
    trees = boston_pd['trees'].unique()[0]

    # get the train and test groups 
    boston_train = boston_pd[boston_pd['training'] == 1]
    boston_test = boston_pd[boston_pd['training'] == 0] 
        
    # create data and label groups 
    y_train = boston_train['target']
    X_train = boston_train.drop(['target'], axis=1)
    y_test = boston_test['target']
    X_test = boston_test.drop(['target'], axis=1)
   
    # train a classifier 
    rf= RFR(n_estimators = trees)
    model = rf.fit(X_train, y_train)

    # make predictions
    y_pred = model.predict(X_test)
    r = pearsonr(y_pred, y_test)
    
    # return the number of trees, and the R value 
    return pd.DataFrame({'trees': trees, 'r_squared': (r[0]**2)}, index=[0])
  
# use the Pandas UDF
# results = full_df.groupby('trees').apply(train_RF)

# print the results 
# print(results.take(3))

full_df.groupby('trees').applyInPandas(
    train_RF, schema="trees long, r_squared double").show()  

+-----+------------------+
|trees|         r_squared|
+-----+------------------+
|   11| 0.915390018129497|
|   20|0.9059665054820668|
|   50| 0.912715138678595|
+-----+------------------+



In [11]:
import pandas as pd  
from pyspark.sql.functions import pandas_udf, ceil
df = spark.createDataFrame(
    [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)],
    ("id", "v"))  
def normalize(pdf):
    v = pdf.v
    return pdf.assign(v=(v - v.mean()) / v.std())
df.groupby("id").applyInPandas(
    normalize, schema="id long, v double").show() 

+---+-------------------+
| id|                  v|
+---+-------------------+
|  1|-0.7071067811865475|
|  1| 0.7071067811865475|
|  2|-0.8320502943378437|
|  2|-0.2773500981126146|
|  2| 1.1094003924504583|
+---+-------------------+

