In [1]:
from pyspark.sql import SparkSession,Row
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler, StringIndexer,OneHotEncoder,RFormula
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

In [2]:
# Create the Spark session used by every later cell, or reuse one that is
# already running.
spark = (
    SparkSession.builder
    .appName('my-spark-ML')
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/11 21:46:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Read sf-airbnb.csv: the first row supplies the column names and the column
# types are inferred from the data.
df = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('/Users/xwyang/Desktop/data/sf-airbnb.csv')
)

                                                                                

In [4]:
# Persist the CSV-derived DataFrame as parquet, replacing any previous copy.
df.write.mode('overwrite').parquet('/Users/xwyang/Desktop/parquet_dir')

24/08/11 21:46:40 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [5]:
# Read back the parquet directory written in the previous cell.
dw = spark.read.format('parquet').load('/Users/xwyang/Desktop/parquet_dir')

In [6]:
# Load the 'myspace' parquet dataset; this is the DataFrame that is later
# split into train/test sets.
dff = spark.read.load('/Users/xwyang/Desktop/myspace', format='parquet')

In [8]:
# List the tables and views currently visible to this session.
tables_df = spark.sql('show tables')
tables_df.show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
+---------+---------+-----------+



In [9]:
# Register the 'myspace' parquet directory as the temp view `sf_tbl` so it can
# be queried with SQL.
create_view_sql = """ create or replace temp view sf_tbl using parquet options(path '/Users/xwyang/Desktop/myspace') """
spark.sql(create_view_sql)

DataFrame[]

In [10]:
# Get a DataFrame handle on the `sf_tbl` temp view.
dd = spark.read.table('sf_tbl')

In [11]:
# Define a second temp view, `sf_tbl_1`, as a straight SELECT * over `sf_tbl`.
copy_view_sql = """ create or replace temp view sf_tbl_1 as select * from sf_tbl """
spark.sql(copy_view_sql)

DataFrame[]

In [12]:
# Get a DataFrame handle on the `sf_tbl_1` temp view.
data = spark.table('sf_tbl_1')

In [13]:
# Restore the previously saved, already-fitted pipeline from disk.
savePipe = PipelineModel.read().load('/Users/xwyang/Desktop/sf-airbnb')

                                                                                

In [14]:
# 80/20 train/test split of `dff`, seeded so the split can be reproduced.
split_weights = [0.8, 0.2]
trainDF, testDF = dff.randomSplit(split_weights, seed=42)

In [15]:
# Run the loaded pipeline over the held-out split; the evaluator cells below
# read the 'prediction' column from this result.
predDF = savePipe.transform(testDF)

In [16]:
# Evaluator that scores the 'prediction' column against the 'price' label
# using root-mean-squared error.
evaluation = RegressionEvaluator(
    metricName='rmse',
    labelCol='price',
    predictionCol='prediction',
)

In [17]:
# Score the pipeline's test-set predictions with the evaluator defined above
# (constructed with metricName='rmse'). NOTE: the variable name `rsme` is a
# misspelling of "rmse"; it is kept because a later cell displays it by name.
rsme = evaluation.evaluate(predDF)

                                                                                

In [19]:
# Display the metric value returned by the evaluator for the pipeline's predictions.
rsme

218.01859667614502

In [27]:
# Baseline model: predict the training-set mean price for every test row and
# compute its RMSE, giving a reference point for the pipeline's RMSE.
mean_value = trainDF.agg(avg('price')).first()[0]
baseline_df = testDF.withColumn('new_pred', lit(mean_value))

# Separate evaluator because the baseline prediction lives in 'new_pred'.
evaluation_1 = RegressionEvaluator(
    metricName='rmse',
    labelCol='price',
    predictionCol='new_pred',
)
rmse = evaluation_1.evaluate(baseline_df)
rmse

240.70934302168342

In [28]:
# Compute R^2 for the pipeline's predictions without mutating the shared
# evaluator. Calling evaluation.setMetricName("r2") changes `evaluation` in
# place, so re-running the earlier RMSE cell afterwards would silently report
# R^2 instead. Passing a param override makes evaluate() work on a copy and
# leaves `evaluation` configured for RMSE.
# NOTE: the name `rsem_r2` is kept as-is; the value it holds is R^2, not RMSE.
rsem_r2 = evaluation.evaluate(predDF, {evaluation.metricName: "r2"})
rsem_r2

0.17969345201487363

Below is a list of commonly used machine learning models and feature transformers in PySpark (`pyspark.ml`), each with its import statement and the key parameters you would typically set when creating it.

## Classification Models

1. **Logistic Regression**
   ```python
   from pyspark.ml.classification import LogisticRegression

   lr = LogisticRegression(
       featuresCol='features',
       labelCol='label',
       maxIter=10,
       regParam=0.3,
       elasticNetParam=0.8
   )
   ```

2. **Decision Tree Classifier**
   ```python
   from pyspark.ml.classification import DecisionTreeClassifier

   dt = DecisionTreeClassifier(
       featuresCol='features',
       labelCol='label',
       maxDepth=5,
       impurity='gini'
   )
   ```

3. **Random Forest Classifier**
   ```python
   from pyspark.ml.classification import RandomForestClassifier

   rf = RandomForestClassifier(
       featuresCol='features',
       labelCol='label',
       numTrees=20,
       maxDepth=5
   )
   ```

4. **Gradient-Boosted Tree Classifier**
   ```python
   from pyspark.ml.classification import GBTClassifier

   gbt = GBTClassifier(
       featuresCol='features',
       labelCol='label',
       maxIter=10,
       maxDepth=5
   )
   ```

5. **Naive Bayes**
   ```python
   from pyspark.ml.classification import NaiveBayes

   nb = NaiveBayes(
       featuresCol='features',
       labelCol='label',
       smoothing=1.0,
       modelType='multinomial'
   )
   ```

6. **Multilayer Perceptron Classifier (Neural Network)**
   ```python
   from pyspark.ml.classification import MultilayerPerceptronClassifier

   mlp = MultilayerPerceptronClassifier(
       featuresCol='features',
       labelCol='label',
       layers=[4, 5, 4, 3],  # Layer sizes from input to output: 4 inputs, two hidden layers (5 and 4 units), 3 outputs
       maxIter=100
   )
   ```

7. **Linear Support Vector Machine (SVM)**
   ```python
   from pyspark.ml.classification import LinearSVC

   lsvc = LinearSVC(
       featuresCol='features',
       labelCol='label',
       maxIter=10,
       regParam=0.1
   )
   ```

## Regression Models

1. **Linear Regression**
   ```python
   from pyspark.ml.regression import LinearRegression

   lr = LinearRegression(
       featuresCol='features',
       labelCol='label',
       maxIter=10,
       regParam=0.3,
       elasticNetParam=0.8
   )
   ```

2. **Decision Tree Regressor**
   ```python
   from pyspark.ml.regression import DecisionTreeRegressor

   dtr = DecisionTreeRegressor(
       featuresCol='features',
       labelCol='label',
       maxDepth=5,
       impurity='variance'
   )
   ```

3. **Random Forest Regressor**
   ```python
   from pyspark.ml.regression import RandomForestRegressor

   rfr = RandomForestRegressor(
       featuresCol='features',
       labelCol='label',
       numTrees=20,
       maxDepth=5
   )
   ```

4. **Gradient-Boosted Tree Regressor**
   ```python
   from pyspark.ml.regression import GBTRegressor

   gbtr = GBTRegressor(
       featuresCol='features',
       labelCol='label',
       maxIter=10,
       maxDepth=5
   )
   ```

5. **Isotonic Regression**
   ```python
   from pyspark.ml.regression import IsotonicRegression

   ir = IsotonicRegression(
       featuresCol='features',
       labelCol='label'
   )
   ```

6. **Generalized Linear Regression**
   ```python
   from pyspark.ml.regression import GeneralizedLinearRegression

   glr = GeneralizedLinearRegression(
       featuresCol='features',
       labelCol='label',
       family='gaussian',
       link='identity',
       maxIter=10,
       regParam=0.3
   )
   ```

## Clustering Models

1. **K-Means Clustering**
   ```python
   from pyspark.ml.clustering import KMeans

   kmeans = KMeans(
       featuresCol='features',
       k=3,
       maxIter=20,
       seed=1
   )
   ```

2. **Gaussian Mixture Model (GMM)**
   ```python
   from pyspark.ml.clustering import GaussianMixture

   gmm = GaussianMixture(
       featuresCol='features',
       k=2,
       maxIter=100
   )
   ```

3. **Bisecting K-Means**
   ```python
   from pyspark.ml.clustering import BisectingKMeans

   bkm = BisectingKMeans(
       featuresCol='features',
       k=2,
       maxIter=20
   )
   ```

4. **Latent Dirichlet Allocation (LDA)**
   ```python
   from pyspark.ml.clustering import LDA

   lda = LDA(
       featuresCol='features',
       k=3,
       maxIter=10
   )
   ```

## Recommendation Models

1. **Alternating Least Squares (ALS) for Collaborative Filtering**
   ```python
   from pyspark.ml.recommendation import ALS

   als = ALS(
       userCol='userId',
       itemCol='movieId',
       ratingCol='rating',
       maxIter=10,
       regParam=0.01,
       coldStartStrategy='drop'
   )
   ```

## Feature Extraction and Transformation Models

1. **Principal Component Analysis (PCA)**
   ```python
   from pyspark.ml.feature import PCA

   pca = PCA(
       k=3,
       inputCol='features',
       outputCol='pcaFeatures'
   )
   ```

2. **Word2Vec**
   ```python
   from pyspark.ml.feature import Word2Vec

   # NOTE: the input column must contain tokenized text (an array of strings
   # per row), not a raw string column; tokenize first if needed.
   word2vec = Word2Vec(
       vectorSize=100,
       inputCol='text',
       outputCol='result'
   )
   ```

3. **Chi-Squared Selector**
   ```python
   from pyspark.ml.feature import ChiSqSelector

   # NOTE: ChiSqSelector is deprecated since Spark 3.1.0; on newer versions
   # UnivariateFeatureSelector is the documented replacement.
   selector = ChiSqSelector(
       numTopFeatures=50,
       featuresCol='features',
       labelCol='label',
       outputCol='selectedFeatures'
   )
   ```

4. **Standard Scaler**
   ```python
   from pyspark.ml.feature import StandardScaler

   scaler = StandardScaler(
       inputCol='features',
       outputCol='scaledFeatures',
       withMean=True,
       withStd=True
   )
   ```

5. **MinMax Scaler**
   ```python
   from pyspark.ml.feature import MinMaxScaler

   scaler = MinMaxScaler(
       inputCol='features',
       outputCol='scaledFeatures'
   )
   ```

6. **MaxAbs Scaler**
   ```python
   from pyspark.ml.feature import MaxAbsScaler

   scaler = MaxAbsScaler(
       inputCol='features',
       outputCol='scaledFeatures'
   )
   ```

7. **Bucketizer**
   ```python
   from pyspark.ml.feature import Bucketizer

   # Bucketizer bins a single numeric column. It does not accept the assembled
   # vector column that 'features' denotes in the other examples, so a plain
   # numeric column name is used here.
   bucketizer = Bucketizer(
       splits=[-float('inf'), 0.0, float('inf')],  # two buckets: below 0, and 0 or above
       inputCol='value',
       outputCol='bucketedValue'
   )
   ```

8. **Quantile Discretizer**
   ```python
   from pyspark.ml.feature import QuantileDiscretizer

   # QuantileDiscretizer bins one continuous numeric column into quantile-based
   # buckets. It does not accept the assembled vector column that 'features'
   # denotes in the other examples, so a plain numeric column name is used here.
   discretizer = QuantileDiscretizer(
       numBuckets=3,
       inputCol='value',
       outputCol='bucketedValue'
   )
   ```

9. **Polynomial Expansion**
   ```python
   from pyspark.ml.feature import PolynomialExpansion

   polyExpansion = PolynomialExpansion(
       degree=2,
       inputCol='features',
       outputCol='polyFeatures'
   )
   ```

### Summary
For each model, import the corresponding class and initialize it with parameters that suit your data and problem. The values shown above are illustrative starting points rather than recommendations; adjust them to control aspects such as model complexity, regularization strength, and the maximum number of iterations. Choosing these parameters well is often the key to tuning and optimizing your model's performance.