In [1]:
# import findspark
# findspark.init()

import pyspark # only run after findspark.init()
from pyspark.sql import SparkSession
# May take awhile locally
spark = SparkSession.builder.appName("Review2").getOrCreate()

cores = spark._jsc.sc().getExecutorMemoryStatus().keySet().size()
print("You are working with", cores, "core(s)")
spark

You are working with 1 core(s)


In [5]:
# Small dataframe for quick testing if you need it
df = spark.createDataFrame([(3,69,57,56,678,345),(3,67,56,58,678,345),(3,67,54,57,678,345),(3,68,55,58,678,345),(3,68,53,52,678,345)
                           ,(2,11,10,907,16,458),(2,12,14,909,12,456),(2,11,13,910,10,459),(2,12,11,905,16,459),(2,10,13,902,10,459)
                           ,(1,30,11,123,568,891),(1,32,12,124,567,890),(1,34,10,123,566,895),(1,35,15,121,564,894),(1,30,12,124,560,896)], 
                           ['flower_type', 'sepal_len','sepal_width','R','G','B'])

## Simple Customer Segment data for clustering

path = "Datasets/Mall_Customers.csv"

**Content**

You are owing a supermarket mall and through membership cards, you have some basic data about your customers like Customer ID, age, gender, annual income and spending score. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data.

Problem Statement You own the mall and want to understand the customers like who can be easily converge [Target Customers] so that the sense can be given to marketing team and plan the strategy accordingly.

**Source:** https://www.kaggle.com/vjchoudhary7/customer-segmentation-tutorial-in-python

## Let's read our dataset in for this notebook 

### Success of Bank Telemarketing

### Input variables:

1. Age (numeric)
2. Job: career type (categorical: 'admin.', 'blue-collar', 'entrepreneur', 'housemaid', 'management', 'retired', 'self-employed', 'services', 'student', 'technician', 'unemployed')
3. Marital_Status: marital status (categorical: 'divorced', 'married', 'single'; note: 'divorced' means divorced or widowed)
4. Education: (categorical: 'Primary_Education', 'Professional_Education', 'Secondary_Education', 'Tertiary_Education')
5. Default_Credit: has a credit in default? (binary: 'yes', 'no')
6. Housing_Loan: has a home loan? (binary: 'yes', 'no')
7. Personal_Loan: has a personal loan? (binary: 'yes', 'no')

### Output variable (desired target):

- Subscribed - has the client subscribed a long-term deposit? (binary: 'yes', 'no')

### Source
https://www.kaggle.com/raosuny/success-of-bank-telemarketing-data

In [None]:
path =""
df = spark.read.csv(path+'telemarket_success.csv',inferSchema=True,header=True)

## MinMaxScaler

See if this can used to correct negative values for input for Naive Bayes classifier

In [4]:
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.linalg import Vectors

dataFrame = spark.createDataFrame([
    (0, Vectors.dense([1.0, 0.1, -1.0]),),
    (1, Vectors.dense([2.0, 1.1, 1.0]),),
    (2, Vectors.dense([3.0, 10.1, 3.0]),)
], ["id", "features"])

scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures",min=0,max=100)

# Compute summary statistics and generate MinMaxScalerModel
scalerModel = scaler.fit(dataFrame)

# rescale each feature to range [min, max].
scaledData = scalerModel.transform(dataFrame)
print("Features scaled to range: [%f, %f]" % (scaler.getMin(), scaler.getMax()))
scaledData.select("features", "scaledFeatures").show()

Features scaled to range: [0.000000, 100.000000]
+--------------+-------------------+
|      features|     scaledFeatures|
+--------------+-------------------+
|[1.0,0.1,-1.0]|      [0.0,0.0,0.0]|
| [2.0,1.1,1.0]|   [50.0,10.0,50.0]|
|[3.0,10.1,3.0]|[100.0,100.0,100.0]|
+--------------+-------------------+



In [5]:
scalerModel.isDistributed()

AttributeError: 'MinMaxScalerModel' object has no attribute 'isDistributed'

## MLflow test drive

**Code Source:** https://towardsdatascience.com/complete-data-science-project-template-with-mlflow-for-non-dummies-d082165559eb

In [9]:
import os
import mlflow
import mlflow.spark
# from project.utility.mlflow import log_artifacts_minio

train = spark.createDataFrame([
    (0, Vectors.dense([1.0, 0.1, -1.0]),),
    (1, Vectors.dense([2.0, 1.1, 1.0]),),
    (2, Vectors.dense([3.0, 10.1, 3.0]),)
], ["id", "features"])

experiment = 'test1'
mlflow.set_tracking_uri(os.environ['MLFLOW_TRACKING_URI'])
mlflow.set_experiment(experiment)

degree = 3
with mlflow.start_run() as run:
    feature_pipeline = features.train_new_feature_pipeline(train, degree)
    mlflow.log_param("degree", degree)
    mlflow.spark.save_model(
        feature_pipeline,
        'feature_pipeline',
        sample_input=train.select(
            'features'
        )
    )
#     log_artifacts_minio(run, 'feature_pipeline','feature_pipeline', True)

#         mlflow.log_param("alpha", alpha)
#         mlflow.log_param("l1_ratio", l1_ratio)
#         mlflow.log_metric("rmse", rmse)
#         mlflow.log_metric("r2", r2)
#         mlflow.log_metric("mae", mae)

#         mlflow.sklearn.log_model(lr, "model")
    run_id = run.info.run_id
    print(run.info)

KeyError: 'MLFLOW_TRACKING_URI'

In [12]:
import os
import warnings
import sys

from pyspark.ml.classification import *
from pyspark.ml.evaluation import *

# import pandas as pd
import numpy as np
# from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import ElasticNet

import mlflow
# import mlflow.sklearn


# def eval_metrics(actual, pred):
#     rmse = np.sqrt(mean_squared_error(actual, pred))
#     mae = mean_absolute_error(actual, pred)
#     r2 = r2_score(actual, pred)
#     return rmse, mae, r2



if __name__ == "__main__":
    warnings.filterwarnings("ignore")
    np.random.seed(40)

    # Read the wine-quality csv file (make sure you're running this from the root of MLflow!)
#     wine_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "wine-quality.csv")
#     data = pd.read_csv(wine_path)
    data = spark.createDataFrame([
    (0, Vectors.dense([1.0, 0.1, -1.0]),),
    (1, Vectors.dense([2.0, 1.1, 1.0]),),
    (1, Vectors.dense([2.0, 1.1, 1.0]),),
    (2, Vectors.dense([2.0, 1.1, 1.0]),),
    (2, Vectors.dense([3.0, 10.1, 3.0]),)
], ["label", "features"])

    # Split the data into training and test sets. (0.75, 0.25) split.
#     train, test = train_test_split(data)
    train,test = data.randomSplit([0.7,0.3])

    # The predicted column is "quality" which is a scalar from [3, 9]
#     train_x = train.drop(["quality"], axis=1)
#     test_x = test.drop(["quality"], axis=1)
#     train_y = train[["quality"]]
#     test_y = test[["quality"]]

#     alpha = float(sys.argv[1]) if len(sys.argv) > 1 else 0.5
#     l1_ratio = float(sys.argv[2]) if len(sys.argv) > 2 else 0.5

    with mlflow.start_run():
#         lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
#         lr.fit(train_x, train_y)
        classifier = LogisticRegression()
        fitModel = classifier.fit(train)

#         predicted_qualities = lr.predict(test_x)
        
        # Evaluate
        predictionAndLabels = fitModel.evaluate(test)
        predictionAndLabels = predictionAndLabels.predictions.select('label','prediction')

        predictions = fitModel.transform(test)
        MC_evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
        accuracy = (MC_evaluator.evaluate(predictions))*100
        print(accuracy)

#         (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)

#         print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio))
#         print("  RMSE: %s" % rmse)
#         print("  MAE: %s" % mae)
#         print("  R2: %s" % r2)

#         mlflow.log_param("alpha", alpha)
#         mlflow.log_param("l1_ratio", l1_ratio)
#         mlflow.log_metric("rmse", rmse)
#         mlflow.log_metric("r2", r2)
#         mlflow.log_metric("mae", mae)
        mlflow.log_metric("accuracy", accuracy)

#         mlflow.spark.log_model(lr, "model")
        mlflow.spark.log_model(fitModel, "model")

nan


## Does PySpark Accept user input?

In [5]:
val = input("Enter your value: ") 
print(val) 

Enter your value: yes
yes


## For testing for binary classification

In [None]:
final_data = spark.read.format("libsvm").load("C:/spark-2.3.3-bin-hadoop2.7/data/mllib/sample_libsvm_data.txt")
train,test = final_data.randomSplit([0.7,0.3])

# Cool Dataset for Clustering

This case requires to develop a customer segmentation to define marketing strategy. The sample Dataset summarizes the usage behavior of about 9000 active credit card holders during the last 6 months. The file is at a customer level with 18 behavioral variables.

Following is the Data Dictionary for Credit Card dataset :-

CUST_ID : Identification of Credit Card holder (Categorical) BALANCE : Balance amount left in their account to make purchases ( BALANCE_FREQUENCY : How frequently the Balance is updated, score between 0 and 1 (1 = frequently updated, 0 = not frequently updated) PURCHASES : Amount of purchases made from account ONEOFF_PURCHASES : Maximum purchase amount done in one-go INSTALLMENTS_PURCHASES : Amount of purchase done in installment CASH_ADVANCE : Cash in advance given by the user PURCHASES_FREQUENCY : How frequently the Purchases are being made, score between 0 and 1 (1 = frequently purchased, 0 = not frequently purchased) ONEOFFPURCHASESFREQUENCY : How frequently Purchases are happening in one-go (1 = frequently purchased, 0 = not frequently purchased) PURCHASESINSTALLMENTSFREQUENCY : How frequently purchases in installments are being done (1 = frequently done, 0 = not frequently done) CASHADVANCEFREQUENCY : How frequently the cash in advance being paid CASHADVANCETRX : Number of Transactions made with "Cash in Advanced" PURCHASES_TRX : Numbe of purchase transactions made CREDIT_LIMIT : Limit of Credit Card for user PAYMENTS : Amount of Payment done by user MINIMUM_PAYMENTS : Minimum amount of payments made by user PRCFULLPAYMENT : Percent of full payment paid by user TENURE : Tenure of credit card service for user

**Source:** https://www.kaggle.com/arjunbhasin2013/ccdata

In [None]:
path =""
df = spark.read.csv(path+'credit_card_clustering.csv',inferSchema=True,header=True)

## Another cool clustering dataset (SEC)

#### Context

This dataset is a playground for fundamental and technical analysis. It is said that 30% of traffic on stocks is already generated by machines, can trading be fully automated? If not, there is still a lot to learn from historical data.

#### Content

Dataset consists of following files:

prices.csv: raw, as-is daily prices. Most of data spans from 2010 to the end 2016, for companies new on stock market date range is shorter. There have been approx. 140 stock splits in that time, this set doesn't account for that.
prices-split-adjusted.csv: same as prices, but there have been added adjustments for splits.
securities.csv: general description of each company with division on sectors
fundamentals.csv: metrics extracted from annual SEC 10K fillings (2012-2016), should be enough to derive most of popular fundamental indicators.

**Source:** https://www.kaggle.com/dgawlik/nyse

In [None]:
path ="nyse"
df = spark.read.csv(path+'fundamentals.csv',inferSchema=True,header=True)

# Evaluating kurtosis

In [19]:
from pyspark.sql import functions as f
val1_df = spark.createDataFrame([(1,1),(1,2),(1,3),(2,60000000000000),(1,3),(1,2),(3,1)],['id','value'])
val1_df.agg(f.kurtosis("value")).show()
# val1_df.show()

+-------------------+
|    kurtosis(value)|
+-------------------+
|-1.5000000000000002|
+-------------------+



In [20]:
# Colby Ford
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
# Create initial LinearRegression model
lr = LinearRegression(labelCol="label", featuresCol="features")
# Create ParamGrid for Cross Validation
lrparamGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.001, 0.01, 0.1, 0.5, 1.0, 2.0])
#  .addGrid(lr.regParam, [0.01, 0.1, 0.5])
             .addGrid(lr.elasticNetParam, [0.0, 0.25, 0.5, 0.75, 1.0])
#  .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
             .addGrid(lr.maxIter, [1, 5, 10, 20, 50])
#  .addGrid(lr.maxIter, [1, 5, 10])
             .build())
# Evaluate model
lrevaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label", metricName="rmse")
# Create 5-fold CrossValidator
lrcv = CrossValidator(estimator = lr,
estimatorParamMaps = lrparamGrid,
evaluator = lrevaluator,
numFolds = 5)
# Run cross validations
lrcvModel = lrcv.fit(train)
print(lrcvModel)
# Get Model Summary Statistics
lrcvSummary = lrcvModel.bestModel.summary
print("Coefficient Standard Errors: " + str(lrcvSummary.coefficientStandardErrors))
print("P Values: " + str(lrcvSummary.pValues)) # Last element is the intercept
# Use test set here so we can measure the accuracy of our model on new data
lrpredictions = lrcvModel.transform(test)
# cvModel uses the best model found from the Cross Validation
# Evaluate best model
print('RMSE:', lrevaluator.evaluate(lrpredictions))

NameError: name 'train' is not defined

In [50]:
#problem solving
from pyspark.ml.classification import *

classifiers = [
                LogisticRegression()
               ,OneVsRest()
               ,LinearSVC()
#                ,NaiveBayes()
#                ,RandomForestClassifier()
#                ,GBTClassifier()
#                ,DecisionTreeClassifier()
#                ,MultilayerPerceptronClassifier()
              ] 

#set up your results2 table
columns = ['Classifier', 'Result']
vals = [("LogisticRegression","23.8"),('OneVsRest',"34.5"),('LinearSVC','76.3')]
results = spark.createDataFrame(vals, columns)

for y in range(0,3):
    #set up your results2 table
    columns = ['Classifier', 'Result']
    vals = [("Place Holder","N/A")]
    results2 = spark.createDataFrame(vals, columns)
    for classifier in classifiers:
        score = ['52.3']
        # Intstantiate Model
        M = classifier
        # Learn what it is
        Mtype = type(M).__name__
        Mtype = [Mtype]
        new_result = spark.createDataFrame(zip(Mtype,score), schema=columns)
#         print("NewResult")
#         new_result.show()
        results2 = results2.union(new_result)
        print("Results2")
        results2.show()
    print("Join with results")
    print("Replicate DF")
results = results.join(results2, ["Classifier"],"left")
print("Results3")
results.show()
#     results2 = results2.where("Classifier!='Place Holder'")
# results = results.join(results2, ["Classifier"],"inner")

results3.show(100,False)

Results2
+------------------+------+
|        Classifier|Result|
+------------------+------+
|      Place Holder|   N/A|
|LogisticRegression|  52.3|
+------------------+------+

Results2
+------------------+------+
|        Classifier|Result|
+------------------+------+
|      Place Holder|   N/A|
|LogisticRegression|  52.3|
|         OneVsRest|  52.3|
+------------------+------+

Results2
+------------------+------+
|        Classifier|Result|
+------------------+------+
|      Place Holder|   N/A|
|LogisticRegression|  52.3|
|         OneVsRest|  52.3|
|         LinearSVC|  52.3|
+------------------+------+

Replicate DF
Results2
+------------------+------+
|        Classifier|Result|
+------------------+------+
|      Place Holder|   N/A|
|LogisticRegression|  52.3|
+------------------+------+

Results2
+------------------+------+
|        Classifier|Result|
+------------------+------+
|      Place Holder|   N/A|
|LogisticRegression|  52.3|
|         OneVsRest|  52.3|
+-------------

In [46]:
for classifier in classifiers:
    score = ['52.3']
    # Intstantiate Model
    M = classifier
    # Learn what it is
    Mtype = type(M).__name__
    Mtype = [Mtype]
    new_result = spark.createDataFrame(zip(Mtype,score), schema=columns)
#     print("Results2")
#     new_result.show()
    results2 = results2.union(new_result)
results2.show()

+------------------+------+
|        Classifier|Result|
+------------------+------+
|      Place Holder|   N/A|
|         LinearSVC|  52.3|
|         LinearSVC|  52.3|
|         LinearSVC|  52.3|
|LogisticRegression|  52.3|
|         OneVsRest|  52.3|
|         LinearSVC|  52.3|
+------------------+------+



In [None]:
from pyspark.ml.feature import ChiSqSelector
selector = ChiSqSelector(numTopFeatures=5, outputCol="selectedFeatures")
model = selector.fit(final_data)
print(model.selectedFeatures)
test = model.transform(final_data)
test = test.select("label","selectedFeatures")
test2 = test.withColumnRenamed("selectedFeatures","features")

In [71]:
#Feature importances for feature selection in regression
# To be used for all decision tree analysis
import numpy
from numpy import allclose
from pyspark.ml.linalg import Vectors
from pyspark.ml.regression import *
from pyspark.ml.evaluation import *
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# PREPARE DATA
df = spark.createDataFrame([
    (1.1, Vectors.dense([1.0, 0.1, -1.0]),),
    (2.4, Vectors.dense([2.0, 1.1, 1.0]),),
    (5.7, Vectors.dense([3.0, 10.1, 3.0]),)
], ["label", "features"])

#Evaluator
revaluator = RegressionEvaluator(metricName="rmse")

#regressor
regressor = GBTRegressor()

#Now train with cross val
paramGrid = (ParamGridBuilder() \
               .addGrid(regressor.maxDepth, [2, 5, 10])
               .addGrid(regressor.maxBins, [5, 10, 20])
             .build())

#Cross Validator requires all of the following parameters:
crossval = CrossValidator(estimator=regressor,
                          estimatorParamMaps=paramGrid,
                          evaluator=revaluator,
                          numFolds=2) # 3 is best practice

# Run cross validations
fitModel = crossval.fit(df)

# Get Best Model
ModelSummary = fitModel.bestModel

# FEATURE IMPORTANCES
# Estimate of the importance of each feature.
# Each feature’s importance is the average of its importance across all trees 
# in the ensemble The importance vector is normalized to sum to 1. 
print(ModelSummary.featureImportances)

(3,[0],[1.0])


In [147]:
import numpy
from numpy import allclose
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import RandomForestClassifier

# PREPARE DATA
df = spark.createDataFrame([
    (1.0, Vectors.dense([1.0, 0.1, -1.0]),),
    (2.0, Vectors.dense([2.0, 1.1, 1.0]),),
    (3.0, Vectors.dense([3.0, 10.1, 3.0]),)
], ["label", "features"])

# BUILD THE MODEL
rf = RandomForestClassifier()

evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

#Now train with cross val
paramGrid = (ParamGridBuilder() \
               .addGrid(regressor.maxDepth, [2, 5, 10])
               .addGrid(regressor.maxBins, [5, 10, 20])
             .build())

#Cross Validator requires all of the following parameters:
crossval = CrossValidator(estimator=rf,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=2) # 3 is best practice

# Fit model using crossval
model = crossval.fit(df)

# Get Best Model
BestModel = model.bestModel

# FEATURE IMPORTANCES
print(BestModel.featureImportances)

(3,[0,2],[0.6936974789915966,0.3063025210084034])


In [163]:
feature_imp_array = BestModel.featureImportances.toArray()
best_n_features = feature_imp_array.argsort()[-1:][::-1]
best_n_features= best_n_features.tolist()

In [170]:
Mtype = "Regresson"
def f():
    global Mtype_featureimp
    Mtype_featureimp = "inside function"
    print(Mtype_featureimp)
    
f()

print(Mtype_featureimp)

inside function
inside function


In [171]:
from pyspark.ml.classification import LogisticRegression

# Load training data
df = spark.createDataFrame([
    (1.0, Vectors.dense([1.0, 0.1, -1.0]),),
    (2.0, Vectors.dense([2.0, 1.1, 1.0]),),
    (4.0, Vectors.dense([2.0, 1.1, 1.0]),),
    (3.0, Vectors.dense([3.0, 10.1, 3.0]),)
], ["label", "features"])

lr = LogisticRegression()

# Fit the model
lrModel = lr.fit(df)

In [172]:
# Print the coefficients and intercept for multinomial logistic regression
print("Coefficients: \n" + str(lrModel.coefficientMatrix))
print("Intercept: " + str(lrModel.interceptVector))

Coefficients: 
DenseMatrix([[ -7.41554638,  -0.31549257,  -0.78537276],
             [ -0.37571401,  -0.78437003, -10.87159487],
             [  5.46619968,  -1.07892466,   2.85277696],
             [ -3.13863114,   3.25191417,   5.9536392 ],
             [  5.46369185,  -1.0731269 ,   2.85055146]])
Intercept: [-3.152207020523129,9.635285314580297,0.03224977160900043,-6.548441471338726,0.033113405672556884]


In [173]:
summary = lrModel.summary

In [175]:
lrModel.coefficientMatrix.toArray()

array([[ -7.41554638,  -0.31549257,  -0.78537276],
       [ -0.37571401,  -0.78437003, -10.87159487],
       [  5.46619968,  -1.07892466,   2.85277696],
       [ -3.13863114,   3.25191417,   5.9536392 ],
       [  5.46369185,  -1.0731269 ,   2.85055146]])

In [176]:
from pyspark.ml.classification import *

# Load training data
df = spark.createDataFrame([
    (1.0, Vectors.dense([1.0, 0.1, -1.0]),),
    (2.0, Vectors.dense([2.0, 1.1, 1.0]),),
    (4.0, Vectors.dense([2.0, 1.1, 1.0]),),
    (3.0, Vectors.dense([3.0, 10.1, 3.0]),)
], ["label", "features"])

# instantiate the base classifier.
lr = LogisticRegression()
# instantiate the One Vs Rest Classifier.
OVRclassifier = OneVsRest(classifier=lr)
# Add parameters of your choice here:
paramGrid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.1, 0.01]) \
    .build()
#Cross Validator requires the following parameters:
crossval = CrossValidator(estimator=OVRclassifier,
                          estimatorParamMaps=paramGrid,
                          evaluator=MulticlassClassificationEvaluator(),
                          numFolds=2) # 3 is best practice
# Run cross-validation, and choose the best set of parameters.
fitModel = crossval.fit(df)

# Get Best Model
BestModel = fitModel.bestModel

In [139]:
model1 = BestModel.models[0]

In [140]:
BestModel.models

[LogisticRegressionModel: uid = LogisticRegression_84191fe3c3a4, numClasses = 1, numFeatures = 3,
 LogisticRegressionModel: uid = LogisticRegression_84191fe3c3a4, numClasses = 2, numFeatures = 3,
 LogisticRegressionModel: uid = LogisticRegression_84191fe3c3a4, numClasses = 2, numFeatures = 3,
 LogisticRegressionModel: uid = LogisticRegression_84191fe3c3a4, numClasses = 2, numFeatures = 3,
 LogisticRegressionModel: uid = LogisticRegression_84191fe3c3a4, numClasses = 2, numFeatures = 3]

In [126]:
type(model1)

pyspark.ml.classification.LogisticRegressionModel

In [128]:
print(model1.coefficients)

[0.0,0.0,0.0]


In [137]:
# Get Best Model
# BestModel = fitModel.bestModel

models = BestModel.models
for model in models:
    print('Intercept: ',model.intercept,'\nCoefficients:',model.coefficients)

Intercept:  -inf 
Coefficients: [0.0,0.0,0.0]
Intercept:  1.38998477082603 
Coefficients: [-1.2212950975133143,-0.04629705307010703,-0.6106473460416055]
Intercept:  -1.3090562599004436 
Coefficients: [0.25037290796442024,-0.14245373637849076,0.12518660440773915]
Intercept:  -4.570404108638901 
Coefficients: [0.8352178836198739,0.21341794704688208,0.41760894947769095]
Intercept:  -1.3090562599004434 
Coefficients: [0.25037290796442,-0.14245373637849076,0.1251866044077393]


In [131]:
[x.intercept for x in model.models]

AttributeError: 'LogisticRegressionModel' object has no attribute 'models'

In [177]:
BestModel.transform(Vectors.dense([1.0, 0.1, -1.0]))

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

# Databricks help session

*Feb. 6, 2020*

#### Can the DFPrep function distribute?
Yes, according to Spark UI

#### Can any function that is not a UDF distribute?
Yes, but check the Spark UI to make sure

#### Can For Loops distribute?
SAA

#### Do DB have any other sessions learned about pythonic things that won't distrubte in PySpark?



In [3]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

sentenceData = spark.createDataFrame([
    (0.0, "Hi I heard about Spark"),
    (0.0, "I wish Java could use case classes"),
    (1.0, "Logistic regression models are neat")
], ["label", "sentence"])

tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
wordsData = tokenizer.transform(sentenceData)
print("Tokenized:")
print(wordsData.show(1,False))

hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(wordsData)
print("HashingTF")
print(featurizedData.show(1,False))
# alternatively, CountVectorizer can also be used to get term frequency vectors

idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
print("IDF")
print(rescaledData.show(1,False))

# rescaledData.select("label", "features").show()

Tokenized:
+-----+----------------------+----------------------------+
|label|sentence              |words                       |
+-----+----------------------+----------------------------+
|0.0  |Hi I heard about Spark|[hi, i, heard, about, spark]|
+-----+----------------------+----------------------------+
only showing top 1 row

None
HashingTF
+-----+----------------------+----------------------------+---------------------------------+
|label|sentence              |words                       |rawFeatures                      |
+-----+----------------------+----------------------------+---------------------------------+
|0.0  |Hi I heard about Spark|[hi, i, heard, about, spark]|(20,[0,5,9,17],[1.0,1.0,1.0,2.0])|
+-----+----------------------+----------------------------+---------------------------------+
only showing top 1 row

None
IDF
+-----+----------------------+----------------------------+---------------------------------+------------------------------------------------------

In [4]:
from pyspark.ml.feature import Word2Vec

# Input data: Each row is a bag of words from a sentence or document.
documentDF = spark.createDataFrame([
    ("Hi I heard about Spark".split(" "), ),
    ("I wish Java could use case classes".split(" "), ),
    ("Logistic regression models are neat".split(" "), )
], ["text"])

# Learn a mapping from words to Vectors.
word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="result")
model = word2Vec.fit(documentDF)

result = model.transform(documentDF)
for row in result.collect():
    text, vector = row
    print("Text: [%s] => \nVector: %s\n" % (", ".join(text), str(vector)))

Text: [Hi, I, heard, about, Spark] => 
Vector: [0.04547937586903572,-0.0025138825178146365,0.0006965592503547669]

Text: [I, wish, Java, could, use, case, classes] => 
Vector: [-0.0015820334000246864,-0.001963341608643532,0.009649708451304051]

Text: [Logistic, regression, models, are, neat] => 
Vector: [-0.04485992938280106,-0.04581879526376725,0.0262237292714417]



In [5]:
documentDF.printSchema()

root
 |-- text: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [8]:
documentDF.show(3,False)

+------------------------------------------+
|text                                      |
+------------------------------------------+
|[Hi, I, heard, about, Spark]              |
|[I, wish, Java, could, use, case, classes]|
|[Logistic, regression, models, are, neat] |
+------------------------------------------+



In [9]:
from pyspark.ml.feature import NGram

wordDataFrame = spark.createDataFrame([
    (0, ["Hi", "I", "heard", "about", "Spark"]),
    (1, ["I", "wish", "Java", "could", "use", "case", "classes"]),
    (2, ["Logistic", "regression", "models", "are", "neat"])
], ["id", "words"])

ngram = NGram(n=2, inputCol="words", outputCol="ngrams")

ngramDataFrame = ngram.transform(wordDataFrame)
ngramDataFrame.select("ngrams").show(truncate=False)

+------------------------------------------------------------------+
|ngrams                                                            |
+------------------------------------------------------------------+
|[Hi I, I heard, heard about, about Spark]                         |
|[I wish, wish Java, Java could, could use, use case, case classes]|
|[Logistic regression, regression models, models are, are neat]    |
+------------------------------------------------------------------+



In [28]:
wordDataFrame.name = 'Whatevs'

print(wordDataFrame.name)

Whatevs


In [None]:
#use this to create an array... kind of pivot
from pyspark.sql.functions import collect_set
transactions = df.groupBy('seller_id').agg(collect_set('product_id').alias('items'))
transactions.show(5,False)

In [3]:
from pyspark.ml.fpm import PrefixSpan

df = spark.createDataFrame([
    (0, [[1, 2, 5,1],[3]]),
    (1, [[1, 2, 3, 5],[4]]),
    (2, [[1, 2],[5]])
], ["id", "sequence"])

prefixSpan = PrefixSpan(minSupport=0.5, maxPatternLength=5,
                        maxLocalProjDBSize=32000000)

# Find frequent sequential patterns.
prefixSpan.findFrequentSequentialPatterns(df).show()

+-----------+----+
|   sequence|freq|
+-----------+----+
|      [[3]]|   2|
|      [[2]]|   3|
|      [[5]]|   3|
|      [[1]]|   3|
|   [[2, 5]]|   2|
|   [[1, 5]]|   2|
|   [[1, 2]]|   3|
|[[1, 2, 5]]|   2|
+-----------+----+



In [None]:
df = spark.createDataFrame([
    (0, [1, 2, 5]),
    (1, [1, 2, 3, 5]),
    (2, [1, 2])
], ["id", "items"])

df.printSchema()
df.show()

from pyspark.ml.fpm import FPGrowth
fpGrowth = FPGrowth(itemsCol="items", minSupport=0.2, minConfidence=0.1)
model = fpGrowth.fit(df)

In [2]:
# What data type does a tokenizer return?
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

sentenceDataFrame = spark.createDataFrame([
    (0, "Hi I heard about Spark"),
    (1, "I wish Java could use case classes"),
    (2, "Logistic,regression,models,are,neat")
], ["id", "sentence"])

tokenizer = Tokenizer(inputCol="sentence", outputCol="words")

regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W")
# alternatively, pattern="\\w+", gaps(False)

countTokens = udf(lambda words: len(words), IntegerType())

tokenized = tokenizer.transform(sentenceDataFrame)
tokenized.select("sentence", "words")\
    .withColumn("tokens", countTokens(col("words"))).show(truncate=False)

regexTokenized = regexTokenizer.transform(sentenceDataFrame)
regexTokenized.select("sentence", "words") \
    .withColumn("tokens", countTokens(col("words"))).show(truncate=False)

+-----------------------------------+------------------------------------------+------+
|sentence                           |words                                     |tokens|
+-----------------------------------+------------------------------------------+------+
|Hi I heard about Spark             |[hi, i, heard, about, spark]              |5     |
|I wish Java could use case classes |[i, wish, java, could, use, case, classes]|7     |
|Logistic,regression,models,are,neat|[logistic,regression,models,are,neat]     |1     |
+-----------------------------------+------------------------------------------+------+

+-----------------------------------+------------------------------------------+------+
|sentence                           |words                                     |tokens|
+-----------------------------------+------------------------------------------+------+
|Hi I heard about Spark             |[hi, i, heard, about, spark]              |5     |
|I wish Java could use case cla

In [3]:
regexTokenized.printSchema()

root
 |-- id: long (nullable = true)
 |-- sentence: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [4]:
from pyspark.ml.feature import OneHotEncoderEstimator

df = spark.createDataFrame([
    (0.0, 1.0),
    (1.0, 0.0),
    (2.0, 1.0),
    (0.0, 2.0),
    (0.0, 1.0),
    (2.0, 0.0)
], ["categoryIndex1", "categoryIndex2"])

encoder = OneHotEncoderEstimator(inputCols=["categoryIndex1", "categoryIndex2"],
                                 outputCols=["categoryVec1", "categoryVec2"])
model = encoder.fit(df)
encoded = model.transform(df)
encoded.show()

+--------------+--------------+-------------+-------------+
|categoryIndex1|categoryIndex2| categoryVec1| categoryVec2|
+--------------+--------------+-------------+-------------+
|           0.0|           1.0|(2,[0],[1.0])|(2,[1],[1.0])|
|           1.0|           0.0|(2,[1],[1.0])|(2,[0],[1.0])|
|           2.0|           1.0|    (2,[],[])|(2,[1],[1.0])|
|           0.0|           2.0|(2,[0],[1.0])|    (2,[],[])|
|           0.0|           1.0|(2,[0],[1.0])|(2,[1],[1.0])|
|           2.0|           0.0|    (2,[],[])|(2,[0],[1.0])|
+--------------+--------------+-------------+-------------+

