<a href="https://colab.research.google.com/github/OliverRevilla/BigData_Pyspark/blob/main/ML_Pyspark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Classification**

## **Data Preparation**

**Drop Columns**


In [None]:
# Discard the columns that are not wanted; every other column of `df` is kept.
unwanted_columns = ['maker', 'model']
cars = df.drop(*unwanted_columns)

**Filtering out missing data**

In [None]:
# How many missing values are there in the cylinders column?
# (printed explicitly: only the last expression of a cell is displayed)
print(cars.filter('cyl IS NULL').count())

# Drop records with missing values in the cylinders column
cars = cars.filter('cyl IS NOT NULL')

# Drop records with missing values in any column.
# dropna() defaults to how='any' over all columns; `how` only accepts the
# lowercase values 'any' or 'all', so the previous how='All' raised an error.
cars = cars.dropna()

**Mutating columns**

In [None]:
# NOTE: this import shadows Python's built-in round() for the rest of the notebook.
from pyspark.sql.functions import round

# Add a 'mass' column: 'weight' divided by 2.205, rounded to zero decimals
mass_column = round(cars.weight/2.205, 0)
cars = cars.withColumn('mass', mass_column)
#-------------------------------------------------------
# Convert 'mile' to 'km' and drop 'mile' column
km_column = round(flights['mile']*1.60934, 0)
flights_km = flights.withColumn('km', km_column).drop('mile')

# Create 'label' column indicating whether flight delayed (1) or not (0)
is_delayed = flights_km['delay'] >= 15
flights_km = flights_km.withColumn('label', is_delayed.cast('integer'))

# Check first five records
flights_km.show(5)

**Indexing categorical data**

In [None]:
from pyspark.ml.feature import StringIndexer

# Classification: assign an index value to each distinct string in 'type'.
type_indexer = StringIndexer(inputCol='type', outputCol='type_idx')

# Fitting identifies all the values of the column and assigns one index to each
type_indexer_model = type_indexer.fit(cars)

# The fitted model then adds a new column holding the index of every row
cars = type_indexer_model.transform(cars)

# If the indexes need a specific ordering, use the stringOrderType parameter
# -----------------------------------------------------------------------------------
# Create an indexer for the 'carrier' column
indexer = StringIndexer(inputCol='carrier', outputCol='carrier_idx')

# Indexer identifies categories in the data
indexer_model = indexer.fit(flights)

# Indexer creates a new column with numeric index values
flights_indexed = indexer_model.transform(flights)

# Repeat the process for the other categorical feature ('org')
org_indexer = StringIndexer(inputCol='org', outputCol='org_idx')
org_indexer_model = org_indexer.fit(flights_indexed)
flights_indexed = org_indexer_model.transform(flights_indexed)



**Assembling columns**

In [None]:
from pyspark.ml.feature import VectorAssembler

# The assembler joins all predictors of the model into a single 'features' column
assembler = VectorAssembler(inputCols = ['cyl','size'], outputCol = 'features')

# Keep the result: without the assignment the 'features' column is thrown away
# and the train/test split made later from `cars` would not contain it.
cars = assembler.transform(cars)
cars

In [None]:
# ---------------------------------------------------------------------------------------
# Remove the 'flight' column
flights_drop_column = flights.drop('flight')

# Number of records with missing 'delay' values
missing_delay = flights_drop_column.filter('delay IS NULL')
missing_delay.count()

# Remove records with missing 'delay' values
flights_valid_delay = flights_drop_column.filter('delay IS NOT NULL')

# Remove records with missing values in any column and get the number of remaining rows
flights_none_missing = flights_valid_delay.dropna()
remaining_rows = flights_none_missing.count()
print(remaining_rows)
# ----------------------------------------------------------------------------------------
# Import the necessary class
from pyspark.ml.feature import VectorAssembler

# Predictor columns to be merged into one vector column
predictor_columns = [
    'mon','dom','dow','carrier_idx','org_idx','km','depart','duration'
]

# Create an assembler object
assembler = VectorAssembler(inputCols=predictor_columns, outputCol='features')

# Consolidate predictor columns
# NOTE(review): assumes `flights` already carries 'carrier_idx', 'org_idx' and 'km' — confirm
flights_assembled = assembler.transform(flights)

# Check the resulting column
features_and_delay = flights_assembled.select('features', 'delay')
features_and_delay.show(5, truncate=False)

## **Decision Tree Model**

**Train/Test Split**

In [None]:
# Both examples use the same 80:20 train/test weights
split_weights = [0.8,0.2]

# A fixed seed makes the random split reproducible
cars_split = cars.randomSplit(split_weights, seed = 23)
cars_train, cars_test = cars_split
#------------------------------------------------------------------------
# Split the flights data into training and testing sets in an 80:20 ratio
flights_split = flights.randomSplit(split_weights, seed = 17)
flights_train, flights_test = flights_split

# Check that training set has around 80% of records
n_train = flights_train.count()
n_total = flights.count()
training_ratio = n_train / n_total
print(training_ratio)

**Build a Decision Tree model**

In [None]:
# Import the Decision Tree Classifier class (needed by both examples below)
from pyspark.ml.classification import DecisionTreeClassifier

# Cars example: build the classifier, learn from the training split,
# then predict on the testing split
model_algorithm = DecisionTreeClassifier()
tree_model = model_algorithm.fit(cars_train)
prediction = tree_model.transform(cars_test)
#------------------------------------------------------------
# Flights example: create a classifier object and fit to the training data
tree = DecisionTreeClassifier()
tree_model = tree.fit(flights_train)

# Create predictions for the testing data and take a look at the predictions
prediction = tree_model.transform(flights_test)
prediction_preview = prediction.select('label', 'prediction', 'probability')
prediction_preview.show(5, False)

**Confusion matrix**

In [None]:
# A confusion matrix is a table describing the performance of a model on testing data
confusion_counts = prediction.groupBy('label','prediction').count()
confusion_counts.show()
#-------------------------------------------------------------------
# Create a confusion matrix
confusion_counts.show()

# Calculate the elements of the confusion matrix
true_negatives = prediction.filter('prediction = 0 AND label = prediction')
true_positives = prediction.filter('prediction = 1 AND label = prediction')
false_negatives = prediction.filter('prediction = 0 AND label <> prediction')
false_positives = prediction.filter('prediction = 1 AND label <> prediction')
TN = true_negatives.count()
TP = true_positives.count()
FN = false_negatives.count()
FP = false_positives.count()

# Accuracy measures the proportion of correct predictions
n_correct = TN + TP
n_predictions = TN + TP + FN + FP
accuracy = n_correct/n_predictions
print(accuracy)

## **Logistic Regression Model**

In [None]:
# Import the logistic regression class (needed by both examples below)
from pyspark.ml.classification import LogisticRegression

# Cars example: create a Logistic Regression classifier
cars_classifier = LogisticRegression()

# Learn from the training data
logistic = cars_classifier.fit(cars_train)

# Predictions for the testing data
prediction = logistic.transform(cars_test)
#--------------------------------------------------------------
# Flights example: create a classifier object and train on training data
flights_classifier = LogisticRegression()
logistic = flights_classifier.fit(flights_train)

# Create predictions for the testing data and show confusion matrix
prediction = logistic.transform(flights_test)
flights_confusion = prediction.groupBy('label', 'prediction').count()
flights_confusion.show()



In [None]:
# Precision and recall, derived from the confusion-matrix counts (TP, FP, FN)
# Precision: proportion of positive predictions that are correct
precision = TP/(TP + FP)
# Recall: proportion of actual positives that were predicted as positive
recall = TP/(TP + FN)
# Printed explicitly because only the last expression of a cell is displayed
print('precision =', precision)
print('recall    =', recall)

# Weighted metrics
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator()
# The param-map key is the evaluator's own `metricName` Param (attribute access
# with a dot; the previous comma made this line a syntax error).
evaluator.evaluate(prediction, {evaluator.metricName: 'weightedPrecision'})

## **Turning Text Into Tables**

In [None]:
# Removing punctuation
from pyspark.sql.functions import regexp_replace

# Regular expression (REGEX) to match commas and hyphens
REGEX = '[,\\-]'

books = books.withColumn('text',regexp_replace(books.text,REGEX,''))

# Text to tokens
from pyspark.ml.feature import Tokenizer
books = Tokenizer(inputCol = 'text',outputCol = "tokens").transform(books)

# Stop words: inspect the default stop-word list
from pyspark.ml.feature import StopWordsRemover
stopwords = StopWordsRemover()
stopwords.getStopWords()

# Removing stop words
# Specify the input and output column names
stopwords = stopwords.setInputCol('tokens').setOutputCol('words')
books = stopwords.transform(books)

# Feature hashing
from pyspark.ml.feature import HashingTF
hasher = HashingTF(inputCol = 'words',outputCol = 'hash',numFeatures = 32)
books = hasher.transform(books)

# Dealing with common words
from pyspark.ml.feature import IDF
books = IDF(inputCol = 'hash', outputCol = 'features').fit(books).transform(books)

#--------------------------------------------------------------------------------
# Import the necessary functions
from pyspark.sql.functions import regexp_replace
from pyspark.ml.feature import Tokenizer

# Remove punctuation (REGEX provided) and numbers
wrangled = sms.withColumn('text', regexp_replace(sms.text, '[_():;,.!?\\-]', ' '))
wrangled = wrangled.withColumn('text', regexp_replace(wrangled.text,'[0-9]', ' '))

# Merge multiple spaces
wrangled = wrangled.withColumn('text', regexp_replace(wrangled.text, ' +', ' '))

# Split the text into words
wrangled = Tokenizer(inputCol='text', outputCol='words').transform(wrangled)

wrangled.show(4, truncate=False)

from pyspark.ml.feature import StopWordsRemover,HashingTF,IDF

# Remove stop words.
# The input is `wrangled`, the frame that carries the 'words' column produced by
# the Tokenizer above; transforming the raw `sms` frame would discard the cleaning.
wrangled = StopWordsRemover(inputCol='words', outputCol='terms')\
      .transform(wrangled)

# Apply the hashing trick
wrangled = HashingTF(inputCol = 'terms', outputCol = 'hash', numFeatures=1024)\
      .transform(wrangled)

# Convert hashed symbols to TF-IDF
tf_idf = IDF(inputCol = 'hash', outputCol = 'features')\
      .fit(wrangled).transform(wrangled)
      
tf_idf.select('terms', 'features').show(4, truncate=False)
# Model

# Split the data into training and testing sets.
# The split is taken from `tf_idf`, the frame that holds the 'features' column.
# NOTE(review): assumes `sms` (and therefore `tf_idf`) has a 'label' column — confirm.
sms_train, sms_test = tf_idf.randomSplit([0.8,0.2], seed = 13)

# Fit a Logistic Regression model to the training data
# (LogisticRegression is imported in the Logistic Regression Model section)
logistic = LogisticRegression(regParam=0.2).fit(sms_train)

# Make predictions on the testing data
prediction = logistic.transform(sms_test)

# Create a confusion matrix, comparing predictions to known labels
prediction.groupBy('label','prediction').count().show()

## **Regression**

## **One-Hot Encoding**

In [None]:
from pyspark.ml.feature import OneHotEncoderEstimator

# One-hot encode the indexed 'type' column into the 'type_dummy' vector column
onehot = OneHotEncoderEstimator(inputCols = ['type_idx'], outputCols = ['type_dummy'])

# Fit the encoder to the data
onehot = onehot.fit(cars)

# How many category levels?
# (printed explicitly: only the last expression of a cell is displayed)
print(onehot.categorySizes)

cars = onehot.transform(cars)
# Show each category next to its index and dummy vector.
# The index column is 'type_idx' (the previous 'typer_idx' does not exist).
cars.select('type','type_idx','type_dummy').distinct().sort('type_idx').show()

# Dense versus sparse
from pyspark.mllib.linalg import DenseVector, SparseVector

# --------------------------------------------------------------------------------
# Import the one hot encoder class
from pyspark.ml.feature import OneHotEncoderEstimator

# Create an instance of the one hot encoder
onehot = OneHotEncoderEstimator(inputCols=['org_idx'], outputCols =  ['org_dummy'])

# Apply the one hot encoder to the flights data
# NOTE(review): assumes `flights` already carries the 'org_idx' column — confirm
onehot = onehot.fit(flights)
flights_onehot = onehot.transform(flights)

# Check the results
flights_onehot.select('org', 'org_idx', 'org_dummy').distinct().sort('org_idx').show()



## **Regression**

In [None]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Cars example: predict the 'consumption' column
regression = LinearRegression(labelCol = 'consumption')

regression = regression.fit(cars_train)
predictions = regression.transform(cars_test)

# Examine intercept
print(regression.intercept)

# Examine Coefficients
print(regression.coefficients)

# Evaluate on the testing data. One evaluator instance is reused so that the
# `metricName` key in the param map belongs to the evaluator being called.
consumption_evaluator = RegressionEvaluator(labelCol = 'consumption')

# Calculate RMSE (the evaluator's default metric)
print(consumption_evaluator.evaluate(predictions))

# Calculate R^2 by overriding the metric for this call only
consumption_evaluator.evaluate(predictions, {consumption_evaluator.metricName: "r2"})
#---------------------------------------------------------------------------------------------------
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Create a regression object and train on training data
regression = LinearRegression(labelCol = 'duration').fit(flights_train)

# Create predictions for the testing data and take a look at the predictions
predictions = regression.transform(flights_test)
predictions.select('duration', 'prediction').show(5, False)

# Calculate the RMSE (default metric of RegressionEvaluator)
RegressionEvaluator(labelCol = 'duration').evaluate(predictions)

# Intercept (average minutes on ground)
inter = regression.intercept
print(inter)

# Coefficients
coefs = regression.coefficients
print(coefs)

# Average minutes per km
# NOTE(review): assumes 'km' is the first assembled feature — confirm
minutes_per_km = regression.coefficients[0]
print(minutes_per_km)

# Average speed in km per hour
avg_speed =  60/ minutes_per_km

print(avg_speed)
 


## **Bucketing**

In [None]:
from pyspark.ml.feature import Bucketizer

# Bucket 'rpm' using the given split points.
# The keyword is `outputCol` (the previous `outpulCol` is not a Bucketizer argument).
bucketizer = Bucketizer(splits = [3500,4500,6000,6500],
                        inputCol = 'rpm',
                        outputCol = 'rpm_bin')
# Applying buckets
cars = bucketizer.transform(cars)

#------------------------------------------------------------------------
from pyspark.ml.feature import Bucketizer, OneHotEncoderEstimator

# Create buckets at 3 hour intervals through the day
buckets = Bucketizer(splits=[0,3,6,9,12,15,18,21,24], inputCol = 'depart', outputCol = 'depart_bucket')

# Bucket the departure times
bucketed = buckets.transform(flights)
bucketed.select('depart','depart_bucket').show(5)

# Create a one-hot encoder
onehot = OneHotEncoderEstimator(inputCols = ['depart_bucket'], outputCols = ['depart_dummy'])

# One-hot encode the bucketed departure times
flights_onehot = onehot.fit(bucketed).transform(bucketed)
flights_onehot.select('depart','depart_bucket','depart_dummy').show(5)

# Find the RMSE on testing data
# NOTE(review): `regression` and `predictions` here are whatever earlier cells left
# in scope; the coefficient positions used below depend on that model's feature
# layout — confirm before relying on them.
from pyspark.ml.evaluation import RegressionEvaluator
RegressionEvaluator(labelCol = 'duration').evaluate(predictions)

# Average minutes on ground at OGG for flights departing between 21:00 and 24:00
avg_eve_ogg = regression.intercept
print(avg_eve_ogg)

# Average minutes on ground at OGG for flights departing between 00:00 and 03:00
avg_night_ogg = regression.intercept + regression.coefficients[8]
print(avg_night_ogg)

# Average minutes on ground at JFK for flights departing between 00:00 and 03:00
avg_night_jfk = regression.intercept + regression.coefficients[3] + regression.coefficients[8]

print(avg_night_jfk)

from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Fit Lasso model (α = 1) to training data
regression = LinearRegression(labelCol = 'duration', regParam = 1, elasticNetParam=1).fit(flights_train)

predictions = regression.transform(flights_test)

# Calculate the RMSE on testing data (reusing the predictions computed above
# instead of transforming the testing set a second time)
rmse = RegressionEvaluator(labelCol = 'duration').evaluate(predictions)
print("The test RMSE is", rmse)

# Look at the model coefficients
coeffs = regression.coefficients
print(coeffs)   

# Number of zero coefficients
zero_coeff = sum([beta == 0 for beta in regression.coefficients])
print("Number of coefficients equal to 0:", zero_coeff)

## **Ensembles & Pipelines**

## **Pipeline**

In [None]:
from pyspark.ml import Pipeline

# Create a Pipeline to specify all stages of the process
# NOTE(review): the stage names now refer to objects that exist in this notebook
# (`indexer`, `assembler`; the previous `indexes`/`assemble` were undefined).
# They must be the stages configured for the cars data — confirm.
pipeline = Pipeline(stages = [indexer,onehot,assembler,regression])

# A single call to fit() runs the training set through every stage.
# The fitted PipelineModel is kept under its own name so it is not overwritten.
pipeline_model = pipeline.fit(cars_train)

# transform() method to make predictions
predictions = pipeline_model.transform(cars_test)

# Access each stage of the fitted pipeline through `stages`
# (index 3 is the regression model, the fourth stage)
print(pipeline_model.stages[3])
print(pipeline_model.stages[3].intercept)
print(pipeline_model.stages[3].coefficients)

#----------------------------------------------------------------------------
# Convert categorical strings to index values
indexer = StringIndexer(inputCol = 'org', outputCol = 'org_idx')

# One-hot encode index values
onehot = OneHotEncoderEstimator(
    inputCols= ['org_idx','dow'],
    outputCols= ['org_dummy','dow_dummy']
)

# Assemble predictors into a single column
assembler = VectorAssembler(inputCols=['km','org_dummy','dow_dummy'], outputCol= 'features')

# A linear regression object
regression = LinearRegression(labelCol='duration')

# Import class for creating a pipeline
from pyspark.ml import Pipeline

# Construct a pipeline
pipeline = Pipeline(stages=[indexer,onehot,assembler,regression])

# Train the pipeline on the training data
pipeline = pipeline.fit(flights_train)

# Make predictions on the testing data
predictions = pipeline.transform(flights_test)

#### Pipeline of logistic regression
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF

# Break text into tokens at non-word characters
tokenizer = Tokenizer(inputCol='text', outputCol='words')

# Remove stop words
remover = StopWordsRemover(inputCol= tokenizer.getOutputCol(), outputCol='terms')

# Apply the hashing trick and transform to TF-IDF
hasher = HashingTF(inputCol= remover.getOutputCol(), outputCol="hash")
idf = IDF(inputCol= hasher.getOutputCol(), outputCol="features")

# Create a logistic regression object and add everything to a pipeline
logistic = LogisticRegression()
pipeline = Pipeline(stages=[tokenizer, remover, hasher, idf, logistic])




## **Cross-Validation**

In [None]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# A grid of parameter values (empty for the moment)
params = ParamGridBuilder().build()

# Objects for building and evaluating the cars regression model.
# They are defined here so this cell does not rely on whatever `regression` and
# `evaluator` were left in scope by earlier cells.
regression = LinearRegression(labelCol = 'consumption')
evaluator = RegressionEvaluator(labelCol = 'consumption')

# Cross-Validation object
cv = CrossValidator(estimator = regression,
                    estimatorParamMaps = params,
                    evaluator = evaluator,
                    numFolds = 10,
                    seed = 13)
cv = cv.fit(cars_train)

# Average cross-validated RMSE (RMSE is the evaluator's default metric)
print(cv.avgMetrics)

# RMSE on the testing data
print(evaluator.evaluate(cv.transform(cars_test)))

# ---------------------------------------------------------------
# Create an empty parameter grid
params = ParamGridBuilder().build()

# Create objects for building and evaluating a regression model
regression = LinearRegression(labelCol = 'duration')
evaluator = RegressionEvaluator(labelCol = 'duration')

# Create a cross validator
cv = CrossValidator(estimator= regression, estimatorParamMaps= params, evaluator= evaluator, numFolds = 5)

# Train and test model on multiple folds of the training data
cv = cv.fit(flights_train)

# NOTE: Since cross-validation builds multiple models, the fit() method can take a little while to complete.

# Create an indexer for the org field
indexer = StringIndexer(inputCol = 'org', outputCol = 'org_idx')

# Create a one-hot encoder for the indexed org field
onehot = OneHotEncoderEstimator(inputCols = ['org_idx'], outputCols = ['org_dummy'])

# Assemble the km and one-hot encoded fields
assembler = VectorAssembler(inputCols = ['km','org_dummy'], outputCol = 'features')

# Create a pipeline and cross-validator.
pipeline = Pipeline(stages=[indexer, onehot, assembler, regression])
cv = CrossValidator(estimator= pipeline,
          estimatorParamMaps= params,
          evaluator = evaluator)

## **Grid Search**

In [None]:
from pyspark.ml.tuning import ParamGridBuilder

# Build a one-parameter grid: a model is built for each point in the grid,
# here with and without an intercept
grid_builder = ParamGridBuilder()
grid_builder = grid_builder.addGrid(regression.fitIntercept, [True, False])
params = grid_builder.build()

# How many models?
print('Number of models to be tested:', len(params))

# Cross-validate over the grid with 10 folds and a fixed seed
cv = CrossValidator(estimator=regression, estimatorParamMaps=params, evaluator=evaluator)
cv = cv.setNumFolds(10)
cv = cv.setSeed(13)
cv = cv.fit(cars_train)
cv.avgMetrics

# Access the best model
cv.bestModel

# Predictions
predictions = cv.transform(cars_test)

# Retrieve the best parameter
cv.bestModel.explainParam('fitIntercept')


### More complicated Grid
grid_builder = ParamGridBuilder()
grid_builder = grid_builder.addGrid(regression.fitIntercept, [True, False])
grid_builder = grid_builder.addGrid(regression.regParam, [0.001, 0.01,0.1,1,10])
grid_builder = grid_builder.addGrid(regression.elasticNetParam, [0,0.25,0.5,0.75,1])
params = grid_builder.build()

# How many models now?
print('Number of models to be tested:', len(params))

###
# Create a parameter grid with two parameters and build it in one chain
params = (
    ParamGridBuilder()
    .addGrid(regression.regParam,[0.01,0.1,1.0,10.0])
    .addGrid(regression.elasticNetParam,[0.0,0.5,1.0])
    .build()
)
print('Number of models to be tested: ', len(params))

# Create cross-validator
cv = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=params,
    evaluator=evaluator,
    numFolds=5,
)




