In [None]:
# Spark Decision Tree
# Install PySpark
# https://sparkbyexamples.com/pyspark/install-pyspark-in-anaconda-jupyter-notebook/

In [1]:
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree
from pyspark import SparkConf, SparkContext
from numpy import array

# ----------------------------------------------------------
# PCP 20230328
import os
import sys
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

# ----------------------------------------------------------


# Boilerplate Spark stuff:
conf = SparkConf().setMaster("local").setAppName("SparkDecisionTree")
sc = SparkContext(conf = conf)

# Some functions that convert our CSV input data into numerical
# features for each job candidate
def binary(YN):
    if (YN == 'Y'):
        return 1
    else:
        return 0

def mapEducation(degree):
    if (degree == 'BS'):
        return 1
    elif (degree =='MS'):
        return 2
    elif (degree == 'PhD'):
        return 3
    else:
        return 0

# Convert a list of raw fields from our CSV file to a
# LabeledPoint that MLLib can use. All data must be numerical...
def createLabeledPoints(fields):
    yearsExperience = int(fields[0])
    employed = binary(fields[1])
    previousEmployers = int(fields[2])
    educationLevel = mapEducation(fields[3])
    topTier = binary(fields[4])
    interned = binary(fields[5])
    hired = binary(fields[6])

    return LabeledPoint(hired, array([yearsExperience, employed,
        previousEmployers, educationLevel, topTier, interned]))

#Load up our CSV file, and filter out the header line with the column names
# rawData = sc.textFile("e:/sundog-consult/udemy/datascience/PastHires.csv")
# "C:\Users\pcpow\OneDrive\Desktop\DataScience_Udemy_20230321\DataScience\DataScience-Python3\PastHires.csv"
rawData = sc.textFile("C:/Users/pcpow/OneDrive/Desktop/DataScience_Udemy_20230321/DataScience/DataScience-Python3/PastHires.csv")


header = rawData.first()
rawData = rawData.filter(lambda x:x != header)

# Split each line into a list based on the comma delimiters
csvData = rawData.map(lambda x: x.split(","))

# Convert these lists to LabeledPoints
trainingData = csvData.map(createLabeledPoints)

# Create a test candidate, with 10 years of experience, currently employed,
# 3 previous employers, a BS degree, but from a non-top-tier school where
# he or she did not do an internship. You could of course load up a whole
# huge RDD of test candidates from disk, too.
testCandidates = [ array([10, 1, 3, 1, 0, 0])]
testData = sc.parallelize(testCandidates)

# Train our DecisionTree classifier using our data set
model = DecisionTree.trainClassifier(trainingData, numClasses=2,
                                     categoricalFeaturesInfo={1:2, 3:4, 4:2, 5:2},
                                     impurity='gini', maxDepth=5, maxBins=32)

# Now get predictions for our unknown candidates. (Note, you could separate
# the source data into a training set and a test set while tuning
# parameters and measure accuracy as you go!)
predictions = model.predict(testData)
print('Hire prediction:')
results = predictions.collect()
for result in results:
    print(result)

# We can also print out the decision tree itself:
print('Learned classification tree model:')
print(model.toDebugString())


Hire prediction:
1.0
Learned classification tree model:
DecisionTreeModel classifier of depth 4 with 9 nodes
  If (feature 1 in {0.0})
   If (feature 5 in {0.0})
    If (feature 0 <= 0.5)
     If (feature 3 in {1.0})
      Predict: 0.0
     Else (feature 3 not in {1.0})
      Predict: 1.0
    Else (feature 0 > 0.5)
     Predict: 0.0
   Else (feature 5 not in {0.0})
    Predict: 1.0
  Else (feature 1 not in {0.0})
   Predict: 1.0



In [8]:
import findspark
findspark.init()
import pyspark
import random
sc = pyspark.SparkContext(appName="Pi")
num_samples = 100000000
def inside(p):     
  x, y = random.random(), random.random()
  return x*x + y*y < 1
count = sc.parallelize(range(0, num_samples)).filter(inside).count()
pi = 4 * count / num_samples
print(pi)
sc.stop()

3.14164644


In [9]:
count

78541161

In [3]:
from platform import python_version

print(python_version())

3.7.4


In [None]:
from platform import python_version

print(python_version())

In [5]:
import os

# Set spark environments
# os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3'
# os.environ['PYSPARK_DRIVER_PYTHON'] = '/usr/bin/python3'
print (os.environ['PYSPARK_PYTHON'])
print(os.environ['PYSPARK_DRIVER_PYTHON'])

KeyError: 'PYSPARK_PYTHON'

In [6]:
python --version


NameError: name 'python' is not defined

In [7]:
conda search python


Loading channels: ...working... done
# Name                       Version           Build  Channel             
python                        2.7.13     h1b6d89f_16  pkgs/main           
python                        2.7.13     h9912b81_15  pkgs/main           
python                        2.7.13     hb034564_12  pkgs/main           
python                        2.7.14     h2765ee6_18  pkgs/main           
python                        2.7.14     h3e68818_15  pkgs/main           
python                        2.7.14     h4084c39_22  pkgs/main           
python                        2.7.14     h4a10d90_30  pkgs/main           
python                        2.7.14     h4a10d90_31  pkgs/main           
python                        2.7.14     h59f5a59_20  pkgs/main           
python                        2.7.14     h819644d_16  pkgs/main           
python                        2.7.14     h8c3f1cb_23  pkgs/main           
python                        2.7.15      h2880e7c_2  pkgs/main

In [None]:
conda install python=3.10.10



In [None]:
from platform import python_version

print(python_version())