## Sara Khosravi
###############################################################################################################################

In [1]:
# Set up the environment so this kernel can import PySpark from the cluster's
# Spark 2 client installation.
import glob
import os
import sys

os.environ["SPARK_HOME"] = "/usr/hdp/current/spark2-client"
os.environ["PYLIB"] = os.environ["SPARK_HOME"] + "/python/lib"
# In below two lines, use /usr/bin/python2.7 if you want to use Python 2
os.environ["PYSPARK_PYTHON"] = "/usr/local/anaconda/bin/python"
os.environ["PYSPARK_DRIVER_PYTHON"] = "/usr/local/anaconda/bin/python"

# Use whichever py4j archive is present under PYLIB instead of assuming one
# exact version; fall back to the original hard-coded 0.10.4 name when the
# glob finds nothing.
py4j_zips = sorted(glob.glob(os.environ["PYLIB"] + "/py4j-*-src.zip"))
py4j_zip = py4j_zips[-1] if py4j_zips else os.environ["PYLIB"] + "/py4j-0.10.4-src.zip"
pyspark_zip = os.environ["PYLIB"] + "/pyspark.zip"

# Put both archives at the front of sys.path (pyspark.zip ends up first).
# Any earlier copies are removed first so re-running this cell does not
# accumulate duplicate entries.
for zip_path in (py4j_zip, pyspark_zip):
    while zip_path in sys.path:
        sys.path.remove(zip_path)
    sys.path.insert(0, zip_path)

In [2]:
# Create the SparkSession used by every later cell, or reuse one that is
# already running.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("Classification of Star Type")
    .getOrCreate()
)

In [3]:
# Load the star dataset; the first CSV row holds the column names and the
# column types are inferred from the data.
df = spark.read.csv(
    'data/StarType.csv',
    header=True,
    inferSchema=True,
)
df.show()

+---+-----------+--------------------+-------------------+------------------+---------+----------+--------------+
|_c0|temperature|          luminosity|             radius|absolute_magnitude|star_type|star_color|spectral_class|
+---+-----------+--------------------+-------------------+------------------+---------+----------+--------------+
|  0|       3068|              0.0024|               0.17|             16.12|        0|        10|             5|
|  1|       3042|              5.0E-4|             0.1542|              16.6|        0|        10|             5|
|  2|       2600|              3.0E-4|              0.102|              18.7|        0|        10|             5|
|  3|       2800|              2.0E-4|               0.16|             16.65|        0|        10|             5|
|  4|       1939|             1.38E-4|0.10300000000000001|             20.06|        0|        10|             5|
|  5|       2840|              6.5E-4|               0.11|             16.98|        0| 

In [4]:
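# Renaming the columns is left disabled: the CSV header already supplies the column names.
# NOTE: df has 8 columns (including `_c0`), so toDF() would need 8 names, not the 7 listed below.
#df = df.toDF('Temperature', 'Luminosity', 'Radius', 'Absolute magnitude', 'Star type', 'Star color', 'Spectral Class')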

In [5]:
# Preview the first five rows of the loaded frame.
df.show(n=5)

+---+-----------+----------+-------------------+------------------+---------+----------+--------------+
|_c0|temperature|luminosity|             radius|absolute_magnitude|star_type|star_color|spectral_class|
+---+-----------+----------+-------------------+------------------+---------+----------+--------------+
|  0|       3068|    0.0024|               0.17|             16.12|        0|        10|             5|
|  1|       3042|    5.0E-4|             0.1542|              16.6|        0|        10|             5|
|  2|       2600|    3.0E-4|              0.102|              18.7|        0|        10|             5|
|  3|       2800|    2.0E-4|               0.16|             16.65|        0|        10|             5|
|  4|       1939|   1.38E-4|0.10300000000000001|             20.06|        0|        10|             5|
+---+-----------+----------+-------------------+------------------+---------+----------+--------------+
only showing top 5 rows



In [6]:
# Check for missing values.
# All per-column null counts are computed in one aggregation (a single pass
# over the data) instead of running a separate filter + count for every column.
# The printed lines are the same as before.
from pyspark.sql import functions as F

# when(...) yields 1 for null cells and null otherwise; count() only counts
# non-null values, so each aggregate is the number of nulls in that column.
null_counts = df.select(
    [F.count(F.when(df[name].isNull(), 1)).alias(name) for name in df.columns]
).first()

for name in df.columns:
    print("no. of cells in column", name, "with null values:", null_counts[name])

no. of cells in column _c0 with null values: 0
no. of cells in column temperature with null values: 0
no. of cells in column luminosity with null values: 0
no. of cells in column radius with null values: 0
no. of cells in column absolute_magnitude with null values: 0
no. of cells in column star_type with null values: 0
no. of cells in column star_color with null values: 0
no. of cells in column spectral_class with null values: 0


In [8]:
# The ML estimators below read their inputs from a single vector-typed column,
# so the six predictor columns are packed into "features".
# `_c0` is not part of the inputs, and `star_type` is kept as the label.
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(
    inputCols=[
        'temperature',
        'luminosity',
        'radius',
        'absolute_magnitude',
        'star_color',
        'spectral_class',
    ],
    outputCol="features",
)
feature_vec = assembler.transform(df).select('features', 'star_type')
feature_vec.show(5)

+--------------------+---------+
|            features|star_type|
+--------------------+---------+
|[3068.0,0.0024,0....|        0|
|[3042.0,5.0E-4,0....|        0|
|[2600.0,3.0E-4,0....|        0|
|[2800.0,2.0E-4,0....|        0|
|[1939.0,1.38E-4,0...|        0|
+--------------------+---------+
only showing top 5 rows



In [9]:
# Number of rows per target class.
# The output shows the classes are balanced, so there is no data imbalance.
(
    feature_vec
    .groupBy('star_type')
    .count()
    .show()
)

+---------+-----+
|star_type|count|
+---------+-----+
|        1|   40|
|        3|   40|
|        5|   40|
|        4|   40|
|        2|   40|
|        0|   40|
+---------+-----+



In [10]:
# Randomly split the assembled data into training and test sets using
# 0.75 / 0.25 weights; the seed is fixed at 0.
train_data, test_data = feature_vec.randomSplit(weights=[0.75, 0.25], seed=0)

In [11]:
from pyspark.ml.classification import LogisticRegression

# Multinomial logistic regression on the assembled feature vector.
# elasticNetParam=0.0 combined with a small regParam gives a light L2 penalty.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="star_type",
    family="multinomial",
    maxIter=100,
    regParam=0.0001,
    elasticNetParam=0.0,
)

# Fit on the training split, then score the held-out test split.
lrModel = lr.fit(train_data)
predictions = lrModel.transform(test_data)
predictions.printSchema()

root
 |-- features: vector (nullable = true)
 |-- star_type: integer (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = true)



In [12]:
# Put the true labels next to the logistic-regression predictions.
(
    predictions
    .select('star_type', 'prediction')
    .show()
)

+---------+----------+
|star_type|prediction|
+---------+----------+
|        0|       0.0|
|        1|       1.0|
|        1|       1.0|
|        0|       0.0|
|        0|       0.0|
|        0|       0.0|
|        0|       0.0|
|        0|       0.0|
|        1|       1.0|
|        1|       1.0|
|        1|       1.0|
|        0|       0.0|
|        0|       0.0|
|        4|       4.0|
|        0|       0.0|
|        1|       1.0|
|        5|       5.0|
|        0|       0.0|
|        5|       5.0|
|        5|       5.0|
+---------+----------+
only showing top 20 rows



In [13]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy of the logistic-regression predictions on the test split.
evaluator = MulticlassClassificationEvaluator(
    labelCol='star_type',
    predictionCol='prediction',
    metricName='accuracy',
)
evaluator.evaluate(predictions)

1.0

In [14]:
# F1 score of the logistic-regression predictions on the test split.
# Note that this rebinds `evaluator` to an F1 evaluator.
evaluator = MulticlassClassificationEvaluator(
    labelCol='star_type',
    predictionCol='prediction',
    metricName='f1',
)
evaluator.evaluate(predictions)

1.0

In [15]:
# Number of test rows per true class.
(
    predictions
    .groupBy('star_type')
    .count()
    .show()
)

+---------+-----+
|star_type|count|
+---------+-----+
|        1|    6|
|        3|    3|
|        5|   12|
|        4|    7|
|        2|   14|
|        0|   10|
+---------+-----+



In [21]:
# Grid search over random-forest hyperparameters with 4-fold cross-validation.
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

rf = RandomForestClassifier(labelCol='star_type', featuresCol='features', seed=0)

# 3 tree depths x 3 forest sizes = 9 candidate settings.
paramGrid = (ParamGridBuilder()
             .addGrid(rf.maxDepth, [10, 11, 12])
             .addGrid(rf.numTrees, [20, 30, 40])
             .build())

# Candidates are ranked by F1 score.
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='star_type', metricName='f1')

# Create 4-fold CrossValidator.
# The seed fixes how rows are assigned to folds, so the cross-validated scores
# are repeatable across runs. The folds were previously unseeded, so freshly
# computed scores may differ slightly from earlier recorded outputs.
cv = CrossValidator(estimator=rf, estimatorParamMaps=paramGrid, evaluator=evaluator,
                    numFolds=4, seed=0)

cvModel = cv.fit(train_data)

In [22]:
# Pair each candidate parameter map with its average cross-validation metric.
[
    (avg_metric, param_map)
    for avg_metric, param_map in zip(cvModel.avgMetrics, cvModel.getEstimatorParamMaps())
]

[(0.9761535708787319,
  {Param(parent='RandomForestClassifier_4ec886f4fbb6325606f3', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 10,
   Param(parent='RandomForestClassifier_4ec886f4fbb6325606f3', name='numTrees', doc='Number of trees to train (>= 1).'): 20}),
 (0.9761535708787319,
  {Param(parent='RandomForestClassifier_4ec886f4fbb6325606f3', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 10,
   Param(parent='RandomForestClassifier_4ec886f4fbb6325606f3', name='numTrees', doc='Number of trees to train (>= 1).'): 30}),
 (0.9761535708787319,
  {Param(parent='RandomForestClassifier_4ec886f4fbb6325606f3', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 10,
   Param(parent='RandomForestClassifier_4ec886f4fbb6325606f3', na

In [23]:
# Best model params: the (average metric, parameter map) pair with the highest
# average cross-validation metric. On ties, max() keeps the first such pair.
score_params_list = list(
    zip(
        cvModel.avgMetrics,
        cvModel.getEstimatorParamMaps(),
    )
)
max(score_params_list, key=lambda pair: pair[0])

(0.9761535708787319,
 {Param(parent='RandomForestClassifier_4ec886f4fbb6325606f3', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 10,
  Param(parent='RandomForestClassifier_4ec886f4fbb6325606f3', name='numTrees', doc='Number of trees to train (>= 1).'): 20})

In [24]:
# Score the test split with the best model found by cross-validation.
# This rebinds `predictions`, replacing the logistic-regression predictions.
best_rf_model = cvModel.bestModel
predictions = best_rf_model.transform(test_data)


In [25]:
# F1 score of the random-forest predictions on the test split.
# `evaluator` is rebound in several cells of this notebook (accuracy in one,
# F1 in others), so the metric is passed explicitly as a param override. The
# result then does not depend on which of those cells was executed last.
evaluator.evaluate(predictions, {evaluator.metricName: 'f1'})

0.9615384615384615

In [None]:
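# Random Forest (best model from the grid search): a score of about 0.96 on the test set.
# Note: the evaluator defined in the grid-search cell uses metricName='f1', so as the notebook
# is written this score is an F1 score rather than accuracy.
# Logistic Regression: 1.0 accuracy (and 1.0 F1) on the same test set.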