<a href="https://colab.research.google.com/github/theRubyPheonix/AttentiveFP/blob/proxyzzz/Prog4BigData_Lab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Apache Spark Machine Learning using Dataframes in Google Colab**

In [54]:
# Set up Apache Spark inside the Google Colab VM.
# Spark 3.3.2 is taken from https://archive.apache.org/dist/spark
# (recorded as the latest version when this notebook was written).
# Install a headless Java 8 JDK; -qq plus the /dev/null redirect hide apt's output.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Quietly (-q) download the Spark 3.3.2 archive into the working directory.
!wget -q https://archive.apache.org/dist/spark/spark-3.3.2/spark-3.3.2-bin-hadoop2.tgz
# Unpack the archive; SPARK_HOME below points at the extracted spark-3.3.2-bin-hadoop2 folder.
!tar xf spark-3.3.2-bin-hadoop2.tgz
# findspark is imported in a later cell before the first pyspark import.
!pip install -q findspark 

In [55]:
# Point this process at the Java 8 JDK and the unpacked Spark distribution.
import os

os.environ.update(
    JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64",
    SPARK_HOME="/content/spark-3.3.2-bin-hadoop2",
)

In [56]:
# Prepare the kernel for a local Spark session.
# findspark.init() is called with no arguments here; presumably it locates Spark
# through the SPARK_HOME variable set in the previous cell (verify against the
# findspark documentation). It must run before the first `pyspark` import.
import findspark
findspark.init()

In [57]:
# Create (or reuse) the Spark session used by every later cell:
# local master, application name "Colab", Spark UI on port 4050.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .master("local")
    .appName("Colab")
    .config("spark.ui.port", "4050")
    .getOrCreate()
)
spark

In [58]:
# Download the Iris dataset from the UCI repository and save it as
# sample_data/iris.data (-O sets the output path).
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data -O sample_data/iris.data

--2023-03-20 23:27:31--  https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4551 (4.4K) [application/x-httpd-php]
Saving to: ‘sample_data/iris.data’


2023-03-20 23:27:32 (171 MB/s) - ‘sample_data/iris.data’ saved [4551/4551]



In [60]:
# Load the Iris file into a Spark DataFrame. inferSchema=True asks Spark to
# work out the column types, and toDF assigns the five column names.
df = (
    spark.read
    .csv('sample_data/iris.data', inferSchema=True)
    .toDF('SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth', 'Class')
)

# Feature Extraction

In [61]:
# Preview the loaded Iris rows to confirm the column names and values.
df.show()

+-----------+----------+-----------+----------+-----------+
|SepalLength|SepalWidth|PetalLength|PetalWidth|      Class|
+-----------+----------+-----------+----------+-----------+
|        5.1|       3.5|        1.4|       0.2|Iris-setosa|
|        4.9|       3.0|        1.4|       0.2|Iris-setosa|
|        4.7|       3.2|        1.3|       0.2|Iris-setosa|
|        4.6|       3.1|        1.5|       0.2|Iris-setosa|
|        5.0|       3.6|        1.4|       0.2|Iris-setosa|
|        5.4|       3.9|        1.7|       0.4|Iris-setosa|
|        4.6|       3.4|        1.4|       0.3|Iris-setosa|
|        5.0|       3.4|        1.5|       0.2|Iris-setosa|
|        4.4|       2.9|        1.4|       0.2|Iris-setosa|
|        4.9|       3.1|        1.5|       0.1|Iris-setosa|
|        5.4|       3.7|        1.5|       0.2|Iris-setosa|
|        4.8|       3.4|        1.6|       0.2|Iris-setosa|
|        4.8|       3.0|        1.4|       0.1|Iris-setosa|
|        4.3|       3.0|        1.1|    

In [63]:
# Keep only the label and the assembled vector by dropping the four raw
# measurement columns, then display the result.
# NOTE(review): df_temp is created by the VectorAssembler cell (In [62]), which
# sits further down in this notebook. On a fresh top-to-bottom run df_temp is
# not defined yet at this point, so that cell should be moved above this one.
df_temp = df_temp.drop('SepalLength','SepalWidth','PetalLength','PetalWidth')
df_temp.show()

+-----------+-----------------+
|      Class|         features|
+-----------+-----------------+
|Iris-setosa|[5.1,3.5,1.4,0.2]|
|Iris-setosa|[4.9,3.0,1.4,0.2]|
|Iris-setosa|[4.7,3.2,1.3,0.2]|
|Iris-setosa|[4.6,3.1,1.5,0.2]|
|Iris-setosa|[5.0,3.6,1.4,0.2]|
|Iris-setosa|[5.4,3.9,1.7,0.4]|
|Iris-setosa|[4.6,3.4,1.4,0.3]|
|Iris-setosa|[5.0,3.4,1.5,0.2]|
|Iris-setosa|[4.4,2.9,1.4,0.2]|
|Iris-setosa|[4.9,3.1,1.5,0.1]|
|Iris-setosa|[5.4,3.7,1.5,0.2]|
|Iris-setosa|[4.8,3.4,1.6,0.2]|
|Iris-setosa|[4.8,3.0,1.4,0.1]|
|Iris-setosa|[4.3,3.0,1.1,0.1]|
|Iris-setosa|[5.8,4.0,1.2,0.2]|
|Iris-setosa|[5.7,4.4,1.5,0.4]|
|Iris-setosa|[5.4,3.9,1.3,0.4]|
|Iris-setosa|[5.1,3.5,1.4,0.3]|
|Iris-setosa|[5.7,3.8,1.7,0.3]|
|Iris-setosa|[5.1,3.8,1.5,0.3]|
+-----------+-----------------+
only showing top 20 rows



In [64]:
# Final data-preparation step: index the Class column so the label is numeric
# rather than text, then display the Class, features and ClassIndex columns.
from pyspark.ml.feature import StringIndexer

l_indexer = StringIndexer(inputCol='Class', outputCol='ClassIndex')

# Drop any ClassIndex left over from an earlier run of this cell (a no-op the
# first time). StringIndexer refuses to write into an output column that
# already exists, so without this the cell could only run once per kernel.
df_unindexed = df_temp.drop('ClassIndex')
df_temp = l_indexer.fit(df_unindexed).transform(df_unindexed)
df_temp.show()

+-----------+-----------------+----------+
|      Class|         features|ClassIndex|
+-----------+-----------------+----------+
|Iris-setosa|[5.1,3.5,1.4,0.2]|       0.0|
|Iris-setosa|[4.9,3.0,1.4,0.2]|       0.0|
|Iris-setosa|[4.7,3.2,1.3,0.2]|       0.0|
|Iris-setosa|[4.6,3.1,1.5,0.2]|       0.0|
|Iris-setosa|[5.0,3.6,1.4,0.2]|       0.0|
|Iris-setosa|[5.4,3.9,1.7,0.4]|       0.0|
|Iris-setosa|[4.6,3.4,1.4,0.3]|       0.0|
|Iris-setosa|[5.0,3.4,1.5,0.2]|       0.0|
|Iris-setosa|[4.4,2.9,1.4,0.2]|       0.0|
|Iris-setosa|[4.9,3.1,1.5,0.1]|       0.0|
|Iris-setosa|[5.4,3.7,1.5,0.2]|       0.0|
|Iris-setosa|[4.8,3.4,1.6,0.2]|       0.0|
|Iris-setosa|[4.8,3.0,1.4,0.1]|       0.0|
|Iris-setosa|[4.3,3.0,1.1,0.1]|       0.0|
|Iris-setosa|[5.8,4.0,1.2,0.2]|       0.0|
|Iris-setosa|[5.7,4.4,1.5,0.4]|       0.0|
|Iris-setosa|[5.4,3.9,1.3,0.4]|       0.0|
|Iris-setosa|[5.1,3.5,1.4,0.3]|       0.0|
|Iris-setosa|[5.7,3.8,1.7,0.3]|       0.0|
|Iris-setosa|[5.1,3.8,1.5,0.3]|       0.0|
+----------

In [62]:
# Spark ML estimators read a single features column, so the four numeric
# measurements are packed into one vector column named 'features'.
# NOTE(review): this cell was executed as In [62], before the two cells that
# appear above it (In [63], In [64]); those cells consume the df_temp built here.
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

vector_assembler = VectorAssembler(
    inputCols=['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth'],
    outputCol='features',
)
df_temp = vector_assembler.transform(df)
df_temp.show(3)

+-----------+----------+-----------+----------+-----------+-----------------+
|SepalLength|SepalWidth|PetalLength|PetalWidth|      Class|         features|
+-----------+----------+-----------+----------+-----------+-----------------+
|        5.1|       3.5|        1.4|       0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|
|        4.9|       3.0|        1.4|       0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|
|        4.7|       3.2|        1.3|       0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|
+-----------+----------+-----------+----------+-----------+-----------------+
only showing top 3 rows



# Split Training and Test

In [65]:
# Split the prepared data into training and test sets with weights 0.7 / 0.3.
# A fixed seed is passed so the same rows land in each set on every run, which
# lets the accuracy figures reported below be compared between reruns.
(trainingData, testData) = df_temp.randomSplit([0.7, 0.3], seed=42)

# Decision Tree 

In [66]:
# Decision tree classifier: fit on the training split, using the 'features'
# vector as input and 'ClassIndex' as the label.
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

dt = DecisionTreeClassifier(
    featuresCol='features',
    labelCol='ClassIndex',
)
model = dt.fit(trainingData)

In [67]:
# Apply the fitted model to the test split and compare the first 15
# predictions with the true ClassIndex side by side.
predictions = model.transform(testData)
predictions.select('prediction', 'ClassIndex').show(n=15)

+----------+----------+
|prediction|ClassIndex|
+----------+----------+
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
+----------+----------+
only showing top 15 rows



In [71]:
# Score the decision tree predictions: accuracy of 'prediction' against
# 'ClassIndex', reported together with the complementary error rate.
evaluator = MulticlassClassificationEvaluator(
    metricName='accuracy',
    labelCol='ClassIndex',
    predictionCol='prediction',
)
accuracy = evaluator.evaluate(predictions)
test_error = 1.0 - accuracy
print('Test Error = %g' % test_error)
print('Test Set Accuracy =' + str(accuracy))

Test Error = 0.0208333
Test Set Accuracy =0.9791666666666666


# Random Forest

In [74]:
# Random forest classifier: train a 10-tree forest on the training split,
# score the test split, and show predictions next to the true ClassIndex.
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(
    labelCol='ClassIndex',
    featuresCol='features',
    numTrees=10,
    seed=42,  # forests are randomized; pin the seed so reruns build the same trees
)
model = rf.fit(trainingData)
predictions = model.transform(testData)
predictions.select('prediction', 'ClassIndex').show()

Exception ignored in: <function JavaWrapper.__del__ at 0x7fc244746820>
Traceback (most recent call last):
  File "/content/spark-3.3.2-bin-hadoop2/python/pyspark/ml/wrapper.py", line 53, in __del__
    if SparkContext._active_spark_context and self._java_obj is not None:
AttributeError: 'RandomForestClassifier' object has no attribute '_java_obj'


+----------+----------+
|prediction|ClassIndex|
+----------+----------+
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
+----------+----------+
only showing top 20 rows



In [76]:
# Score the random forest predictions: accuracy of 'prediction' against
# 'ClassIndex', reported together with the complementary error rate.
evaluator = MulticlassClassificationEvaluator(
    metricName='accuracy',
    labelCol='ClassIndex',
    predictionCol='prediction',
)
accuracy = evaluator.evaluate(predictions)
test_error = 1.0 - accuracy
print('Test Error = %g' % test_error)
print('Test Set Accuracy = ' + str(accuracy))

Test Error = 0.0208333
Test Set Accuracy = 0.9791666666666666


# Naive Bayes Classifier

In [77]:
# Naive Bayes classifier (multinomial model, smoothing 1.0): train on the
# training split, score the test split, and show the class probabilities
# alongside the predicted and true labels.
from pyspark.ml.classification import NaiveBayes

nb = NaiveBayes(
    featuresCol='features',
    labelCol='ClassIndex',
    modelType='multinomial',
    smoothing=1.0,
)
model = nb.fit(trainingData)
predictions = model.transform(testData)
predictions.select('Class', 'ClassIndex', 'probability', 'prediction').show()

+-----------+----------+--------------------+----------+
|      Class|ClassIndex|         probability|prediction|
+-----------+----------+--------------------+----------+
|Iris-setosa|       0.0|[0.65854431190304...|       0.0|
|Iris-setosa|       0.0|[0.59601222050315...|       0.0|
|Iris-setosa|       0.0|[0.62416333661554...|       0.0|
|Iris-setosa|       0.0|[0.58353510118447...|       0.0|
|Iris-setosa|       0.0|[0.73347005973151...|       0.0|
|Iris-setosa|       0.0|[0.63851500041302...|       0.0|
|Iris-setosa|       0.0|[0.58445564581932...|       0.0|
|Iris-setosa|       0.0|[0.56430482772553...|       0.0|
|Iris-setosa|       0.0|[0.56269476579405...|       0.0|
|Iris-setosa|       0.0|[0.63074902380229...|       0.0|
|Iris-setosa|       0.0|[0.66959617354636...|       0.0|
|Iris-setosa|       0.0|[0.64492936285987...|       0.0|
|Iris-setosa|       0.0|[0.50846896200676...|       0.0|
|Iris-setosa|       0.0|[0.49918864496678...|       0.0|
|Iris-setosa|       0.0|[0.6496

In [78]:
# Score the Naive Bayes predictions: accuracy of 'prediction' against
# 'ClassIndex', reported together with the complementary error rate.
evaluator = MulticlassClassificationEvaluator(
    metricName='accuracy',
    labelCol='ClassIndex',
    predictionCol='prediction',
)
accuracy = evaluator.evaluate(predictions)
test_error = 1.0 - accuracy
print('Test Error = %g' % test_error)
print('Test Set Accuracy = ' + str(accuracy))

Test Error = 0.145833
Test Set Accuracy = 0.8541666666666666


# Protein Data


In [82]:
# Download SAMPL.csv from the OpenDrugAI/AttentiveFP GitHub repository and
# save it as sample_data/SAMPL.csv (-O sets the output path).
!wget https://raw.githubusercontent.com/OpenDrugAI/AttentiveFP/master/data/SAMPL.csv -O sample_data/SAMPL.csv

--2023-03-21 00:45:28--  https://raw.githubusercontent.com/OpenDrugAI/AttentiveFP/master/data/SAMPL.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 32060 (31K) [text/plain]
Saving to: ‘sample_data/SAMPL.csv’


2023-03-21 00:45:28 (114 MB/s) - ‘sample_data/SAMPL.csv’ saved [32060/32060]



In [113]:
# Read SAMPL.csv as comma-delimited text, taking column names from the first
# row. Note that this rebinds `df`, replacing the Iris DataFrame loaded earlier.
df = (
    spark.read
    .option('header', True)
    .option('delimiter', ',')
    .csv('sample_data/SAMPL.csv')
)

In [114]:
# Preview the SAMPL rows (columns: iupac, smiles, expt, calc).
# NOTE(review): the read above does not request schema inference, so expt and
# calc are presumably loaded as strings; confirm and cast before numeric use.
df.show()

+--------------------+--------------------+------+------+
|               iupac|              smiles|  expt|  calc|
+--------------------+--------------------+------+------+
|4-methoxy-N,N-dim...|CN(C)C(=O)c1ccc(c...|-11.01|-9.625|
|methanesulfonyl c...|        CS(=O)(=O)Cl| -4.87|-6.219|
|   3-methylbut-1-ene|            CC(C)C=C|  1.83| 2.452|
|     2-ethylpyrazine|          CCc1cnccn1| -5.45|-5.809|
|         heptan-1-ol|            CCCCCCCO| -4.21|-2.917|
|  3,5-dimethylphenol|     Cc1cc(cc(c1)O)C| -6.27|-5.444|
|  2,3-dimethylbutane|          CC(C)C(C)C|  2.34| 2.468|
| 2-methylpentan-2-ol|         CCCC(C)(C)O| -3.92|-2.779|
|1,2-dimethylcyclo...|C[C@@H]1CCCC[C@@H]1C|  1.58| 1.685|
|          butan-2-ol|         CC[C@H](C)O| -4.62|-3.145|
|      dibromomethane|             C(Br)Br| -1.96|-0.405|
| 2-methylpentan-3-ol|     CC[C@H](C(C)C)O| -3.88|-2.416|
|     2-ethylpyridine|          CCc1ccccn1| -4.33| -3.31|
|    ethyl pentanoate|        CCCCC(=O)OCC| -2.49| -3.11|
|        benze