# **Iris Dataset**

In [None]:
!pip install pyspark


Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=15a26d81a9999ccef12d001be7fe6dfd4cc7f45f66161dce1a6b59ab0899e052
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Create (or reuse) the SparkSession used by the Iris example
spark = SparkSession.builder.appName("IrisDataset").getOrCreate()

# Load the dataset: first row is the header, column types are inferred
iris_df = spark.read.csv("/content/Iris.csv", header=True, inferSchema=True)

# Encode the string 'Species' label as the numeric 'SpeciesIndex' column,
# which is what the classifier and evaluator below use as the label
indexer = StringIndexer(inputCol="Species", outputCol="SpeciesIndex")
iris_df = indexer.fit(iris_df).transform(iris_df)

# Combine the four measurement columns into a single 'features' vector column
feature_columns = ["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
iris_df = assembler.transform(iris_df)

# Print the schema to confirm 'SpeciesIndex' and 'features' were added
iris_df.printSchema()

# Preview the assembled features next to their label
iris_df.select("features", "SpeciesIndex").show(5)

# Split the data 70/30 into training and test sets.
# A fixed seed keeps the split (and therefore the reported accuracy)
# reproducible between runs; without it every run samples differently.
train_df, test_df = iris_df.randomSplit([0.7, 0.3], seed=42)

# Configure a logistic regression classifier on the assembled columns
lr = LogisticRegression(featuresCol="features", labelCol="SpeciesIndex")

# Train the model on the training split only
lr_model = lr.fit(train_df)

# Score the held-out test split; adds a 'prediction' column
predictions = lr_model.transform(test_df)

# Preview predictions next to the true label
predictions.select("features", "SpeciesIndex", "prediction").show(5)

# Evaluate: fraction of test rows whose prediction equals SpeciesIndex
evaluator = MulticlassClassificationEvaluator(labelCol="SpeciesIndex", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(f"Model Accuracy: {accuracy:.2f}")


root
 |-- Id: integer (nullable = true)
 |-- SepalLengthCm: double (nullable = true)
 |-- SepalWidthCm: double (nullable = true)
 |-- PetalLengthCm: double (nullable = true)
 |-- PetalWidthCm: double (nullable = true)
 |-- Species: string (nullable = true)
 |-- SpeciesIndex: double (nullable = false)
 |-- features: vector (nullable = true)

+-----------------+------------+
|         features|SpeciesIndex|
+-----------------+------------+
|[5.1,3.5,1.4,0.2]|         0.0|
|[4.9,3.0,1.4,0.2]|         0.0|
|[4.7,3.2,1.3,0.2]|         0.0|
|[4.6,3.1,1.5,0.2]|         0.0|
|[5.0,3.6,1.4,0.2]|         0.0|
+-----------------+------------+
only showing top 5 rows

+-----------------+------------+----------+
|         features|SpeciesIndex|prediction|
+-----------------+------------+----------+
|[4.6,3.4,1.4,0.3]|         0.0|       0.0|
|[4.4,2.9,1.4,0.2]|         0.0|       0.0|
|[5.4,3.7,1.5,0.2]|         0.0|       0.0|
|[4.3,3.0,1.1,0.1]|         0.0|       0.0|
|[5.4,3.9,1.3,0.4]|        

# **Book Recommendation**

In [None]:
# Import the required PySpark modules
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

# Obtain a SparkSession for the recommendation example
# (getOrCreate() hands back an already-running session if one exists)
spark = (
    SparkSession.builder
    .appName('recommedation system')
    .getOrCreate()
)

# Read the ratings CSV: the first row is the header, column types are inferred.
# NOTE(review): the origin of book_ratings.csv is not recorded anywhere in this
# notebook - add the download link next to this cell.
data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('book_ratings.csv')
)

# Preview the first five ratings
data.show(5)



+-------+-------+------+
|book_id|user_id|rating|
+-------+-------+------+
|      1|    314|     5|
|      1|    439|     3|
|      1|    588|     5|
|      1|   1169|     4|
|      1|   1185|     4|
+-------+-------+------+
only showing top 5 rows



In [None]:

# Summary statistics (count, mean, stddev, min, max) for each column of `data`
rating_summary = data.describe()
rating_summary.show()


+-------+-----------------+------------------+------------------+
|summary|          book_id|           user_id|            rating|
+-------+-----------------+------------------+------------------+
|  count|           981756|            981756|            981756|
|   mean|4943.275635697668|25616.759933221696|3.8565335989797873|
| stddev|2873.207414896114|15228.338825882167|0.9839408559620033|
|    min|                1|                 1|                 1|
|    max|            10000|             53424|                 5|
+-------+-----------------+------------------+------------------+



In [None]:
# Randomly split the ratings into 80% training and 20% test data.
# The seed makes the split repeatable between runs.
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)

# Build the recommendation model using ALS on the training data.
# coldStartStrategy="drop": with the default strategy ("nan"), any test row
# whose user or book never appeared in the training split gets a NaN
# prediction, and a single NaN makes the RMSE computed over the predictions
# NaN as well. "drop" removes those rows from the transform() output.
# seed: fixes the random initialisation of the factor matrices.
als = ALS(
    maxIter=5,
    regParam=0.01,
    userCol="user_id",
    itemCol="book_id",
    ratingCol="rating",
    coldStartStrategy="drop",
    seed=42,
)

# Fit the model on the training split only
model = als.fit(train_data)

# Score the held-out test split; adds a 'prediction' column
predictions = model.transform(test_data)

# Display the first predictions next to the true ratings
predictions.show()


+-------+-------+------+----------+
|book_id|user_id|rating|prediction|
+-------+-------+------+----------+
|      1|   6630|     5|  4.483197|
|      1|  10140|     4| 3.6721985|
|      1|  13544|     5| 4.1506157|
|      1|  18361|     4| 4.5025406|
|      1|  21487|     4| 4.2023754|
|      1|  23576|     4| 3.7437992|
|      1|  24326|     5| 4.5372376|
|      1|  25164|     4| 3.9830015|
|      1|  28767|     5| 3.6874263|
|      1|  38475|     4| 4.1759586|
|      1|  51480|     1| 2.0493066|
|   6627|  30914|     4| 3.1556072|
|   6627|  34061|     5|   4.25294|
|   6628|    193|     5|  5.645776|
|   6628|   6218|     4|  2.904181|
|   6628|  34197|     5| 0.9282155|
|      1|   3662|     4| 4.7629504|
|      1|   9246|     1| 3.6527658|
|      1|  18031|     5|  4.397327|
|      1|  20076|     3| 3.6018338|
+-------+-------+------+----------+
only showing top 20 rows



In [None]:
# Compute the root-mean-square error between the true 'rating' and the
# model's 'prediction' over the scored test rows, then print it.
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)

# rmse is a float, so it must be converted before string concatenation
# (the original line also had an unbalanced closing parenthesis).
# A printed value of "nan" means at least one row of `predictions` has a NaN
# prediction.
print("Root-mean-square error = " + str(rmse))


Root-mean-square error = nan


In [None]:
# Take the rows of the test split that belong to user 5461 and keep only the
# (book_id, user_id) pair of each row
user1 = test_data.where(test_data['user_id'] == 5461).select('book_id', 'user_id')

# Preview the selected pairs
user1.show()


+-------+-------+
|book_id|user_id|
+-------+-------+
|      2|   5461|
|      7|   5461|
|     14|   5461|
|     22|   5461|
|     31|   5461|
|     46|   5461|
|     48|   5461|
|     66|   5461|
|     80|   5461|
|    100|   5461|
|    115|   5461|
|    131|   5461|
|    142|   5461|
|    157|   5461|
|    181|   5461|
|    261|   5461|
|    273|   5461|
|    293|   5461|
|    321|   5461|
|    339|   5461|
+-------+-------+
only showing top 20 rows



In [None]:
# Score user1's (book_id, user_id) pairs with the already-fitted model;
# transform() only adds a 'prediction' column - no training happens here
recommendations = model.transform(user1)

# Show the books for user1, highest predicted rating first
recommendations.sort('prediction', ascending=False).show()


+-------+-------+----------+
|book_id|user_id|prediction|
+-------+-------+----------+
|    100|   5461|  4.865813|
|    357|   5461| 4.6411304|
|     48|   5461|  4.625593|
|    157|   5461|  4.607937|
|   1597|   5461| 4.5412784|
|    876|   5461| 4.5286922|
|     14|   5461|  4.511103|
|     80|   5461|  4.470908|
|      7|   5461|  4.417745|
|    115|   5461| 4.3871927|
|    293|   5461| 4.3870845|
|     31|   5461| 4.3255386|
|    131|   5461| 4.3050146|
|     46|   5461| 4.2703314|
|    401|   5461|  4.263187|
|    733|   5461|  4.255295|
|   9063|   5461| 4.2508397|
|     66|   5461|  4.246348|
|    321|   5461|  4.244096|
|    339|   5461|  4.183666|
+-------+-------+----------+
only showing top 20 rows



In [None]:
# Stop the SparkSession at the end of the notebook
spark.stop()
