# **Installation**








In [61]:
#Installation
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
!pip install -q findspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop3.2"

In [62]:
#spark
import findspark
findspark.init()

In [63]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("NumericalProcessing").getOrCreate()

# **Load Dataset**

In [64]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [65]:
data = spark.read.csv("/content/drive/MyDrive/Study/Big Data & IOT/mobile_recommendation_system_dataset.csv", header=True, inferSchema=True)
data.show()

+--------------------+-------+-------+--------------------+--------------------+
|                name|ratings|  price|              imgURL|              corpus|
+--------------------+-------+-------+--------------------+--------------------+
|REDMI Note 12 Pro...|    4.2|  23999|https://rukminim2...|Storage128 GBRAM6...|
|OPPO F11 Pro (Aur...|    4.5|₹20,999|https://rukminim2...|Storage128 GBRAM6...|
|REDMI Note 11 (St...|    4.2|  13149|https://rukminim2...|Storage64 GBRAM4 ...|
|OnePlus Nord CE 5...|    4.1|  21999|https://rukminim2...|Storage256 GBRAM1...|
|APPLE iPhone 13 m...|    4.6|   3537|https://rukminim2...|Storage128  Syste...|
|APPLE iPhone 6s P...|    4.5|   5537|https://rukminim2...|Storage64  System...|
|REDMI 10 Power (S...|    4.2|  18996|https://rukminim2...|Storage128 GBRAM8...|
|Infinix Hot 11 (E...|    4.3|   9999|https://rukminim2...|Storage64 GBRAM4 ...|
|Infinix Note 10 P...|    4.2|  16999|https://rukminim2...|Storage256 GBRAM8...|
|SAMSUNG Galaxy A0...|    4.

Column details

In [68]:
colNames = data.columns
colNames

['name', 'ratings', 'price', 'imgURL', 'corpus']

In [69]:
#Printing information about each column
data.printSchema()

root
 |-- name: string (nullable = true)
 |-- ratings: string (nullable = true)
 |-- price: string (nullable = true)
 |-- imgURL: string (nullable = true)
 |-- corpus: string (nullable = true)



In [70]:
data.dtypes

[('name', 'string'),
 ('ratings', 'string'),
 ('price', 'string'),
 ('imgURL', 'string'),
 ('corpus', 'string')]

Drop a column

In [71]:
data=data.drop("imgURl")

In [72]:
data.show()

+--------------------+-------+-------+--------------------+
|                name|ratings|  price|              corpus|
+--------------------+-------+-------+--------------------+
|REDMI Note 12 Pro...|    4.2|  23999|Storage128 GBRAM6...|
|OPPO F11 Pro (Aur...|    4.5|₹20,999|Storage128 GBRAM6...|
|REDMI Note 11 (St...|    4.2|  13149|Storage64 GBRAM4 ...|
|OnePlus Nord CE 5...|    4.1|  21999|Storage256 GBRAM1...|
|APPLE iPhone 13 m...|    4.6|   3537|Storage128  Syste...|
|APPLE iPhone 6s P...|    4.5|   5537|Storage64  System...|
|REDMI 10 Power (S...|    4.2|  18996|Storage128 GBRAM8...|
|Infinix Hot 11 (E...|    4.3|   9999|Storage64 GBRAM4 ...|
|Infinix Note 10 P...|    4.2|  16999|Storage256 GBRAM8...|
|SAMSUNG Galaxy A0...|    4.0|  12999|Storage128 GBRAM4...|
|vivo Z1Pro (Sonic...|    4.5|  15990|Storage64 GBRAM4 ...|
|Moto C Plus (Pear...|    4.2|   6999|Storage16 GBRAM2 ...|
|OPPO A15 (Dynamic...|    4.3|₹10,990|Storage32 GBRAM3 ...|
|SAMSUNG Galaxy Z ...|    3.7|  24463|St

Renamed a column

In [73]:
data = data.withColumnRenamed('corpus', 'Details')
data.show()

+--------------------+-------+-------+--------------------+
|                name|ratings|  price|             Details|
+--------------------+-------+-------+--------------------+
|REDMI Note 12 Pro...|    4.2|  23999|Storage128 GBRAM6...|
|OPPO F11 Pro (Aur...|    4.5|₹20,999|Storage128 GBRAM6...|
|REDMI Note 11 (St...|    4.2|  13149|Storage64 GBRAM4 ...|
|OnePlus Nord CE 5...|    4.1|  21999|Storage256 GBRAM1...|
|APPLE iPhone 13 m...|    4.6|   3537|Storage128  Syste...|
|APPLE iPhone 6s P...|    4.5|   5537|Storage64  System...|
|REDMI 10 Power (S...|    4.2|  18996|Storage128 GBRAM8...|
|Infinix Hot 11 (E...|    4.3|   9999|Storage64 GBRAM4 ...|
|Infinix Note 10 P...|    4.2|  16999|Storage256 GBRAM8...|
|SAMSUNG Galaxy A0...|    4.0|  12999|Storage128 GBRAM4...|
|vivo Z1Pro (Sonic...|    4.5|  15990|Storage64 GBRAM4 ...|
|Moto C Plus (Pear...|    4.2|   6999|Storage16 GBRAM2 ...|
|OPPO A15 (Dynamic...|    4.3|₹10,990|Storage32 GBRAM3 ...|
|SAMSUNG Galaxy Z ...|    3.7|  24463|St

Get the length of a column

In [74]:
from pyspark.sql.functions import length

In [75]:
data=data.withColumn("length",length(data["Details"]))

In [76]:
data.show()

+--------------------+-------+-------+--------------------+------+
|                name|ratings|  price|             Details|length|
+--------------------+-------+-------+--------------------+------+
|REDMI Note 12 Pro...|    4.2|  23999|Storage128 GBRAM6...|   421|
|OPPO F11 Pro (Aur...|    4.5|₹20,999|Storage128 GBRAM6...|   677|
|REDMI Note 11 (St...|    4.2|  13149|Storage64 GBRAM4 ...|   133|
|OnePlus Nord CE 5...|    4.1|  21999|Storage256 GBRAM1...|   232|
|APPLE iPhone 13 m...|    4.6|   3537|Storage128  Syste...|   602|
|APPLE iPhone 6s P...|    4.5|   5537|Storage64  System...|   496|
|REDMI 10 Power (S...|    4.2|  18996|Storage128 GBRAM8...|   133|
|Infinix Hot 11 (E...|    4.3|   9999|Storage64 GBRAM4 ...|   384|
|Infinix Note 10 P...|    4.2|  16999|Storage256 GBRAM8...|   525|
|SAMSUNG Galaxy A0...|    4.0|  12999|Storage128 GBRAM4...|   331|
|vivo Z1Pro (Sonic...|    4.5|  15990|Storage64 GBRAM4 ...|   371|
|Moto C Plus (Pear...|    4.2|   6999|Storage16 GBRAM2 ...|   

Selected Column show

In [77]:
data.select('name', 'Details').show(10)

+--------------------+--------------------+
|                name|             Details|
+--------------------+--------------------+
|REDMI Note 12 Pro...|Storage128 GBRAM6...|
|OPPO F11 Pro (Aur...|Storage128 GBRAM6...|
|REDMI Note 11 (St...|Storage64 GBRAM4 ...|
|OnePlus Nord CE 5...|Storage256 GBRAM1...|
|APPLE iPhone 13 m...|Storage128  Syste...|
|APPLE iPhone 6s P...|Storage64  System...|
|REDMI 10 Power (S...|Storage128 GBRAM8...|
|Infinix Hot 11 (E...|Storage64 GBRAM4 ...|
|Infinix Note 10 P...|Storage256 GBRAM8...|
|SAMSUNG Galaxy A0...|Storage128 GBRAM4...|
+--------------------+--------------------+
only showing top 10 rows



In [78]:
data.select('name', 'price').show()

+--------------------+-------+
|                name|  price|
+--------------------+-------+
|REDMI Note 12 Pro...|  23999|
|OPPO F11 Pro (Aur...|₹20,999|
|REDMI Note 11 (St...|  13149|
|OnePlus Nord CE 5...|  21999|
|APPLE iPhone 13 m...|   3537|
|APPLE iPhone 6s P...|   5537|
|REDMI 10 Power (S...|  18996|
|Infinix Hot 11 (E...|   9999|
|Infinix Note 10 P...|  16999|
|SAMSUNG Galaxy A0...|  12999|
|vivo Z1Pro (Sonic...|  15990|
|Moto C Plus (Pear...|   6999|
|OPPO A15 (Dynamic...|₹10,990|
|SAMSUNG Galaxy Z ...|  24463|
|realme GT 2 Pro (...|   7537|
|OnePlus 9 Pro 5G ...|  15537|
|APPLE iPhone 14 P...|  24463|
|                A10E|   1299|
|Xiaomi 11T Pro 5G...|  31999|
|REDMI 10 Prime 20...|  10990|
+--------------------+-------+
only showing top 20 rows



In [79]:
data.select('name','ratings', 'price').show(10)

+--------------------+-------+-------+
|                name|ratings|  price|
+--------------------+-------+-------+
|REDMI Note 12 Pro...|    4.2|  23999|
|OPPO F11 Pro (Aur...|    4.5|₹20,999|
|REDMI Note 11 (St...|    4.2|  13149|
|OnePlus Nord CE 5...|    4.1|  21999|
|APPLE iPhone 13 m...|    4.6|   3537|
|APPLE iPhone 6s P...|    4.5|   5537|
|REDMI 10 Power (S...|    4.2|  18996|
|Infinix Hot 11 (E...|    4.3|   9999|
|Infinix Note 10 P...|    4.2|  16999|
|SAMSUNG Galaxy A0...|    4.0|  12999|
+--------------------+-------+-------+
only showing top 10 rows



Null Value

In [80]:
#Cheking for null values
for col in data.columns:
    print(col.ljust(10), data.filter(data[col].isNull()).count())

name       0
ratings    31
price      39
Details    51
length     51


In [81]:
#Null Value drop

data.na.drop().show()

+--------------------+-------+-------+--------------------+------+
|                name|ratings|  price|             Details|length|
+--------------------+-------+-------+--------------------+------+
|REDMI Note 12 Pro...|    4.2|  23999|Storage128 GBRAM6...|   421|
|OPPO F11 Pro (Aur...|    4.5|₹20,999|Storage128 GBRAM6...|   677|
|REDMI Note 11 (St...|    4.2|  13149|Storage64 GBRAM4 ...|   133|
|OnePlus Nord CE 5...|    4.1|  21999|Storage256 GBRAM1...|   232|
|APPLE iPhone 13 m...|    4.6|   3537|Storage128  Syste...|   602|
|APPLE iPhone 6s P...|    4.5|   5537|Storage64  System...|   496|
|REDMI 10 Power (S...|    4.2|  18996|Storage128 GBRAM8...|   133|
|Infinix Hot 11 (E...|    4.3|   9999|Storage64 GBRAM4 ...|   384|
|Infinix Note 10 P...|    4.2|  16999|Storage256 GBRAM8...|   525|
|SAMSUNG Galaxy A0...|    4.0|  12999|Storage128 GBRAM4...|   331|
|vivo Z1Pro (Sonic...|    4.5|  15990|Storage64 GBRAM4 ...|   371|
|Moto C Plus (Pear...|    4.2|   6999|Storage16 GBRAM2 ...|   

Data Count

In [82]:
data.count(), len(data.columns)

(2593, 5)

In [83]:
data.select("name").distinct().show()

+--------------------+
|                name|
+--------------------+
|vivo T2x 5G (Auro...|
|APPLE iPhone 8 (S...|
|MOTOROLA g22 (Cos...|
|SAMSUNG Galaxy M3...|
|APPLE iPhone 7 (R...|
|Redmi 8A (Midnigh...|
|SAMSUNG Galaxy M1...|
|OPPO A83 (2018 Ed...|
|vivo X50 (Frost B...|
|MOTOROLA g13 (Mat...|
|APPLE iPhone 14 P...|
|SAMSUNG Galaxy M1...|
|APPLE iPhone X (S...|
|        Motorola a50|
|APPLE iPhone 7 (B...|
|OPPO F17 Pro (Met...|
|vivo V25 pro (Pur...|
|APPLE iPhone 5s (...|
|realme C15 (Power...|
|vivo T2x 5G (Mari...|
+--------------------+
only showing top 20 rows



In [84]:
data.select("ratings").distinct().show()

+--------------------+
|             ratings|
+--------------------+
|                 3.1|
|            256 GB)"|
|                 4.2|
|                 4.4|
|                 3.8|
|                 2.9|
|                 4.5|
|                null|
|                 3.3|
|                 4.3|
|                 3.5|
|                 4.8|
|                 4.1|
|                 4.6|
|                 5.0|
|             YouTube|
|                 4.0|
| our next-generat...|
|                 3.6|
|    an astonishing 1|
+--------------------+
only showing top 20 rows



**categorical** **column**

In [85]:
from pyspark.sql import functions as F
from pyspark.sql import types as T

# StringIndexer is similar to labelencoder which gives a label to each category
# OneHotEncoder created onehot encoding vector
from pyspark.ml.feature import StringIndexer, OneHotEncoder

# VectorAssembler is used to create vector from the features. MOdeling takes vector as an input
from pyspark.ml.feature import VectorAssembler

# DecisionTreeClassifier is used for classiication problems
from pyspark.ml.classification import DecisionTreeClassifier

In [86]:
# Create a categorical column for explanation purpose
data = data.withColumn("price", F.when(F.col("price") > 25000, "High").otherwise("Low"))
data.show(5, truncate=False)

data.groupby("price").count().show(), data.select("ratings").distinct().show()

+-----------------------------------------+-------+-----+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+
|name                                     |ratings|price|Details                                                                                                                                                                                                 

(None, None)

In [87]:
(train_df, test_df) = data.randomSplit([0.8, 0.2], 11)
print("Number of train samples: " + str(train_df.count()))
print("Number of test samples: " + str(test_df.count()))

Number of train samples: 2086
Number of test samples: 507


# **Tokenization**  

In [88]:
from pyspark.ml.feature import Tokenizer

tokenizer = Tokenizer(inputCol="name", outputCol="Tokenized_Name")
tokenized_data = tokenizer.transform(data)
tokenized_data.show()

+--------------------+-------+-----+--------------------+------+--------------------+
|                name|ratings|price|             Details|length|      Tokenized_Name|
+--------------------+-------+-----+--------------------+------+--------------------+
|REDMI Note 12 Pro...|    4.2|  Low|Storage128 GBRAM6...|   421|[redmi, note, 12,...|
|OPPO F11 Pro (Aur...|    4.5|  Low|Storage128 GBRAM6...|   677|[oppo, f11, pro, ...|
|REDMI Note 11 (St...|    4.2|  Low|Storage64 GBRAM4 ...|   133|[redmi, note, 11,...|
|OnePlus Nord CE 5...|    4.1|  Low|Storage256 GBRAM1...|   232|[oneplus, nord, c...|
|APPLE iPhone 13 m...|    4.6|  Low|Storage128  Syste...|   602|[apple, iphone, 1...|
|APPLE iPhone 6s P...|    4.5|  Low|Storage64  System...|   496|[apple, iphone, 6...|
|REDMI 10 Power (S...|    4.2|  Low|Storage128 GBRAM8...|   133|[redmi, 10, power...|
|Infinix Hot 11 (E...|    4.3|  Low|Storage64 GBRAM4 ...|   384|[infinix, hot, 11...|
|Infinix Note 10 P...|    4.2|  Low|Storage256 GBRAM8.

**Stop** **word** **removal**

In [89]:
from pyspark.ml.feature import StopWordsRemover

remover = StopWordsRemover(inputCol="Tokenized_Name", outputCol="StopWordsRemover_Name")
filtered_data = remover.transform(tokenized_data)
filtered_data.show()

+--------------------+-------+-----+--------------------+------+--------------------+---------------------+
|                name|ratings|price|             Details|length|      Tokenized_Name|StopWordsRemover_Name|
+--------------------+-------+-----+--------------------+------+--------------------+---------------------+
|REDMI Note 12 Pro...|    4.2|  Low|Storage128 GBRAM6...|   421|[redmi, note, 12,...| [redmi, note, 12,...|
|OPPO F11 Pro (Aur...|    4.5|  Low|Storage128 GBRAM6...|   677|[oppo, f11, pro, ...| [oppo, f11, pro, ...|
|REDMI Note 11 (St...|    4.2|  Low|Storage64 GBRAM4 ...|   133|[redmi, note, 11,...| [redmi, note, 11,...|
|OnePlus Nord CE 5...|    4.1|  Low|Storage256 GBRAM1...|   232|[oneplus, nord, c...| [oneplus, nord, c...|
|APPLE iPhone 13 m...|    4.6|  Low|Storage128  Syste...|   602|[apple, iphone, 1...| [apple, iphone, 1...|
|APPLE iPhone 6s P...|    4.5|  Low|Storage64  System...|   496|[apple, iphone, 6...| [apple, iphone, 6...|
|REDMI 10 Power (S...|    4.

**Text Cleaning**

In [90]:
from pyspark.sql.functions import regexp_replace, lower, col

cleaned_data = filtered_data.withColumn("Cleaned_Name", lower(regexp_replace(col("name"), "[^a-zA-Z\\s]", "")))

cleaned_data.show()

+--------------------+-------+-----+--------------------+------+--------------------+---------------------+--------------------+
|                name|ratings|price|             Details|length|      Tokenized_Name|StopWordsRemover_Name|        Cleaned_Name|
+--------------------+-------+-----+--------------------+------+--------------------+---------------------+--------------------+
|REDMI Note 12 Pro...|    4.2|  Low|Storage128 GBRAM6...|   421|[redmi, note, 12,...| [redmi, note, 12,...|redmi note  pro g...|
|OPPO F11 Pro (Aur...|    4.5|  Low|Storage128 GBRAM6...|   677|[oppo, f11, pro, ...| [oppo, f11, pro, ...|oppo f pro aurora...|
|REDMI Note 11 (St...|    4.2|  Low|Storage64 GBRAM4 ...|   133|[redmi, note, 11,...| [redmi, note, 11,...|redmi note  starb...|
|OnePlus Nord CE 5...|    4.1|  Low|Storage256 GBRAM1...|   232|[oneplus, nord, c...| [oneplus, nord, c...|oneplus nord ce g...|
|APPLE iPhone 13 m...|    4.6|  Low|Storage128  Syste...|   602|[apple, iphone, 1...| [apple, iph

# **Text** **Analysis**

In [91]:
from pyspark.sql.functions import length

data_with_length = cleaned_data.withColumn("text_length", length(col("name")))
avg_length = data_with_length.agg({"text_length": "avg"}).collect()[0][0]
avg_length

35.73736984188199

In [92]:
from pyspark.sql.functions import length

data_with_length = cleaned_data.withColumn("text_length", length(col("cleaned_Name")))
avg_length = data_with_length.agg({"text_length": "avg"}).collect()[0][0]
avg_length

28.316621673736986

In [93]:
#Converting dataset into pandas to observe the data in a more formatted way
data.limit(5580).toPandas()

Unnamed: 0,name,ratings,price,Details,length
0,"REDMI Note 12 Pro 5G (Onyx Black, 128 GB)",4.2,Low,Storage128 GBRAM6 SystemAndroid 12Processor T...,421.0
1,"OPPO F11 Pro (Aurora Green, 128 GB)",4.5,Low,Storage128 GBRAM6 GBExpandable Storage256GB S...,677.0
2,"REDMI Note 11 (Starburst White, 64 GB)",4.2,Low,Storage64 GBRAM4 SystemAndroid 11Processor Sp...,133.0
3,"OnePlus Nord CE 5G (Blue Void, 256 GB)",4.1,Low,Storage256 GBRAM12 SystemAndroid Q 11Processo...,232.0
4,"APPLE iPhone 13 mini (Blue, 128 GB)",4.6,Low,Storage128 SystemiOS 15Processor TypeA15 Bion...,602.0
...,...,...,...,...,...
2588,"REDMI Note 9 (Shadow Black, 64 GB)",4.3,Low,Storage64 GBRAM4 GBExpandable Storage512GB Sy...,576.0
2589,"OnePlus 9 5G (Astral Black, 128 GB)",3.9,High,Storage128 GBRAM8 SystemAndroid 11Processor S...,134.0
2590,Motorola a50,4.1,Low,,
2591,"SAMSUNG Galaxy S22 Ultra 5G (Phantom Black, 25...",4.3,Low,Storage256 GBRAM12 SystemAndroid 12Processor ...,301.0


In [94]:
#To know the shape of the dataframe
print('Number of rows: \t', data.count())
print('Number of columns: \t', len(data.columns))

Number of rows: 	 2593
Number of columns: 	 5


In [95]:
import pyspark.sql.functions as F

# Replace the values of 0,1 in the "Category" column
data2 = data.withColumn("ratings", F.when(data["ratings"] == "rate", 0).otherwise(1))

In [96]:
data2.show()

+--------------------+-------+-----+--------------------+------+
|                name|ratings|price|             Details|length|
+--------------------+-------+-----+--------------------+------+
|REDMI Note 12 Pro...|      1|  Low|Storage128 GBRAM6...|   421|
|OPPO F11 Pro (Aur...|      1|  Low|Storage128 GBRAM6...|   677|
|REDMI Note 11 (St...|      1|  Low|Storage64 GBRAM4 ...|   133|
|OnePlus Nord CE 5...|      1|  Low|Storage256 GBRAM1...|   232|
|APPLE iPhone 13 m...|      1|  Low|Storage128  Syste...|   602|
|APPLE iPhone 6s P...|      1|  Low|Storage64  System...|   496|
|REDMI 10 Power (S...|      1|  Low|Storage128 GBRAM8...|   133|
|Infinix Hot 11 (E...|      1|  Low|Storage64 GBRAM4 ...|   384|
|Infinix Note 10 P...|      1|  Low|Storage256 GBRAM8...|   525|
|SAMSUNG Galaxy A0...|      1|  Low|Storage128 GBRAM4...|   331|
|vivo Z1Pro (Sonic...|      1|  Low|Storage64 GBRAM4 ...|   371|
|Moto C Plus (Pear...|      1|  Low|Storage16 GBRAM2 ...|   261|
|OPPO A15 (Dynamic...|   

In [97]:
data3 = data2.limit(10).toPandas()
data3

Unnamed: 0,name,ratings,price,Details,length
0,"REDMI Note 12 Pro 5G (Onyx Black, 128 GB)",1,Low,Storage128 GBRAM6 SystemAndroid 12Processor T...,421
1,"OPPO F11 Pro (Aurora Green, 128 GB)",1,Low,Storage128 GBRAM6 GBExpandable Storage256GB S...,677
2,"REDMI Note 11 (Starburst White, 64 GB)",1,Low,Storage64 GBRAM4 SystemAndroid 11Processor Sp...,133
3,"OnePlus Nord CE 5G (Blue Void, 256 GB)",1,Low,Storage256 GBRAM12 SystemAndroid Q 11Processo...,232
4,"APPLE iPhone 13 mini (Blue, 128 GB)",1,Low,Storage128 SystemiOS 15Processor TypeA15 Bion...,602
5,"APPLE iPhone 6s Plus (Gold, 64 GB)",1,Low,Storage64 SystemiOS 9Processor TypeA9 Chip 12...,496
6,"REDMI 10 Power (Sporty Orange, 128 GB)",1,Low,Storage128 GBRAM8 SystemAndroid 13Processor S...,133
7,"Infinix Hot 11 (Emerald Green, 64 GB)",1,Low,Storage64 GBRAM4 GBExpandable Storage256GB Sy...,384
8,"Infinix Note 10 Pro (Nordic Secret, 256 GB)",1,Low,Storage256 GBRAM8 GBExpandable Storage256GB S...,525
9,"SAMSUNG Galaxy A04 (Green, 128 GB)",1,Low,Storage128 GBRAM4 GBTotal Memory128GB SystemA...,331


In [98]:
data2.withColumn("length",length(data["Details"])).show()

+--------------------+-------+-----+--------------------+------+
|                name|ratings|price|             Details|length|
+--------------------+-------+-----+--------------------+------+
|REDMI Note 12 Pro...|      1|  Low|Storage128 GBRAM6...|   421|
|OPPO F11 Pro (Aur...|      1|  Low|Storage128 GBRAM6...|   677|
|REDMI Note 11 (St...|      1|  Low|Storage64 GBRAM4 ...|   133|
|OnePlus Nord CE 5...|      1|  Low|Storage256 GBRAM1...|   232|
|APPLE iPhone 13 m...|      1|  Low|Storage128  Syste...|   602|
|APPLE iPhone 6s P...|      1|  Low|Storage64  System...|   496|
|REDMI 10 Power (S...|      1|  Low|Storage128 GBRAM8...|   133|
|Infinix Hot 11 (E...|      1|  Low|Storage64 GBRAM4 ...|   384|
|Infinix Note 10 P...|      1|  Low|Storage256 GBRAM8...|   525|
|SAMSUNG Galaxy A0...|      1|  Low|Storage128 GBRAM4...|   331|
|vivo Z1Pro (Sonic...|      1|  Low|Storage64 GBRAM4 ...|   371|
|Moto C Plus (Pear...|      1|  Low|Storage16 GBRAM2 ...|   261|
|OPPO A15 (Dynamic...|   

In [99]:
data2.groupBy('price').mean().show(10)

+-----+------------+-----------------+
|price|avg(ratings)|      avg(length)|
+-----+------------+-----------------+
| High|         1.0|370.7035175879397|
|  Low|         1.0|368.9202425373134|
+-----+------------+-----------------+



In [100]:
data2.groupBy('name').mean().show(10)

+--------------------+------------+-----------+
|                name|avg(ratings)|avg(length)|
+--------------------+------------+-----------+
|vivo T2x 5G (Auro...|         1.0|      239.0|
|APPLE iPhone 8 (S...|         1.0|      570.0|
|MOTOROLA g22 (Cos...|         1.0|      373.0|
|SAMSUNG Galaxy M3...|         1.0|      193.5|
|APPLE iPhone 7 (R...|         1.0|      490.0|
|Redmi 8A (Midnigh...|         1.0|      342.0|
|SAMSUNG Galaxy M1...|         1.0|      289.0|
|OPPO A83 (2018 Ed...|         1.0|      515.0|
|vivo X50 (Frost B...|         1.0|      410.0|
|MOTOROLA g13 (Mat...|         1.0|      395.0|
+--------------------+------------+-----------+
only showing top 10 rows



In [101]:
data.describe().show()

+-------+--------------------+--------------------+-----+--------------------+------------------+
|summary|                name|             ratings|price|             Details|            length|
+-------+--------------------+--------------------+-----+--------------------+------------------+
|  count|                2593|                2562| 2593|                2542|              2542|
|   mean|                null|   4.295951257861616| null|                null|369.19944925255703|
| stddev|                null| 0.21470544066435493| null|                null| 164.2804853734887|
|    min|"OPPO F23 5G | 64...| 102.0 cm2 (~82.2...| High| and more true-to...|                50|
|    max|x 1080Resolution ...|                 5.0|  Low|https://rukminim2...|              1031|
+-------+--------------------+--------------------+-----+--------------------+------------------+



In [102]:
filter_res=data.filter(col("ratings")>4.5)
filter_res.show()

+--------------------+-------+-----+--------------------+------+
|                name|ratings|price|             Details|length|
+--------------------+-------+-----+--------------------+------+
|APPLE iPhone 13 m...|    4.6|  Low|Storage128  Syste...|   602|
|APPLE iPhone 14 P...|    4.6|  Low|Storage256  Syste...|   523|
|OPPO Reno10 5G (I...|    4.6| High|Storage256 System...|   432|
|APPLE iPhone 14 P...|    4.7|  Low|Storage256  Syste...|   557|
|               a 10e|    4.6|  Low|Storage100 KBRAM1...|   151|
|APPLE iPhone 14 P...|    4.7| High|Storage512  Syste...|   557|
|APPLE iPhone 13 P...|    4.6|  Low|Storage1024  Syst...|   671|
|APPLE iPhone 14 P...|    4.7|  Low|Storage128  Syste...|   557|
|APPLE iPhone 12 (...|    4.6|  Low|Storage128  Syste...|   594|
|APPLE iPhone SE (...|    4.7| High|Storage128  Syste...|   490|
|APPLE iPhone XS (...|    4.6| High|Storage256  Syste...|   696|
|APPLE iPhone 5C (...|    4.7|  Low|Storage16  System...|   320|
|APPLE iPhone SE 3...|   

In [103]:
filter_res = data.filter(col("price")>30000)
filter_res.show()

+----+-------+-----+-------+------+
|name|ratings|price|Details|length|
+----+-------+-----+-------+------+
+----+-------+-----+-------+------+



In [104]:
filter_res = data.filter(col("price")<30000)
filter_res.show()

+----+-------+-----+-------+------+
|name|ratings|price|Details|length|
+----+-------+-----+-------+------+
+----+-------+-----+-------+------+



In [105]:
filter_res = data.filter((col("price")<=38000) &
                         (col("price")>=32000))
filter_res.show(10)

+----+-------+-----+-------+------+
|name|ratings|price|Details|length|
+----+-------+-----+-------+------+
+----+-------+-----+-------+------+



# **Prepare** **Data**

In [106]:
from sklearn.model_selection import train_test_split
# Assuming 'cleaned_data' is PySpark DataFrame with a 'cleaned_Name' column
pandas_df = cleaned_data.select('cleaned_Name', 'ratings').toPandas()#convert to pandas

# Extracting features (X) and labels (Y)
X = pandas_df['cleaned_Name']
Y = pandas_df['ratings']

# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=3)

In [107]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier

In [108]:
svc = SVC(kernel='sigmoid', gamma=1.0)
knc = KNeighborsClassifier()
mnb = MultinomialNB()
lrc = LogisticRegression(solver='liblinear', penalty='l1')
etc = ExtraTreesClassifier(n_estimators=50, random_state=2)

In [109]:
clfs = {
    'SVC' : svc,
    'KN' : knc,
    'NB': mnb,
    'LR': lrc,
    'ETC': etc,
}

In [None]:
#SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score

def train_classifier(clf, X_train, Y_train, X_test, Y_test):
    tfidf_vectorizer = TfidfVectorizer()
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
    X_test_tfidf = tfidf_vectorizer.transform(X_test)

    clf.fit(X_train_tfidf, Y_train)
    y_pred = clf.predict(X_test_tfidf)

    accuracy = accuracy_score(Y_test, y_pred)
    precision = precision_score(Y_test, y_pred)

    return accuracy, precision

# Assuming 'svc' is your Support Vector Classifier instance
svc = SVC()

accuracy, precision = train_classifier(svc, X_train, Y_train, X_test, Y_test)
print("Accuracy:", accuracy)
print("Precision:", precision)

In [None]:
accuracy_scores = []
precision_scores = []

for name,clf in clfs.items():

    current_accuracy,current_precision = train_classifier(clf, X_train, Y_train, X_test, Y_test)

    print("For ",name)
    print("Accuracy - ",current_accuracy)
    print("Precision - ",current_precision)

    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)

In [111]:
from pyspark.sql.functions import avg, sum, max
avg_res = data.select(avg("ratings")).first()[0]
sum_res = data.select(sum("ratings")).first()[0]
max_res = data.select(max("ratings")).first()[0]
print(avg_res, sum_res, max_res)

4.295951257861616 10928.899999999952 5.0


In [112]:
from pyspark.sql.functions import avg, sum, max
avg_res = data.select(avg("price")).first()[0]
sum_res = data.select(sum("price")).first()[0]
max_res = data.select(max("price")).first()[0]
print(avg_res, sum_res, max_res)

None None Low


In [113]:
from pyspark.sql.functions import mean, stddev
mean_res = data.select(mean("price")).first()[0]
std_res = data.select(stddev("price")).first()[0]
print(mean_res, std_res)

None None


In [58]:
sorted_by_price = data.orderBy("price")
sorted_by_price.show(5)

+--------------------+-------+-----+--------------------+------+
|                name|ratings|price|             Details|length|
+--------------------+-------+-----+--------------------+------+
|Xiaomi 11T Pro 5G...|    4.0| High|Storage256 GBRAM8...|   147|
|OPPO Reno10 5G (S...|    4.2| High|Storage256 GBRAM8...|   440|
|OnePlus 9R 5G (Ca...|    3.7| High|Storage128 GBRAM8...|   137|
|OPPO Reno10 5G (I...|    4.6| High|Storage256 System...|   432|
|OnePlus 9RT 5G (N...|    4.3| High|Storage128 GBRAM8...|   134|
+--------------------+-------+-----+--------------------+------+
only showing top 5 rows



In [59]:
import joblib

# Assuming 'clf' is your trained StackingClassifier
# Also, 'Spam_Email_Classification_model.pkl' is the name you choose for your saved model file
joblib.dump(clf, 'Mobile Recommendation System.pkl')

['Mobile Recommendation System.pkl']