### Analyse search terms on the e-commerce web server


#### Download the search term data set for the e-commerce web server and run analytic queries on it.


In [None]:
# Install Spark (the pyspark and findspark packages)

In [1]:
!pip install pyspark
!pip install findspark



In [None]:
# Start a Spark session

In [2]:
import findspark
findspark.init()  # called before the pyspark imports, same order as the original cell

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

# Get the Spark context. getOrCreate() reuses a context that is already
# running, so this cell can be re-executed safely; a bare SparkContext()
# raises when a context already exists in the process.
sc = SparkContext.getOrCreate()

# Get (or create) the Spark session used by the rest of the notebook.
# NOTE(review): the application name does not match this notebook's topic
# (search-term analysis) and looks copied from another notebook - confirm
# whether it should be renamed.
spark = (
    SparkSession.builder
    .appName("Saving and Loading a SparkML Model")
    .getOrCreate()
)

25/08/01 03:44:19 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/01 03:44:21 WARN util.Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [None]:
# Download the search term dataset from the URL below
# https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/searchterms.csv

In [3]:
# NOTE(review): VectorAssembler and LinearRegression are not referenced by any
# other cell visible in this notebook - confirm whether these imports are needed.
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

# Fetch the search-term dataset and save it as searchterms.csv in the
# current working directory (requires the wget command-line tool).
!wget -O searchterms.csv https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/searchterms.csv


--2025-08-01 03:45:15--  https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/searchterms.csv
Resolving cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)... 169.63.118.104, 169.63.118.104
Connecting to cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)|169.63.118.104|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 233457 (228K) [text/csv]
Saving to: ‘searchterms.csv’


2025-08-01 03:45:15 (49.3 MB/s) - ‘searchterms.csv’ saved [233457/233457]



In [None]:
# Load the CSV file into a Spark DataFrame

In [4]:
# Read the downloaded CSV into a Spark DataFrame.
df = spark.read.csv(
    "searchterms.csv",
    inferSchema=True,  # let Spark infer the column types
    header=True,       # the first line of the file holds the column names
)

# Preview the first five rows
df.show(n=5)

+---+-----+----+--------------+
|day|month|year|    searchterm|
+---+-----+----+--------------+
| 12|   11|2021| mobile 6 inch|
| 12|   11|2021| mobile latest|
| 12|   11|2021|   tablet wifi|
| 12|   11|2021|laptop 14 inch|
| 12|   11|2021|     mobile 5g|
+---+-----+----+--------------+
only showing top 5 rows



In [5]:
# Size of the dataset: row count (a Spark action) and column count
num_rows, num_columns = df.count(), len(df.columns)

# Report both figures, one per line
print(
    f"Number of rows: {num_rows}",
    f"Number of columns: {num_columns}",
    sep="\n",
)


[Stage 3:>                                                          (0 + 1) / 1]

Number of rows: 10000
Number of columns: 4


                                                                                

In [6]:
# Print the schema to see the name, data type and nullability of each column
df.printSchema()


root
 |-- day: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- searchterm: string (nullable = true)



In [7]:
# Keep only the rows whose search term is exactly "gaming laptop" ...
gaming_laptop_rows = df.filter(df["searchterm"] == "gaming laptop")

# ... and count how many there are
count = gaming_laptop_rows.count()

print(f"'gaming laptop' was searched {count} times.")


'gaming laptop' was searched 499 times.


In [8]:
from pyspark.sql.functions import desc

# Number of occurrences of each distinct search term
term_counts = df.groupBy("searchterm").count()

# Most frequent terms first, keeping only the five most searched
ranked_terms = term_counts.orderBy(desc("count"))
top_terms = ranked_terms.limit(5)

# Display the top 5
top_terms.show()




+-------------+-----+
|   searchterm|count|
+-------------+-----+
|mobile 6 inch| 2312|
|    mobile 5g| 2301|
|mobile latest| 1327|
|       laptop|  935|
|  tablet wifi|  896|
+-------------+-----+



                                                                                

In [17]:
# The pretrained sales forecasting model is available at the URL below
# https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/model.tar.gz

import os
import tarfile
import urllib.request

from pyspark.ml.regression import LinearRegressionModel  # or another model type if known

model_url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/model.tar.gz"
model_archive = "model.tar.gz"
model_path = "model_dir/sales_prediction.model"

# Make the cell self-contained: on a fresh kernel the model directory does not
# exist yet, so fetch and unpack the archive before trying to load from it.
if not os.path.isdir(model_path):
    urllib.request.urlretrieve(model_url, model_archive)
    # NOTE(review): members are extracted without path validation; this is
    # acceptable only if the archive source is trusted - confirm.
    with tarfile.open(model_archive) as tar:
        tar.extractall("model_dir")

# Load the model
model = LinearRegressionModel.load(model_path)




                                                                                

In [16]:
# Inspect the extracted model directory: list the entries stored under it.

import os

model_path = "model_dir/sales_prediction.model"

model_entries = os.listdir(model_path)
print(model_entries)


['data', 'metadata']


In [18]:
import os
import tarfile
import urllib.request

from pyspark.ml.regression import LinearRegressionModel  # or another model type if known

url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/model.tar.gz"
model_path = "model_dir/sales_prediction.model"

# Download and unpack the archive only when the model directory is missing,
# so re-running this cell does not hit the network or re-extract the files.
if not os.path.isdir(model_path):
    urllib.request.urlretrieve(url, "model.tar.gz")
    # NOTE(review): members are extracted without path validation; this is
    # acceptable only if the archive source is trusted - confirm.
    with tarfile.open("model.tar.gz") as tar:
        tar.extractall("model_dir")

# Load the model
model = LinearRegressionModel.load(model_path)


In [1]:
# Use the sales forecast model to predict the sales for the year 2023.


In [19]:
from pyspark.ml.linalg import Vectors

# One-row input frame: the "features" vector carries the year to forecast
# (adjust the value as needed).
feature_rows = [(Vectors.dense([2023]),)]
input_data = spark.createDataFrame(feature_rows, ["features"])

# Run the loaded model on the input; transform() appends a "prediction" column
predictions = model.transform(input_data)

# Display the input next to the predicted sales
predictions.select("features", "prediction").show()


                                                                                

+--------+------------------+
|features|        prediction|
+--------+------------------+
|[2023.0]|175.16564294006457|
+--------+------------------+



25/08/01 03:57:09 WARN netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
25/08/01 03:57:09 WARN netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
