### Analyse search terms on the e-commerce web server


##### In this assignment you will download the search term data set for the e-commerce web server and run analytic queries on it.


In [None]:
# Install spark
# Install spark
!pip install pyspark
!pip install findspark

In [None]:
import findspark
findspark.init()
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, Normalizer, StandardScaler
from pyspark.ml.stat import Correlation
from pyspark.ml.regression import LinearRegression
import pandas as pd
import matplotlib.pyplot as plt
import pyspark.sql.functions

In [None]:
# Start the Spark context and session.
# getOrCreate() reuses a context that is already running, so re-executing this
# cell does not fail with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()

spark = SparkSession \
    .builder \
    .appName("Saving and Loading a SparkML Model").getOrCreate()

In [None]:
# Echo the session object so the cell output confirms the session exists
spark

In [None]:
# Download the search term dataset from the URL below

# https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/searchterms.csv

In [None]:
# Read the search-term CSV over HTTP into a pandas DataFrame and preview it
searchterms_url = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/searchterms.csv'
mydata = pd.read_csv(searchterms_url)
mydata.head()

In [None]:
# Load the csv into a spark dataframe

In [None]:
# Convert the pandas DataFrame into a Spark DataFrame
sdf = spark.createDataFrame(data=mydata)

In [None]:
# Print the number of rows and columns

In [None]:
# Use names that cannot clash with pyspark.sql.functions.col, which a later
# cell imports as `col`; re-running this cell must not overwrite that function.
n_cols = len(sdf.columns)
n_rows = sdf.count()
print(f"Columns : {n_cols}\nRows : {n_rows}")

In [None]:
# Print the top 5 rows

In [None]:
# show() prints a formatted table (head() only returns a list of Row objects);
# truncate=False keeps long search terms from being cut off.
sdf.show(5, truncate=False)

In [None]:
# What is the data type of the column `searchterm`?

In [None]:
# Print the DataFrame schema; the `searchterm` entry answers the question above
sdf.printSchema()

In [None]:
# How many times was the term `gaming laptop` searched?

In [None]:
from pyspark.sql.functions import col

In [None]:
# Exact-match count: plain equality states the intent directly, whereas
# like() without wildcards only happens to behave as an equality test.
sdf.filter(col("searchterm") == "gaming laptop").count()

In [None]:
# Print the top 5 most frequently used search terms

In [None]:
# Count occurrences of each search term, sort by that count from highest to
# lowest, and display the first five rows.
(
    sdf
    .groupBy("searchterm")
    .count()
    .orderBy("count", ascending=False)
    .show(5)
)

In [None]:
# The pretrained sales forecasting model is available at the URL below
# https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/model.tar.gz

In [None]:
!wget https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/model.tar.gz
# Extract downloded model
!tar -xvzf model.tar.gz

In [None]:
# Load the sales forecast model.

In [None]:
from pyspark.ml.regression import LinearRegressionModel

In [None]:
# Load the saved linear-regression model from the path below
# (presumably produced by extracting model.tar.gz — confirm the archive layout)
model_path = 'sales_prediction.model'
model = LinearRegressionModel.load(model_path)

In [None]:
# Using the sales forecast model, predict the sales for the year of 2023.

In [None]:
def predict(year):
    """Print the sales forecast that the loaded model produces for one year.

    Builds a one-row Spark DataFrame holding ``year``, assembles it into the
    ``features`` vector column, runs the notebook-level ``model`` on it and
    prints the resulting ``prediction`` column with ``show()``.

    Relies on the notebook-level names ``spark``, ``model`` and
    ``VectorAssembler`` being defined by earlier cells.

    Args:
        year: The year to forecast, e.g. ``2023``.

    Returns:
        None. The prediction is printed, not returned.
    """
    assembler = VectorAssembler(inputCols=["year"], outputCol="features")
    # Inference only needs the feature column, so the row carries just the year.
    year_df = spark.createDataFrame([(year,)], ["year"])
    features_df = assembler.transform(year_df)
    predictions = model.transform(features_df)
    predictions.select('prediction').show()

In [None]:
# Forecast sales for the year 2023
predict(year=2023)