In [1]:
#Imports
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.mllib.util import MLUtils
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

#Create (or reuse) the SparkSession, the entry point to the DataFrame and ML
#APIs used in the rest of the notebook. getOrCreate() returns the session that
#is already running when there is one, so re-running this cell is safe.
#The ("spark.some.config.option", "some-value") pair that used to be set here
#is a placeholder key, not a real Spark setting, so it is no longer passed.
spark = (
    SparkSession.builder
    .appName("CIS 5570 Project")
    .getOrCreate()
)


In [2]:
#Schema to specify the type and name of all the attributes.
#The columns are written as ordered runs of (Spark type, names sharing that
#type). The runs are flattened top to bottom, so the order below IS the column
#order of the schema; every field is nullable.
#NOTE(review): the reader is given this schema explicitly, so it is presumably
#matched to the CSV columns by position - confirm before reordering anything.
_COLUMN_RUNS = [
    (IntegerType, "id"),
    (DateType, "lastScraped"),
    (BooleanType, """
        summary spaceDesc desc nbhoodOverview notes transit acess interact
        houseRules
    """),
    (IntegerType, "noOfMgrs"),
    (DateType, "hostSince"),
    (BooleanType, "statePrty locPrty abtHost"),
    (FloatType, "hostRespRate"),
    (BooleanType, "superhost"),
    (IntegerType, "hostCount"),
    (BooleanType, """
        verEmail verPhone verNone verOffGovID verReviews verWorkEmail
        verGoogle verFB verJumio verZhima verSelfie verManualOnline
        verKBA verGovID verManualOff verSentID verWeibo verIDManual
        hostDP hostIDVer
    """),
    (StringType, "neighbourhood neighbourhoodCld city"),
    (IntegerType, "zipcode"),
    (FloatType, "lat long"),
    (BooleanType, "locExact"),
    (StringType, "propType roomType"),
    (IntegerType, "accd baths bedrooms beds"),
    (StringType, "bedType"),
    (IntegerType, """
        sqft price weekPrice monthPrice secDep cleanFee guestInc xPeople
        minNights maxNights minMinNights maxMinNights minMaxNights
        maxMaxNights minNightsAvg maxNightsAvg
    """),
    (BooleanType, "hasAvail"),
    (IntegerType, """
        avail30 avail60 avail90 avail365 numOfReviews numOfReviewsLtm
    """),
    (DateType, "firstRv lastRv"),
    (IntegerType, """
        rvScoreRating rvScoreAcc rvScoreClean rvScoreCheckin rvScoreComm
        rvScoreLoc rvScoreVal
    """),
    (BooleanType, """
        requireLic instBook businessTravel requireGstDP requireGstPhone
    """),
    (IntegerType, """
        calHostList calHostListEntire calHostListPrvtRoom
        calHostListPrvtShared
    """),
    (FloatType, "rvPerMonth"),
    (BooleanType, """
        amInt amTV amWifi amCableTV amKitchen amAirCon
        amPets amWasher amFreePark amHeat amSmokeDet amBF
        amHotWater amStove amCookBscs amMWave amXtraPillow amCoffeeMkr
        amBldgStaff amEthernet amLugDrop amLaptop amLockbox amOven
        amChildBooks amDishwasher amBedlinens amChildDinWare amLongterm amFridge
        amDishes amPvtLiving amGarden amHighchair amCrib amHostGreet
        amBathtub amNoSteps amPatio amBathEss amPvtEnt amBabyMtr
        amAccHighBed amOutletCvrs amSelfCheck amKeypad amTrvlCrib amHostAmty50
        amBathTwl amChangeTbl amBabysitRec amRoomDark amStairGates amPcktWifi
        amPaidPark amIncOther amBabyBath amHangers amFireGd amBodySoap
        amGameCon amSingHome amTolPpr amLitEnt amIron amXtrBedSp
        amSmartLock amHostAmty49 amDisPark amWideEntGst amWideEnt amPvtBath
        amWideHall amBBQ amClnBfCo amWinGrd amFullChkin amTableCnr
        amBedComf amFullKchn amNetflix amTerrace amMemMat amHairDyr
        amSwrChair amGdFlrAcc amLakeAcc amBeachFnt amEvChgr amFlatEntGst
        amFxdGrabar amHandShwr amBathChr amKettle amHighTlt amWideEnt2
        amWalkShwr amSoakTub amEspresso amOutSeat amBeachView amWideShwrTlt
        amMurphyBed amEnSuiteBath amGasOven amPilTopMat amFax amDayBed
        amHeatFlr amBalcony amAmznEcho amWideDrGstBath amWarmDrwr amSndSys
        amMountView amDVDPlay amSunLounge amConvOven amSmartTV amDineArea
        amOutPark amBidet amProj amMudroom amPvtHotTub amCeilFan
        amRainShwr amPrint amBeachEss amHeatLamp amWaterFrt amHBO
        amDryer amEss amFireEx amCarbonDet amSafeCard amShampoo
        amElev amFamFrdly amIndoorFire amFirstaid amLockBed amSuitEvents
        amWlessIcom amDogs amFreeStPark amPetsProp amHotTub amGym
        amKitnte amJetTub amBFTable amMiniFridge amElecProfBed amWineCool
        amFirePit amExcse amStepFreeShwr amFixGrabTlt amCats amPools
        amPaidParkOff amDoorman amWshrDryr amSmoke amWheelchair amOthrPets
        amFirmMat amStandValet amTennis amSteamOven amHighResComp amDblOven
        amShareHotub amHamm amSki amShareGym amAirPrfr amHeatTwlRack
        amStandAlSteam amCtrlAC amTouchlsFauc amSharedPool
    """),
]

#Expand each run into one nullable StructField per whitespace-separated name
listSchema = StructType([
    StructField(columnName, sparkType(), True)
    for sparkType, columnNames in _COLUMN_RUNS
    for columnName in columnNames.split()
])

In [3]:
#Load the listings CSV into a DataFrame: comma-delimited, first line is a
#header row, and column names/types are taken from listSchema
csvReader = (
    spark.read
    .schema(listSchema)
    .option("header", True)
    .option("sep", ",")
)
listDataframe = csvReader.csv("datasets/listing.csv")
#Preview the first 3 rows of the DataFrame
listDataframe.show(n=3)

+----+-----------+-------+---------+----+--------------+-----+-------+-----+--------+----------+--------+----------+---------+-------+-------+------------+---------+---------+--------+--------+-------+-----------+----------+------------+---------+-----+--------+--------+---------+---------------+------+--------+------------+---------+--------+-----------+------+---------+---------------+----------------+-------------+-------+--------+----------+--------+---------+---------------+----+-----+--------+----+--------+----+-----+---------+----------+------+--------+--------+-------+---------+---------+------------+------------+------------+------------+------------+------------+--------+-------+-------+-------+--------+------------+---------------+----------+----------+-------------+----------+------------+--------------+-----------+----------+----------+----------+--------+--------------+------------+---------------+-----------+-----------------+-------------------+---------------------+---

In [4]:
#Selecting the attributes from the DataFrame: every column, in its original
#order. The names are read from the DataFrame itself instead of from a second
#hand-typed copy of the schema, so the two lists cannot drift apart when a
#column is added, removed or renamed.
data = listDataframe.select(*listDataframe.columns)

#Preview the first 3 rows of the selection
data.show(3)

+----+-----------+-------+---------+----+--------------+-----+-------+-----+--------+----------+--------+----------+---------+-------+-------+------------+---------+---------+--------+--------+-------+-----------+----------+------------+---------+-----+--------+--------+---------+---------------+------+--------+------------+---------+--------+-----------+------+---------+---------------+----------------+-------------+-------+--------+----------+--------+---------+---------------+----+-----+--------+----+--------+----+-----+---------+----------+------+--------+--------+-------+---------+---------+------------+------------+------------+------------+------------+------------+--------+-------+-------+-------+--------+------------+---------------+----------+----------+-------------+----------+------------+--------------+-----------+----------+----------+----------+--------+--------------+------------+---------------+-----------+-----------------+-------------------+---------------------+---

In [5]:
#Dataset divided into (approximately) 70% training set and 30% testing set.
#randomSplit assigns rows at random, so without a seed every run produces a
#different split, different row counts and different evaluation metrics.
#Fixing the seed makes the split repeatable for the same input data.
SPLIT_SEED = 42
dataDivisionPercentage = data.randomSplit([0.7, 0.3], seed=SPLIT_SEED)
#Data at index 0 is for training, data at index 1 is for testing
trainingData, testingData = dataDivisionPercentage
countTrainingData = trainingData.count()
countTestingData = testingData.count()
print ("Training data rows:", countTrainingData, "; Testing data rows:", countTestingData)

Training data rows: 5757 ; Testing data rows: 2354


In [6]:
#Defining a VectorAssembler to select the attributes needed for training the model.
#Column that the model has to predict; it must never be part of the features.
LABEL_COL = "rvPerMonth"
#Type names, as reported by DataFrame.dtypes, of the columns fed to the
#assembler. Date and string columns are left out.
FEATURE_DTYPES = ("int", "float", "boolean")

#Every int/float/boolean column except the label, in DataFrame column order.
#NOTE(review): this includes "id" as a predictor - confirm that a row
#identifier is really meant to be a model input.
featureCols = [
    columnName
    for columnName, dtypeName in trainingData.dtypes
    if dtypeName in FEATURE_DTYPES and columnName != LABEL_COL
]

#handleInvalid="skip" drops rows that hold a null/NaN in any input column
#instead of raising an error. It is set here, at construction, so that every
#later assembler.transform() call (training and testing) behaves the same way.
assembler = VectorAssembler(
    inputCols=featureCols, outputCol="features", handleInvalid="skip")

#Converting all the features selected to a single column for the algorithm and specifying the class attribute
trainingDataCol = assembler.transform(trainingData).select(
    col("features"), (col("rvPerMonth").cast("Float").alias("label")))
#Default truncation keeps the preview readable; the feature vectors are very wide
trainingDataCol.show(n=10)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|features                                                                                                                              

In [7]:
#Settings for the LinearRegression estimator: which columns hold the target
#and the assembled features, the iteration cap, and the regularisation
#strength (regParam) together with its elastic-net mixing value
regressionParams = {
    "labelCol": "label",
    "featuresCol": "features",
    "maxIter": 10,
    "regParam": 0.7,
    "elasticNetParam": 0.97,
}
regressionAlgorithm = LinearRegression(**regressionParams)

#Fit the estimator on the assembled training rows
model = regressionAlgorithm.fit(trainingDataCol)
print ("*** Regression model trained ***")

*** Regression model trained ***


In [15]:
#Run the testing rows through the same assembler that was used for training,
#then keep the feature vector and the target, exposed as "trueLabel"
testFeatures = assembler.transform(testingData)
trueLabelCol = col("rvPerMonth").cast("Float").alias("trueLabel")
testingDataCol = testFeatures.select(col("features"), trueLabelCol)

In [16]:
#Apply the trained model to the assembled testing rows
prediction = model.transform(testingDataCol)
#Preview the first 5 scored rows
prediction.show(n=5)

+--------------------+---------+------------------+
|            features|trueLabel|        prediction|
+--------------------+---------+------------------+
|(268,[0,1,2,3,4,6...|     0.13|0.8340113874199149|
|(268,[0,1,2,3,4,5...|      0.5|0.8373073597867823|
|(268,[0,1,2,3,4,6...|     1.26|1.6186376680674177|
|(268,[0,1,2,3,4,5...|     2.05|1.6298727819566199|
|(268,[0,2,3,4,5,6...|     1.48|1.6666191108727504|
+--------------------+---------+------------------+
only showing top 5 rows



In [17]:
#Importing RegressionEvaluator to evaluate the model
from pyspark.ml.evaluation import RegressionEvaluator


def _metric_evaluator(metric_name):
    """Return a RegressionEvaluator for one metric.

    The evaluator compares the "prediction" column against the "trueLabel"
    column using the metric named by `metric_name` ("mse", "rmse", "r2" or
    "mae" in this notebook).
    """
    return RegressionEvaluator(
        labelCol="trueLabel", predictionCol="prediction", metricName=metric_name)


#Defining evaluators for each metric (one shared helper instead of four copies)
evaluator1 = _metric_evaluator("mse")
evaluator2 = _metric_evaluator("rmse")
evaluator3 = _metric_evaluator("r2")
evaluator4 = _metric_evaluator("mae")

#Calculating the MSE, RMSE, MAE and R2 of the trained model
rmse = evaluator2.evaluate(prediction)
mse = evaluator1.evaluate(prediction)
r2 = evaluator3.evaluate(prediction)
mae = evaluator4.evaluate(prediction)

#"mae" is the mean ABSOLUTE error; the label used to read "Mean Average Error"
print ("Mean Square Error       (MSE):", mse)
print ("Root Mean Square Error (RMSE):", rmse)
print ("Mean Absolute Error     (MAE):", mae)
print ("R Squared                (R2):", r2)


Mean Square Error       (MSE): 0.42387210846842743
Root Mean Square Error (RMSE): 0.6510546125083728
Mean Average Error      (MAE): 0.5383857216041988
R Squared                (R2): 0.6664469017478722
