## Overview

This notebook shows you how to create and query a table or DataFrame from a file that you uploaded to DBFS. [DBFS](https://docs.databricks.com/user-guide/dbfs-databricks-file-system.html) is the Databricks File System, which allows you to store data for querying inside of Databricks. This notebook assumes that you already have a file inside of DBFS that you would like to read from.

This notebook is written in **Python** so the default cell type is Python. However, you can use different languages by using the `%LANGUAGE` syntax. Python, Scala, SQL, and R are all supported.

In [0]:
# File location and type
file_location = "/FileStore/tables/tips.csv"
file_type = "csv"

# Read through the generic reader so that `file_type` actually selects the
# format (previously it was assigned but never used, and the reader was
# hard-coded to CSV).
# The applied options are for CSV files: infer column types, treat the first
# row as the header, and split fields on commas. For other file types these
# are expected to be ignored -- verify before changing `file_type`.
df = (
    spark.read.format(file_type)
    .options(inferSchema=True, header=True, sep=',')
    .load(file_location)
)

display(df)

total_bill,tip,sex,smoker,day,time,size
16.99,1.01,Female,No,Sun,Dinner,2
10.34,1.66,Male,No,Sun,Dinner,3
21.01,3.5,Male,No,Sun,Dinner,3
23.68,3.31,Male,No,Sun,Dinner,2
24.59,3.61,Female,No,Sun,Dinner,4
25.29,4.71,Male,No,Sun,Dinner,4
8.77,2.0,Male,No,Sun,Dinner,2
26.88,3.12,Male,No,Sun,Dinner,4
15.04,1.96,Male,No,Sun,Dinner,2
14.78,3.23,Male,No,Sun,Dinner,2


In [0]:
# Show the Python type of the object returned by the reader in the load cell.
type(df)

In [0]:
from pyspark.ml.feature import StringIndexer,VectorAssembler
from pyspark.ml.regression import LinearRegression

In [0]:
# Print the column names and inferred types of the loaded DataFrame.
df.printSchema()

In [0]:
# Encode the string column `sex` as a numeric index column `sex_indexed`.
# In the displayed sample rows, Male maps to 0.0 and Female to 1.0.
indexer = StringIndexer(
    inputCol='sex',
    outputCol='sex_indexed',
)
sex_indexer_model = indexer.fit(df)      # learn the label -> index mapping
df_r = sex_indexer_model.transform(df)   # append `sex_indexed` to the frame

In [0]:
# Print the schema again to confirm the new `sex_indexed` column was added.
df_r.printSchema()

In [0]:
# Render the frame with the indexed `sex` column alongside the original columns.
display(df_r)

total_bill,tip,sex,smoker,day,time,size,sex_indexed
16.99,1.01,Female,No,Sun,Dinner,2,1.0
10.34,1.66,Male,No,Sun,Dinner,3,0.0
21.01,3.5,Male,No,Sun,Dinner,3,0.0
23.68,3.31,Male,No,Sun,Dinner,2,0.0
24.59,3.61,Female,No,Sun,Dinner,4,1.0
25.29,4.71,Male,No,Sun,Dinner,4,0.0
8.77,2.0,Male,No,Sun,Dinner,2,0.0
26.88,3.12,Male,No,Sun,Dinner,4,0.0
15.04,1.96,Male,No,Sun,Dinner,2,0.0
14.78,3.23,Male,No,Sun,Dinner,2,0.0


In [0]:
# Index the remaining categorical columns in a single pass; each one gets a
# matching `<name>_indexed` output column.
categorical_cols = ['smoker', 'day', 'time']
indexer2 = StringIndexer(
    inputCols=categorical_cols,
    outputCols=[col_name + '_indexed' for col_name in categorical_cols],
)
categorical_indexer_model = indexer2.fit(df_r)
dataset = categorical_indexer_model.transform(df_r)
display(dataset)

total_bill,tip,sex,smoker,day,time,size,sex_indexed,smoker_indexed,day_indexed,time_indexed
16.99,1.01,Female,No,Sun,Dinner,2,1.0,0.0,1.0,0.0
10.34,1.66,Male,No,Sun,Dinner,3,0.0,0.0,1.0,0.0
21.01,3.5,Male,No,Sun,Dinner,3,0.0,0.0,1.0,0.0
23.68,3.31,Male,No,Sun,Dinner,2,0.0,0.0,1.0,0.0
24.59,3.61,Female,No,Sun,Dinner,4,1.0,0.0,1.0,0.0
25.29,4.71,Male,No,Sun,Dinner,4,0.0,0.0,1.0,0.0
8.77,2.0,Male,No,Sun,Dinner,2,0.0,0.0,1.0,0.0
26.88,3.12,Male,No,Sun,Dinner,4,0.0,0.0,1.0,0.0
15.04,1.96,Male,No,Sun,Dinner,2,0.0,0.0,1.0,0.0
14.78,3.23,Male,No,Sun,Dinner,2,0.0,0.0,1.0,0.0


In [0]:
# Pack the numeric and indexed columns into one vector column,
# `IndependentFeatures`, which the regression cell uses as its features column.
feature_cols = [
    'total_bill',
    'size',
    'sex_indexed',
    'smoker_indexed',
    'day_indexed',
    'time_indexed',
]
featureAssembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol='IndependentFeatures',
)
output = featureAssembler.transform(dataset)

In [0]:
# Render the assembled frame, including the new `IndependentFeatures` vector column.
display(output)

total_bill,tip,sex,smoker,day,time,size,sex_indexed,smoker_indexed,day_indexed,time_indexed,IndependentFeatures
16.99,1.01,Female,No,Sun,Dinner,2,1.0,0.0,1.0,0.0,"Map(vectorType -> dense, length -> 6, values -> List(16.99, 2.0, 1.0, 0.0, 1.0, 0.0))"
10.34,1.66,Male,No,Sun,Dinner,3,0.0,0.0,1.0,0.0,"Map(vectorType -> dense, length -> 6, values -> List(10.34, 3.0, 0.0, 0.0, 1.0, 0.0))"
21.01,3.5,Male,No,Sun,Dinner,3,0.0,0.0,1.0,0.0,"Map(vectorType -> dense, length -> 6, values -> List(21.01, 3.0, 0.0, 0.0, 1.0, 0.0))"
23.68,3.31,Male,No,Sun,Dinner,2,0.0,0.0,1.0,0.0,"Map(vectorType -> dense, length -> 6, values -> List(23.68, 2.0, 0.0, 0.0, 1.0, 0.0))"
24.59,3.61,Female,No,Sun,Dinner,4,1.0,0.0,1.0,0.0,"Map(vectorType -> dense, length -> 6, values -> List(24.59, 4.0, 1.0, 0.0, 1.0, 0.0))"
25.29,4.71,Male,No,Sun,Dinner,4,0.0,0.0,1.0,0.0,"Map(vectorType -> dense, length -> 6, values -> List(25.29, 4.0, 0.0, 0.0, 1.0, 0.0))"
8.77,2.0,Male,No,Sun,Dinner,2,0.0,0.0,1.0,0.0,"Map(vectorType -> dense, length -> 6, values -> List(8.77, 2.0, 0.0, 0.0, 1.0, 0.0))"
26.88,3.12,Male,No,Sun,Dinner,4,0.0,0.0,1.0,0.0,"Map(vectorType -> dense, length -> 6, values -> List(26.88, 4.0, 0.0, 0.0, 1.0, 0.0))"
15.04,1.96,Male,No,Sun,Dinner,2,0.0,0.0,1.0,0.0,"Map(vectorType -> dense, length -> 6, values -> List(15.04, 2.0, 0.0, 0.0, 1.0, 0.0))"
14.78,3.23,Male,No,Sun,Dinner,2,0.0,0.0,1.0,0.0,"Map(vectorType -> dense, length -> 6, values -> List(14.78, 2.0, 0.0, 0.0, 1.0, 0.0))"


In [0]:
# Print only the assembled feature vectors.
output.select('IndependentFeatures').show()

In [0]:
# Keep just the feature vector and the label (`tip`) for model training.
finalized_data=output.select('IndependentFeatures','tip')

In [0]:
# Print the features/label frame that will be split into train and test sets.
finalized_data.show()

In [0]:
# Show the Python type of the selected features/label frame.
type(finalized_data)

In [0]:
# Split roughly 75% / 25% into training and test sets. randomSplit samples
# randomly, so a fixed seed is passed to keep the split -- and every metric
# computed from it below -- the same across re-runs of the notebook.
train_set,test_set=finalized_data.randomSplit([0.75,0.25],seed=42)


In [0]:
regressor=LinearRegression(featuresCol='IndependentFeatures',labelCol='tip')

In [0]:
regressor=regressor.fit(train_set)

In [0]:
# Evaluate the fitted model on the held-out test split. The returned summary
# is read in later cells for its predictions, r2 and meanSquaredError.
pred_results=regressor.evaluate(test_set)

In [0]:
# Print the predictions frame from the test-set evaluation summary.
pred_results.predictions.show()

In [0]:
# R-squared reported by the evaluation summary for the test split.
pred_results.r2

In [0]:
import numpy as np

In [0]:
# Root mean squared error on the test split. The evaluation summary exposes
# it directly, so there is no need to take the square root of
# meanSquaredError by hand with numpy.
pred_results.rootMeanSquaredError