In [1]:
import pyspark
from pyspark.sql import SparkSession  # SparkSession is the entry point used to start a Spark session

# Create a Spark session with the app name 'MachineLearningExample'
# (getOrCreate() hands back an already-running session if there is one).
spark = (
    SparkSession.builder
    .appName('MachineLearningExample')
    .getOrCreate()
)



In [2]:
## Read the dataset
# The first CSV row is treated as the header, and column types are inferred from the data.
training = spark.read.csv(
    'water.csv',
    header=True,
    inferSchema=True,
)

In [3]:
# Preview the loaded data (show() prints only the top 20 rows here).
training.show()

+----+-----+
|Year|Water|
+----+-----+
|1885|  356|
|1886|  386|
|1887|  397|
|1888|  397|
|1889|  413|
|1890|  458|
|1891|  485|
|1892|  344|
|1893|  390|
|1894|  360|
|1895|  420|
|1896|  435|
|1897|  439|
|1898|  454|
|1899|  462|
|1900|  454|
|1901|  469|
|1902|  500|
|1903|  492|
|1904|  473|
+----+-----+
only showing top 20 rows



In [4]:
# Check the column types that inferSchema produced.
training.printSchema()

root
 |-- Year: integer (nullable = true)
 |-- Water: integer (nullable = true)



In [5]:
# List the column names of the loaded DataFrame.
training.columns

['Year', 'Water']

* In pandas we split the data into train and test sets for training and model validation. In PySpark we do that as well, but first we group all of our independent features into a single vector column by using `VectorAssembler`.
* ["Year"] ----> new vector column ("Independent_Year") ----> independent feature

In [6]:
# Vector assemblers: each one packs its input columns into a single vector column.
from pyspark.ml.feature import VectorAssembler

# Single-feature assembler: "Year" -> "Independent_Year".
featureassembler1 = VectorAssembler(
    inputCols=["Year"],
    outputCol="Independent_Year",
)

# NOTE(review): this assembler also takes "Water" as an input, and "Water" is the
# regression target in this notebook, so fitting on its output would leak the
# label into the features. It is not used in the cells visible here.
featureassembler2 = VectorAssembler(
    inputCols=["Year", "Water"],
    outputCol="Independent_Year",
)

In [7]:
# Apply the single-feature assembler: adds the "Independent_Year" vector column to the data.
output = featureassembler1.transform(training)

In [8]:
# Inspect the result: the original columns plus the new vector column.
output.show()

+----+-----+----------------+
|Year|Water|Independent_Year|
+----+-----+----------------+
|1885|  356|        [1885.0]|
|1886|  386|        [1886.0]|
|1887|  397|        [1887.0]|
|1888|  397|        [1888.0]|
|1889|  413|        [1889.0]|
|1890|  458|        [1890.0]|
|1891|  485|        [1891.0]|
|1892|  344|        [1892.0]|
|1893|  390|        [1893.0]|
|1894|  360|        [1894.0]|
|1895|  420|        [1895.0]|
|1896|  435|        [1896.0]|
|1897|  439|        [1897.0]|
|1898|  454|        [1898.0]|
|1899|  462|        [1899.0]|
|1900|  454|        [1900.0]|
|1901|  469|        [1901.0]|
|1902|  500|        [1902.0]|
|1903|  492|        [1903.0]|
|1904|  473|        [1904.0]|
+----+-----+----------------+
only showing top 20 rows



In [9]:
# Confirm that the assembled column is now part of the DataFrame.
output.columns

['Year', 'Water', 'Independent_Year']

In [10]:
# Keep only the two columns the regression uses.
finalized_data = output.select(
    "Independent_Year",  # assembled feature vector
    "Water",             # target (label) column
)

In [11]:
# Preview the model input: feature vector and target.
finalized_data.show()

+----------------+-----+
|Independent_Year|Water|
+----------------+-----+
|        [1885.0]|  356|
|        [1886.0]|  386|
|        [1887.0]|  397|
|        [1888.0]|  397|
|        [1889.0]|  413|
|        [1890.0]|  458|
|        [1891.0]|  485|
|        [1892.0]|  344|
|        [1893.0]|  390|
|        [1894.0]|  360|
|        [1895.0]|  420|
|        [1896.0]|  435|
|        [1897.0]|  439|
|        [1898.0]|  454|
|        [1899.0]|  462|
|        [1900.0]|  454|
|        [1901.0]|  469|
|        [1902.0]|  500|
|        [1903.0]|  492|
|        [1904.0]|  473|
+----------------+-----+
only showing top 20 rows



In [14]:
# Train-test split and linear regression training
from pyspark.ml.regression import LinearRegression

# 75% / 25% train-test split. The fixed seed makes the split (and therefore the
# fitted coefficients and the predictions) reproducible when the notebook is
# re-run on the same input data; without it every run gives different numbers.
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=42)

# Unfitted estimator: features are read from "Independent_Year", the label from "Water".
lr_estimator = LinearRegression(featuresCol="Independent_Year", labelCol="Water")

# `regressor` is the fitted model. The estimator is kept under its own name so
# that one variable no longer means two different kinds of object.
regressor = lr_estimator.fit(train_data)


In [15]:
## Coefficients
# Fitted coefficient vector; the feature vector holds only Year, so there is a single entry.
regressor.coefficients

DenseVector([2.6598])

In [16]:
## Intercept
# Fitted intercept term of the linear model.
regressor.intercept

-4602.720425348935

In [17]:
## Prediction
# Evaluate the fitted model on the held-out test split. The returned summary
# object exposes the per-row predictions through its `.predictions` attribute.
pred_results = regressor.evaluate(test_data)


In [18]:
# Show the test rows next to the model's predicted "Water" value.
pred_results.predictions.show()



+----------------+-----+------------------+
|Independent_Year|Water|        prediction|
+----------------+-----+------------------+
|        [1886.0]|  386|413.57904994533783|
|        [1891.0]|  485| 426.8778290420887|
|        [1900.0]|  454|450.81563141623883|
|        [1906.0]|  469| 466.7741663323395|
|        [1909.0]|  466|474.75343379038986|
|        [1917.0]|  526| 496.0314803451911|
|        [1926.0]|  450| 519.9692827193412|
|        [1929.0]|  458| 527.9485501773916|
|        [1933.0]|  466| 538.5875734547926|
|        [1948.0]|  613| 578.4839107450434|
|        [1950.0]|  575|  583.803422383744|
|        [1954.0]|  568| 594.4424456611441|
|        [1955.0]|  575| 597.1022014804939|
|        [1957.0]|  587| 602.4217131191945|
|        [1959.0]|  594|  607.741224757895|
+----------------+-----+------------------+



Databricks is a platform on which we can run PySpark and Apache Spark.