In [6]:
# Must be included at the beginning of each new notebook. Remember to change the app name.
import os

import findspark

# Location of the Spark installation. The SPARK_HOME environment variable takes
# precedence so the notebook is not tied to one machine; when it is unset (or
# empty) we fall back to the path this notebook was originally written against.
SPARK_HOME = os.environ.get('SPARK_HOME') or '/home/ubuntu/spark-2.1.1-bin-hadoop2.7'

# findspark must be initialised before pyspark can be imported.
findspark.init(SPARK_HOME)

import pyspark
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import SparkSession

# Reuse the active session if one exists, otherwise start a new one.
spark = SparkSession.builder.appName('linear_regression_docs').getOrCreate()

# If you're getting an error with numpy, please type 'sudo pip install numpy --user' into the EC2 console.
from pyspark.ml.regression import LinearRegression

In [30]:
# Import VectorAssembler and Vectors
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [31]:
# Read the CSV into a DataFrame: the first row supplies the column names and
# the column types are inferred from the data.
df = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load("./finalEdited-HumanHappiness-BDAS-mining.csv")
)

In [54]:
# Inspect the column names and the types that were inferred when the CSV was read.
df.printSchema()

root
 |-- Year: integer (nullable = true)
 |-- Countries: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- HUMAN FREEDOM (Score): double (nullable = true)
 |-- HUMAN FREEDOM (Rank): integer (nullable = true)
 |-- HUMAN FREEDOM (Quartile): integer (nullable = true)
 |-- Rule of Law: double (nullable = true)
 |-- Disappearances, Conflicts, and Terrorism: double (nullable = true)
 |-- Women Security & Safety: double (nullable = true)
 |-- Security & Safety: double (nullable = true)
 |-- Women Movement: double (nullable = true)
 |-- Movement: double (nullable = true)
 |-- Legal and Regulatory Restrictions: double (nullable = true)
 |-- State Control over Internet Access: double (nullable = true)
 |-- Expression & Information: double (nullable = true)
 |-- Same Sex Relationships: integer (nullable = true)
 |-- Divorce: double (nullable = true)
 |-- PERSONAL FREEDOM (Score): double (nullable = true)
 |-- PERSONAL FREEDOM (Rank): integer (nullable = true)
 |-- Government  e

In [37]:
# Columns that are packed, in this order, into the single vector column the
# regression consumes. The assembled column is named "features".
#
# NOTE(review): in the fitted model printed further down, 'PERSONAL FREEDOM (Score)'
# and 'ECONOMIC FREEDOM (Score)' receive coefficients of roughly 0.48 and 0.49
# while every other coefficient is close to zero, and the test R2 is ~0.99999.
# That suggests the label is essentially the mean of these two columns (target
# leakage) - confirm whether they, and the matching Rank columns, should be
# part of the feature set.
feature_columns = [
    'Year',
    'Rule of Law',
    'Disappearances, Conflicts, and Terrorism',
    'Women Security & Safety',
    'Security & Safety',
    'Women Movement',
    'Movement',
    'Legal and Regulatory Restrictions',
    'State Control over Internet Access',
    'Expression & Information',
    'Same Sex Relationships',
    'Divorce',
    'PERSONAL FREEDOM (Score)',
    'PERSONAL FREEDOM (Rank)',
    'Government  enterprises and investment',
    'Top marginal income tax rate',
    'Legal enforcement of contracts',
    'Reliability of police',
    'Gender Legal Rights Adjustment',
    'Money growth',
    'Inflation: Most recent year',
    'Compliance costs of importing and exporting',
    'Regulatory trade barriers',
    'Foreign ownership/investment restrictions',
    'Freedom to trade internationally',
    'Hiring regulations and minimum wage',
    'Labour market regulations',
    'Licensing restrictions',
    'Business regulations',
    'ECONOMIC FREEDOM (Score)',
    'ECONOMIC FREEDOM (Rank)',
]

assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

In [39]:
# Apply the assembler to the loaded data. The result keeps every original
# column of df and appends the assembled "features" vector column.
output = assembler.transform(df)

In [41]:
# The schema now lists the extra "features" column produced by the assembler.
output.printSchema()

# Look at the first row: its "features" entry is one vector holding all the
# selected input columns.
output.take(1)

root
 |-- Year: integer (nullable = true)
 |-- Countries: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- HUMAN FREEDOM (Score): double (nullable = true)
 |-- HUMAN FREEDOM (Rank): integer (nullable = true)
 |-- HUMAN FREEDOM (Quartile): integer (nullable = true)
 |-- Rule of Law: double (nullable = true)
 |-- Disappearances, Conflicts, and Terrorism: double (nullable = true)
 |-- Women Security & Safety: double (nullable = true)
 |-- Security & Safety: double (nullable = true)
 |-- Women Movement: double (nullable = true)
 |-- Movement: double (nullable = true)
 |-- Legal and Regulatory Restrictions: double (nullable = true)
 |-- State Control over Internet Access: double (nullable = true)
 |-- Expression & Information: double (nullable = true)
 |-- Same Sex Relationships: integer (nullable = true)
 |-- Divorce: double (nullable = true)
 |-- PERSONAL FREEDOM (Score): double (nullable = true)
 |-- PERSONAL FREEDOM (Rank): integer (nullable = true)
 |-- Government  e

[Row(Year=2017, Countries='Albania', Region='Eastern Europe', HUMAN FREEDOM (Score)=7.84, HUMAN FREEDOM (Rank)=38, HUMAN FREEDOM (Quartile)=1, Rule of Law=5.291751647, Disappearances, Conflicts, and Terrorism=10.0, Women Security & Safety=8.75, Security & Safety=9.276666667, Women Movement=10.0, Movement=10.0, Legal and Regulatory Restrictions=8.011111111, State Control over Internet Access=10.0, Expression & Information=8.607142857, Same Sex Relationships=10, Divorce=7.5, PERSONAL FREEDOM (Score)=8.005411457, PERSONAL FREEDOM (Rank)=46, Government  enterprises and investment=8, Top marginal income tax rate=9, Legal enforcement of contracts=4.387444055, Reliability of police=6.824168762, Gender Legal Rights Adjustment=0.9524, Money growth=9.253087019, Inflation: Most recent year=9.601214379, Compliance costs of importing and exporting=9.405327776, Regulatory trade barriers=7.706503738, Foreign ownership/investment restrictions=6.306105852, Freedom to trade internationally=8.343862962, 

In [48]:
# Keep only what the regression needs: the assembled feature vector and the
# label column we want to predict ('HUMAN FREEDOM (Score)').
final_data = output.select(["features", 'HUMAN FREEDOM (Score)'])

# Preview the first 20 rows (long values are truncated for display).
final_data.show(n=20, truncate=True)

+--------------------+---------------------+
|            features|HUMAN FREEDOM (Score)|
+--------------------+---------------------+
|[2017.0,5.2917516...|                 7.84|
|[2017.0,3.7960803...|                 4.99|
|[2017.0,3.4119028...|                  5.4|
|[2017.0,5.7447910...|                 6.86|
|[2017.0,4.9326094...|                 7.42|
|[2017.0,7.7762787...|                 8.62|
|[2017.0,8.1717488...|                 8.48|
|[2017.0,4.2912703...|                 6.22|
|[2017.0,6.4007902...|                 7.56|
|[2017.0,5.8980375...|                 6.63|
|[2017.0,3.1328924...|                 5.77|
|[2017.0,6.6992874...|                 7.02|
|[2017.0,5.1289249...|                 6.65|
|[2017.0,7.7511009...|                 8.29|
|[2017.0,4.2688720...|                 6.95|
|[2017.0,4.1803094...|                 6.77|
|[2017.0,6.1878977...|                 6.53|
|[2017.0,3.0514018...|                 6.61|
|[2017.0,5.4700803...|                 7.37|
|[2017.0,5

In [49]:
# Randomised 70/30 train/test split.
# A fixed seed is passed so that re-running the notebook gives the same split,
# and therefore the same coefficients and evaluation metrics. Without it every
# run samples differently and the reported numbers cannot be reproduced.
# Other split ratios can be used depending on how easy/difficult the model is to train.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [51]:
# Summary statistics of the label, first for the training split and then for
# the testing split, to check that the two look comparable.
for split in (train_data, test_data):
    split.describe().show()

+-------+---------------------+
|summary|HUMAN FREEDOM (Score)|
+-------+---------------------+
|  count|                 1089|
|   mean|    6.849339752456379|
| stddev|   1.3636070810487795|
|    min|          1.144439617|
|    max|                 9.12|
+-------+---------------------+

+-------+---------------------+
|summary|HUMAN FREEDOM (Score)|
+-------+---------------------+
|  count|                  480|
|   mean|   6.8944814110812525|
| stddev|   1.2504136342275411|
|    min|          1.144439617|
|    max|                 9.02|
+-------+---------------------+



In [55]:
# Create the linear regression estimator. The feature column already uses the
# default name 'features', so only the label column has to be pointed at its
# non-default name (HUMAN FREEDOM (Score)).
lr = LinearRegression().setLabelCol('HUMAN FREEDOM (Score)')

In [56]:
# Fit the model on the training split only; the test split is held back for evaluation.
lrModel = lr.fit(train_data)

In [57]:
# Show the fitted parameters: one coefficient per entry of the feature vector
# (in the assembler's input order) followed by the intercept.
coefficients, intercept = lrModel.coefficients, lrModel.intercept
print("Coefficients: {} Intercept: {}".format(coefficients, intercept))

Coefficients: [-0.00018277014670637625,7.654760169521138e-05,0.000548587209026555,-0.0002969521859028834,0.0004621509253513168,8.126736376763305e-05,0.0002203958432705075,0.00013645706052480664,1.1055015377253783e-05,2.6358969603813347e-05,6.447738450940417e-05,9.356501343323879e-05,0.48433382035060557,-0.00041739736911423274,0.00020124736078860317,-9.484371032006791e-05,-0.00011518351207959659,-0.00010473466771452708,0.004775723521460875,0.0003226184233185234,0.0009838824595450762,0.00019980501455566833,-0.00032632782162414525,0.000489775441090885,0.00013541345508644569,-8.78043635220382e-05,0.00033996022488305173,0.00035358347097414877,0.00044863464056236823,0.4901599860024474,-0.00016528244883663514] Intercept: 0.5553189531134152


In [58]:
# Evaluate the fitted model on the held-out test split; the returned summary
# exposes the residuals and the metrics used in the following cells.
test_results = lrModel.evaluate(test_data)

In [59]:
# Residuals: the per-row difference between the label in the test data and the
# value the model predicted for that row.
test_results.residuals.show()

# Root mean squared error on the test split. The printed label previously read
# "RSME", a misspelling of the metric's acronym.
print("RMSE: {}".format(test_results.rootMeanSquaredError))

+--------------------+
|           residuals|
+--------------------+
|0.004756463487226803|
|0.005346343729727199|
|0.004873526014503193|
|0.004630810619395076|
|-0.00123978363561...|
|-7.88046132460973...|
|-0.00186560417058...|
|-0.00453725310211...|
|-0.00727301761322...|
|0.001635385538195...|
|3.576984695721208E-4|
|0.006359674558751749|
|5.103593285378238E-4|
|0.004583432592867531|
|-0.00142678927525...|
|-5.40929165377512...|
|0.001471250360746...|
|-0.00272229073048...|
|0.002577049403576...|
|-0.00934269587721...|
+--------------------+
only showing top 20 rows

RSME: 0.004585245146298461


In [60]:
# Coefficient of determination (R2) on the test split.
r2 = test_results.r2
print("R2: {}".format(r2))

R2: 0.999986525165413


In [61]:
# Summary statistics of the label over the full data set, to put the size of
# the test error into context against the label's spread.
final_data.describe().show()

+-------+---------------------+
|summary|HUMAN FREEDOM (Score)|
+-------+---------------------+
|  count|                 1569|
|   mean|    6.863149820104525|
| stddev|   1.3297674649773994|
|    min|          1.144439617|
|    max|                 9.12|
+-------+---------------------+

