# Linear Regression Quiz
Use this Jupyter notebook to find the answer to the quiz in the previous section. There is an answer key in the next part of the lesson.

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, concat, count, lit, udf, avg
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import RegexTokenizer, VectorAssembler
from pyspark.ml.regression import LinearRegression

# TODOS: 
# 1) import any other libraries you might need
# 2) run the cells below to read the dataset and extract description length features
# 3) write code to answer the quiz question

In [3]:
# Build (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Creating Features")
    .getOrCreate()
)

### Read Dataset

In [4]:
# Path of the JSON dataset, relative to the notebook's working directory.
stack_overflow_data = "Train_onetag_small.json"

In [12]:
# Load the JSON dataset and mark it for caching, since the following cells
# derive several columns from it.
df = spark.read.json(stack_overflow_data).persist()
df

DataFrame[Body: string, Id: bigint, Tags: string, Title: string, oneTag: string]

### Build Description Length Features

In [13]:
# Desc = Title, a single space, then Body.
title_and_body = concat(col("Title"), lit(" "), col("Body"))
df = df.withColumn("Desc", title_and_body)

In [14]:
# UDF returning len() of the value it is applied to, typed as a Spark integer.
# It is applied to the tokenized `words` column in the next cell.
body_length = udf(lambda tokens: len(tokens), IntegerType())

In [15]:
# Tokenize Desc into the `words` column using the regex pattern \W, then
# store the number of tokens per row as DescLength.
regexTokenizer = RegexTokenizer(
    inputCol="Desc",
    outputCol="words",
    pattern="\\W",
)
tokenized = regexTokenizer.transform(df)
df = tokenized.withColumn("DescLength", body_length(tokenized["words"]))

In [16]:
# Wrap DescLength in a one-element vector column (DescVec) so it can be
# passed to the regression as its features column.
assembler = VectorAssembler(
    inputCols=["DescLength"],
    outputCol="DescVec",
)
df = assembler.transform(df)

In [17]:
# Sanity check: print the first row with the newly derived columns.
df.show(n=1)

+--------------------+---+--------------------+--------------------+------+--------------------+--------------------+----------+-------+
|                Body| Id|                Tags|               Title|oneTag|                Desc|               words|DescLength|DescVec|
+--------------------+---+--------------------+--------------------+------+--------------------+--------------------+----------+-------+
|<p>I'd like to ch...|  1|php image-process...|How to check if a...|   php|How to check if a...|[how, to, check, ...|        96| [96.0]|
+--------------------+---+--------------------+--------------------+------+--------------------+--------------------+----------+-------+
only showing top 1 row



In [22]:
# Peek at the raw tag strings. take(5) fetches only the first five rows,
# whereas collect() followed by a slice would first pull the entire column
# to the driver.
df.select("Tags").take(5)

[Row(Tags='php image-processing file-upload upload mime-types'),
 Row(Tags='firefox'),
 Row(Tags='r matlab machine-learning'),
 Row(Tags='c# url encoding'),
 Row(Tags='php api file-get-contents')]

In [24]:
# Regression target: the number of tags attached to each question.
# Tags is a whitespace-separated string, e.g. 'php api file-get-contents' -> 3.

def _count_tags(tags):
    """Return the number of whitespace-separated tags in `tags`.

    Returns None when `tags` is None, so a null row produces a null count
    instead of raising inside the Python worker. An empty or blank string
    counts as 0 tags.
    """
    if tags is None:
        return None
    # split() with no argument collapses runs of whitespace and ignores
    # leading/trailing blanks, so stray spaces do not create phantom tags
    # (split(" ") would report 1 tag for an empty string).
    return len(tags.split())


number_of_tags = udf(_count_tags, IntegerType())
df = df.withColumn("NumTags", number_of_tags(df.Tags))

In [25]:
# Confirm the NumTags column was added by printing the first row.
df.show(n=1)

+--------------------+---+--------------------+--------------------+------+--------------------+--------------------+----------+-------+-------+
|                Body| Id|                Tags|               Title|oneTag|                Desc|               words|DescLength|DescVec|NumTags|
+--------------------+---+--------------------+--------------------+------+--------------------+--------------------+----------+-------+-------+
|<p>I'd like to ch...|  1|php image-process...|How to check if a...|   php|How to check if a...|[how, to, check, ...|        96| [96.0]|      5|
+--------------------+---+--------------------+--------------------+------+--------------------+--------------------+----------+-------+-------+
only showing top 1 row



# Question
Build a linear regression model that uses the length of the combined Title + Body fields to predict the number of tags on each question. What is the value of $r^2$ when fitting a model with `maxIter=5, regParam=0.0, fitIntercept=False, solver="normal"`?

In [31]:
# Estimator configured with the hyperparameters given in the quiz question.
lr = LinearRegression(
    maxIter=5,
    regParam=0.0,
    fitIntercept=False,
    solver="normal",
)

In [26]:
# Average description length for each tag count, ordered by tag count.
avg_length_by_tag_count = (
    df.groupby("NumTags")
    .agg(avg(col("DescLength")))
    .orderBy("NumTags")
)
avg_length_by_tag_count.show()

+-------+------------------+
|NumTags|   avg(DescLength)|
+-------+------------------+
|      1|143.68776158175783|
|      2| 162.1539186134137|
|      3|181.26021064340088|
|      4|201.46530249110322|
|      5|227.64375266524522|
+-------+------------------+



In [28]:
# Training frame for the regression: NumTags renamed to `label` and
# DescVec renamed to `features`.
label_column = col("NumTags").alias("label")
features_column = col("DescVec").alias("features")
data = df.select(label_column, features_column)
data.head()

Row(label=5, features=DenseVector([96.0]))

In [32]:
# Fit the configured estimator on the label/features frame.
lrModel_q1 = lr.fit(dataset=data)

In [33]:
# Quiz answer: the r^2 value exposed by the fitted model's summary.
q1_summary = lrModel_q1.summary
q1_summary.r2

0.4455149596308522