In [84]:
from pyspark.sql import SparkSession

In [85]:
spark = SparkSession.builder.appName("Sample Data Processing").getOrCreate()

In [86]:
AutoMPGdf = spark.read.csv("D:/AutoMPG.csv", inferSchema = True, header = True)

#### shape of the data contained in AutoMPG.csv

In [87]:
print(AutoMPGdf.count(), len(AutoMPGdf.columns))

392 9


#### Features (or attributes) recorded for each automobile

In [88]:
AutoMPGdf.columns

['mpg',
 'cylinders',
 'displacement',
 'hp',
 'weight',
 'acceleration',
 'model_year',
 'origin',
 'car_name']

#### Provide a schema of the AutoMPG data set to verify that all relevant features contain numeric data type. Are there any columns/features that is not applicable in developing a Linear Regression algorithm? That is, does not meet the requirements/assumptions to use a Linear Regression model. If so, eliminate those columns from further analysis and regenerate the schema to ensure that the ‘offending’ column is removed from further analysis. Remember, it should not be permanently removed from the dataset. 

In [89]:
AutoMPGdf.printSchema()

root
 |-- mpg: double (nullable = true)
 |-- cylinders: integer (nullable = true)
 |-- displacement: double (nullable = true)
 |-- hp: double (nullable = true)
 |-- weight: double (nullable = true)
 |-- acceleration: double (nullable = true)
 |-- model_year: integer (nullable = true)
 |-- origin: integer (nullable = true)
 |-- car_name: string (nullable = true)



In [95]:
AutoMPGdf1 = AutoMPGdf.select('mpg','cylinders','displacement','hp','weight','acceleration','model_year','origin')

In [91]:
#AutoMPGdf = AutoMPGdf.drop('car_name')
#Only the column 'car_name' has string values, so dropping it

In [96]:
AutoMPGdf1.printSchema()

root
 |-- mpg: double (nullable = true)
 |-- cylinders: integer (nullable = true)
 |-- displacement: double (nullable = true)
 |-- hp: double (nullable = true)
 |-- weight: double (nullable = true)
 |-- acceleration: double (nullable = true)
 |-- model_year: integer (nullable = true)
 |-- origin: integer (nullable = true)



In [98]:
AutoMPGdf1.show()

+----+---------+------------+-----+------+------------+----------+------+
| mpg|cylinders|displacement|   hp|weight|acceleration|model_year|origin|
+----+---------+------------+-----+------+------------+----------+------+
|18.0|        8|       307.0|130.0|3504.0|        12.0|        70|     1|
|15.0|        8|       350.0|165.0|3693.0|        11.5|        70|     1|
|18.0|        8|       318.0|150.0|3436.0|        11.0|        70|     1|
|16.0|        8|       304.0|150.0|3433.0|        12.0|        70|     1|
|17.0|        8|       302.0|140.0|3449.0|        10.5|        70|     1|
|15.0|        8|       429.0|198.0|4341.0|        10.0|        70|     1|
|14.0|        8|       454.0|220.0|4354.0|         9.0|        70|     1|
|14.0|        8|       440.0|215.0|4312.0|         8.5|        70|     1|
|14.0|        8|       455.0|225.0|4425.0|        10.0|        70|     1|
|15.0|        8|       390.0|190.0|3850.0|         8.5|        70|     1|
|15.0|        8|       383.0|170.0|356

#### Evaluate the correlation between mpg and each of the independent variables (pairwise mpg and cylinders, mpg and displacement, etc.). Correlation coefficient value ranges from -1 to +1; closer to 1, stronger the relationship. 

In [43]:
from pyspark.sql.functions import corr

In [99]:
AutoMPGdf1.select(corr('mpg','cylinders')).show()

+--------------------+
|corr(mpg, cylinders)|
+--------------------+
| -0.7776175081260227|
+--------------------+



In [100]:
AutoMPGdf1.select(corr('mpg','displacement')).show()

+-----------------------+
|corr(mpg, displacement)|
+-----------------------+
|    -0.8051269467104577|
+-----------------------+



In [101]:
AutoMPGdf1.select(corr('mpg','hp')).show()

+-------------------+
|      corr(mpg, hp)|
+-------------------+
|-0.7784267838977761|
+-------------------+



In [102]:
AutoMPGdf1.select(corr('mpg','weight')).show()

+------------------+
| corr(mpg, weight)|
+------------------+
|-0.832244214831575|
+------------------+



In [103]:
AutoMPGdf1.select(corr('mpg','acceleration')).show()

+-----------------------+
|corr(mpg, acceleration)|
+-----------------------+
|    0.42332853690278693|
+-----------------------+



In [104]:
AutoMPGdf1.select(corr('mpg','model_year')).show()

+---------------------+
|corr(mpg, model_year)|
+---------------------+
|   0.5805409660907859|
+---------------------+



In [105]:
AutoMPGdf1.select(corr('mpg','origin')).show()

+------------------+
| corr(mpg, origin)|
+------------------+
|0.5652087567164604|
+------------------+



There's a negative(Strong) correlation between the variable mpg and variables weight, displacement, hp and cylinders., which means when there's an increase in mpg, the other above listed varibles decreases.

A strong positive correlation exists between mpg and origin AND mpg and model_year., which means with an increase in mpg, model_year increases.

A moderate correlation exists between mpg and accelaration.

#### Provide a listing of summary descriptive statistics such as average and standard deviation for each relevant attribute. Round all float data to two decimal places.

In [106]:
AutoMPGdf1.describe().show()

+-------+-----------------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+
|summary|              mpg|         cylinders|      displacement|                hp|            weight|      acceleration|       model_year|            origin|
+-------+-----------------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+
|  count|              392|               392|               392|               392|               392|               392|              392|               392|
|   mean|23.44591836734694| 5.471938775510204|194.41198979591837|104.46938775510205|2977.5841836734694|15.541326530612228| 75.9795918367347|1.5765306122448979|
| stddev|7.805007486571802|1.7057832474527845|104.64400390890465| 38.49115993282846| 849.4025600429486|  2.75886411918808|3.683736543577868|0.8055181834183057|
|    min|              9.0|             

#### Collapse all independent variables/features into a single vector in preparation for Linear Regression analysis. Print the first 5 rows of the new data.

In [107]:
from pyspark.ml.linalg import Vector

In [108]:
from pyspark.ml.feature import VectorAssembler

In [109]:
vec_assemble = VectorAssembler(inputCols = ['cylinders','displacement','hp','weight','acceleration','model_year','origin'], outputCol = 'features')

In [112]:
df_features = vec_assemble.transform(AutoMPGdf1)

In [113]:
df_features.printSchema()

root
 |-- mpg: double (nullable = true)
 |-- cylinders: integer (nullable = true)
 |-- displacement: double (nullable = true)
 |-- hp: double (nullable = true)
 |-- weight: double (nullable = true)
 |-- acceleration: double (nullable = true)
 |-- model_year: integer (nullable = true)
 |-- origin: integer (nullable = true)
 |-- features: vector (nullable = true)



In [114]:
df_features.show(5,False)

+----+---------+------------+-----+------+------------+----------+------+--------------------------------------+
|mpg |cylinders|displacement|hp   |weight|acceleration|model_year|origin|features                              |
+----+---------+------------+-----+------+------------+----------+------+--------------------------------------+
|18.0|8        |307.0       |130.0|3504.0|12.0        |70        |1     |[8.0,307.0,130.0,3504.0,12.0,70.0,1.0]|
|15.0|8        |350.0       |165.0|3693.0|11.5        |70        |1     |[8.0,350.0,165.0,3693.0,11.5,70.0,1.0]|
|18.0|8        |318.0       |150.0|3436.0|11.0        |70        |1     |[8.0,318.0,150.0,3436.0,11.0,70.0,1.0]|
|16.0|8        |304.0       |150.0|3433.0|12.0        |70        |1     |[8.0,304.0,150.0,3433.0,12.0,70.0,1.0]|
|17.0|8        |302.0       |140.0|3449.0|10.5        |70        |1     |[8.0,302.0,140.0,3449.0,10.5,70.0,1.0]|
+----+---------+------------+-----+------+------------+----------+------+-----------------------

In [115]:
print(df_features.count(), len(df_features.columns))

392 9


In [116]:
df_model = df_features.select('features', 'mpg')

In [117]:
df_model.show(5,False)

+--------------------------------------+----+
|features                              |mpg |
+--------------------------------------+----+
|[8.0,307.0,130.0,3504.0,12.0,70.0,1.0]|18.0|
|[8.0,350.0,165.0,3693.0,11.5,70.0,1.0]|15.0|
|[8.0,318.0,150.0,3436.0,11.0,70.0,1.0]|18.0|
|[8.0,304.0,150.0,3433.0,12.0,70.0,1.0]|16.0|
|[8.0,302.0,140.0,3449.0,10.5,70.0,1.0]|17.0|
+--------------------------------------+----+
only showing top 5 rows



In [118]:
print(df_model.count(), len(df_model.columns))

392 2


#### For training the regression model and its subsequent evaluation, generate the training data and test data from your refined AutoMPG dataset, the dataset will be split 80/20 so that 80% of the data will be used to train the model and the remaining 20% to evaluate the model. Provide a shape for each dataset.

In [119]:
train_df, test_df = df_model.randomSplit([0.80, 0.20])

In [120]:
print(train_df.count(), len(train_df.columns))

323 2


In [121]:
train_df.show(5, False)

+-------------------------------------+----+
|features                             |mpg |
+-------------------------------------+----+
|[3.0,70.0,90.0,2124.0,13.5,73.0,3.0] |18.0|
|[3.0,70.0,97.0,2330.0,13.5,72.0,3.0] |19.0|
|[3.0,70.0,100.0,2420.0,12.5,80.0,3.0]|23.7|
|[3.0,80.0,110.0,2720.0,13.5,77.0,3.0]|21.5|
|[4.0,68.0,49.0,1867.0,19.5,73.0,2.0] |29.0|
+-------------------------------------+----+
only showing top 5 rows



In [122]:
print(test_df.count(), len(test_df.columns))

69 2


In [123]:
test_df.show(5, False)

+------------------------------------+----+
|features                            |mpg |
+------------------------------------+----+
|[4.0,79.0,67.0,2000.0,16.0,74.0,2.0]|31.0|
|[4.0,81.0,60.0,1760.0,16.1,81.0,3.0]|35.1|
|[4.0,83.0,61.0,2003.0,19.0,74.0,3.0]|32.0|
|[4.0,86.0,64.0,1875.0,16.4,81.0,1.0]|39.0|
|[4.0,86.0,65.0,2019.0,16.4,80.0,3.0]|37.2|
+------------------------------------+----+
only showing top 5 rows



#### Using the training data, evaluate the correlation between mpg and each of the independent Use the training data to fit a regression model to predict mpg given values for the number of cylinders, displacement, hp,  weight, acceleration, model_year and origin. 

In [124]:
from pyspark.ml.regression import LinearRegression

In [125]:
lin_model = LinearRegression(labelCol = 'mpg').fit(train_df)

#### For the trained model, find out the y-intercept value And what are the coefficients for each of the independent variables in the fitted model. 

In [126]:
print(lin_model.intercept)
#y-intercept value

-18.314611243742327


In [127]:
print(lin_model.coefficients)

[-0.2513495647412005,0.01822179013048275,-0.016632031389036225,-0.006672671865984077,0.10859215994827294,0.7550793976016926,1.4188299216433296]


Coeeficient of cylinder : -0.43932763979855943
Coeeficient of displacement :  0.017789853702498543
Coeeficient of hp : -0.00993030935705881
Coeeficient of weight : -0.0068300965930229475
Coeeficient of accelaration :  0.1577163540503644
Coeeficient of model_year :  0.7509857969857413
Coeeficient of origin :  1.2807157574083199


#### For the trained model, print the Mean Sum of Squared Error and R-Square values. What does this tell you about the usefulness of the fitted model to predict mpg? Is the model any good? Remember, R-Square values range from 0 to 1.00; closer to 1.00 the better the predictive power of the model. 

In [128]:
training_predictions = lin_model.evaluate(train_df)

In [129]:
print(training_predictions.r2)

0.816505814775914


In [130]:
print(training_predictions.meanSquaredError)

10.977391460650379


Usefulness of the fitted model - R-squared value implies it has better prediction power; It is a good model

#### Now that you have trained your model, evaluate it using the test data. Using the values of R-Square and Mean sum of Squared Error, what can you say about the reliability of the trained model to predict mpg with test data. 

In [131]:
test_predictions = lin_model.evaluate(test_df)

In [132]:
print(test_predictions.r2)


0.8377414126175153


In [133]:
print(test_predictions.meanSquaredError)

10.414349825372787


Test and trained models have similar r2 and mean squared error values, hence we can say it is reliable.