In [0]:
from pyspark.sql import SparkSession

In [0]:
spark = SparkSession.builder.appName('Co2').getOrCreate()

In [0]:
df = spark.read.csv('dbfs:/FileStore/tables/FuelConsumption.csv',inferSchema=True,header=True)

In [0]:
df.show(5)

+---------+-----+----------+------------+----------+---------+------------+--------+--------------------+-------------------+--------------------+------------------------+--------------+----+--------------+----+----+----+----+----+----+----+----+----+
|MODELYEAR| MAKE|     MODEL|VEHICLECLASS|ENGINESIZE|CYLINDERS|TRANSMISSION|FUELTYPE|FUELCONSUMPTION_CITY|FUELCONSUMPTION_HWY|FUELCONSUMPTION_COMB|FUELCONSUMPTION_COMB_MPG|CO2EMISSIONS12|_c13|CO2EMISSIONS14|_c15|_c16|_c17|_c18|_c19|_c20|_c21|_c22|_c23|
+---------+-----+----------+------------+----------+---------+------------+--------+--------------------+-------------------+--------------------+------------------------+--------------+----+--------------+----+----+----+----+----+----+----+----+----+
|     2014|ACURA|       ILX|     COMPACT|       2.0|        4|         AS5|       Z|                 9.9|                6.7|                 8.5|                      33|           196|null|          null|null|null|null|null|null|null|null|nul

In [0]:
df.printSchema()

root
 |-- MODELYEAR: integer (nullable = true)
 |-- MAKE: string (nullable = true)
 |-- MODEL: string (nullable = true)
 |-- VEHICLECLASS: string (nullable = true)
 |-- ENGINESIZE: double (nullable = true)
 |-- CYLINDERS: integer (nullable = true)
 |-- TRANSMISSION: string (nullable = true)
 |-- FUELTYPE: string (nullable = true)
 |-- FUELCONSUMPTION_CITY: double (nullable = true)
 |-- FUELCONSUMPTION_HWY: double (nullable = true)
 |-- FUELCONSUMPTION_COMB: double (nullable = true)
 |-- FUELCONSUMPTION_COMB_MPG: integer (nullable = true)
 |-- CO2EMISSIONS12: integer (nullable = true)
 |-- _c13: string (nullable = true)
 |-- CO2EMISSIONS14: string (nullable = true)
 |-- _c15: double (nullable = true)
 |-- _c16: string (nullable = true)
 |-- _c17: string (nullable = true)
 |-- _c18: string (nullable = true)
 |-- _c19: string (nullable = true)
 |-- _c20: string (nullable = true)
 |-- _c21: string (nullable = true)
 |-- _c22: string (nullable = true)
 |-- _c23: string (nullable = true)



In [0]:
df.columns

Out[9]: ['MODELYEAR',
 'MAKE',
 'MODEL',
 'VEHICLECLASS',
 'ENGINESIZE',
 'CYLINDERS',
 'TRANSMISSION',
 'FUELTYPE',
 'FUELCONSUMPTION_CITY',
 'FUELCONSUMPTION_HWY',
 'FUELCONSUMPTION_COMB',
 'FUELCONSUMPTION_COMB_MPG',
 'CO2EMISSIONS12',
 '_c13',
 'CO2EMISSIONS14',
 '_c15',
 '_c16',
 '_c17',
 '_c18',
 '_c19',
 '_c20',
 '_c21',
 '_c22',
 '_c23']

In [0]:
df.select('CO2EMISSIONS14').distinct().show()

+--------------+
|CO2EMISSIONS14|
+--------------+
|   0.243761352|
|          null|
|          Mean|
|           300|
|          Skew|
|           STD|
|   0.755123306|
|        Median|
|   449.9208237|
|      Variance|
|          Mode|
|   0.998884659|
+--------------+



In [0]:
Selected_Features = [ 'MODELYEAR','ENGINESIZE','CYLINDERS','FUELTYPE','FUELCONSUMPTION_CITY','FUELCONSUMPTION_HWY',
 'FUELCONSUMPTION_COMB','FUELCONSUMPTION_COMB_MPG','CO2EMISSIONS12']

In [0]:
for feature in Selected_Features:
    print(feature,df.where(df[feature].isNull()).count())
    print()

MODELYEAR 0

ENGINESIZE 0

CYLINDERS 0

FUELTYPE 0

FUELCONSUMPTION_CITY 0

FUELCONSUMPTION_HWY 0

FUELCONSUMPTION_COMB 0

FUELCONSUMPTION_COMB_MPG 0

CO2EMISSIONS12 0



In [0]:
data = df.select(Selected_Features)

In [0]:
data.printSchema()

root
 |-- MODELYEAR: integer (nullable = true)
 |-- ENGINESIZE: double (nullable = true)
 |-- CYLINDERS: integer (nullable = true)
 |-- FUELTYPE: string (nullable = true)
 |-- FUELCONSUMPTION_CITY: double (nullable = true)
 |-- FUELCONSUMPTION_HWY: double (nullable = true)
 |-- FUELCONSUMPTION_COMB: double (nullable = true)
 |-- FUELCONSUMPTION_COMB_MPG: integer (nullable = true)
 |-- CO2EMISSIONS12: integer (nullable = true)



In [0]:
data = spark.createDataFrame(pandas_Data)

In [0]:
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col

In [0]:
assembler = VectorAssembler(inputCols=['ENGINESIZE',
 'CYLINDERS',
 'FUELCONSUMPTION_CITY',
 'FUELCONSUMPTION_HWY',
 'FUELCONSUMPTION_COMB',
 'FUELCONSUMPTION_COMB_MPG',
 'FUELTYPE_E',
 'FUELTYPE_X',
 'FUELTYPE_Z'],outputCol='features')

In [0]:
output = assembler.transform(data)

In [0]:
train,test = output.select('features',col('CO2EMISSIONS12').alias('label')).randomSplit(weights=[0.7,0.3])

In [0]:
from pyspark.ml.regression import LinearRegression

In [0]:
train.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[1.5,4.0,6.0,5.8,...|  136|
|[2.0,4.0,7.6,5.5,...|  181|
|[2.0,4.0,9.9,6.7,...|  196|
|[2.0,4.0,9.9,7.4,...|  202|
|[2.0,4.0,9.9,8.3,...|  212|
+--------------------+-----+
only showing top 5 rows



In [0]:
lr = LinearRegression()

# Fit the model
lrModel = lr.fit(train)

# Print the coefficients and intercept for linear regression
print("Coefficients: %s" % str(lrModel.coefficients))
print("Intercept: %s" % str(lrModel.intercept))

Coefficients: [0.5393658601606603,1.472076665871427,1.2871774673974758,0.9792321313982494,15.73814999179503,-1.3903867180691372,-142.9925665957087,-33.24602827863185,-34.02246050175956]
Intercept: 116.01545389283972


In [0]:
lrModel.evaluate(train).r2

Out[105]: 0.9928079150241511

In [0]:
lrModel.evaluate(test).r2

Out[104]: 0.9923397000803653

In [0]:
df.select('CO2EMISSIONS12').describe().show()

+-------+-----------------+
|summary|   CO2EMISSIONS12|
+-------+-----------------+
|  count|             1067|
|   mean|256.2286785379569|
| stddev|63.37230444279997|
|    min|              108|
|    max|              488|
+-------+-----------------+



In [0]:
lrModel.evaluate(train).rootMeanSquaredError,lrModel.evaluate(test).rootMeanSquaredError

Out[108]: (5.245710500807938, 5.832233797076175)