In [1]:
import pyspark

In [2]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import Vectors

In [3]:
import pandas as pd
# Load the iris CSV over HTTP straight into a pandas DataFrame.
df = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/uiuc-cse/data-fa14/gh-pages/data/iris.csv",
)

In [4]:
# Reuse the already-running SparkContext when this cell is re-executed;
# constructing a second context in the same kernel raises an error.
# The app name only takes effect when a new context is actually created.
sc = pyspark.SparkContext.getOrCreate(
    conf=pyspark.SparkConf().setAppName("ML_Examples")
)

In [5]:
# SQL entry point bound to `sc`; used later to build Spark DataFrames.
sql = pyspark.SQLContext(sparkContext=sc)

In [6]:
# Preview the first five rows of the pandas frame.
df.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [7]:
# Scratch example of a (species name, four measurements) pair, using the
# values of the first row in the preview above.
('setosa',(5.1,3.5,1.4,0.2))

('setosa', (5.1, 3.5, 1.4, 0.2))

In [8]:
# Numeric label assigned to each species name (the `label` column later on).
species = {'setosa':1,'versicolor':2,'virginica':3}

# Build one (label, feature vector) pair per row of `df`.
# `DataFrame.irow` has been removed from pandas, so rows are walked with
# `itertuples`, which also avoids materialising every row five times over.
# The vectors stay sparse so the pairs keep the same form as before.
sdf = [
    (
        species[row.species],
        Vectors.sparse(
            4,
            [0, 1, 2, 3],
            [row.sepal_length, row.sepal_width, row.petal_length, row.petal_width],
        ),
    )
    for row in df.itertuples(index=False)
]

# Sanity check: species name of the first row.
df.iloc[0].species



'setosa'

In [12]:
# Inspect the first (label, features) pair.
sdf[0]

(1, SparseVector(4, {0: 5.1, 1: 3.5, 2: 1.4, 3: 0.2}))

In [14]:
# Turn the list of (label, features) pairs into a Spark DataFrame with
# named columns.
data = sql.createDataFrame(
    data=sdf,
    schema=['label', 'features'],
)

In [15]:
# Show roughly 10% of the rows, sampled without replacement.
# A fixed seed keeps the displayed sample the same across re-runs.
data.sample(False, 0.1, seed=42).show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    1|(4,[0,1,2,3],[4.6...|
|    1|(4,[0,1,2,3],[4.8...|
|    1|(4,[0,1,2,3],[5.5...|
|    1|(4,[0,1,2,3],[4.4...|
|    2|(4,[0,1,2,3],[6.9...|
|    2|(4,[0,1,2,3],[5.9...|
|    2|(4,[0,1,2,3],[6.2...|
|    2|(4,[0,1,2,3],[6.0...|
|    2|(4,[0,1,2,3],[6.3...|
|    3|(4,[0,1,2,3],[6.5...|
|    3|(4,[0,1,2,3],[6.4...|
|    3|(4,[0,1,2,3],[6.4...|
+-----+--------------------+



In [16]:
# Estimator with its input column names spelled out (these are the
# library defaults, so behaviour is unchanged).
# NOTE(review): `label` is a species code (1/2/3), i.e. categorical; a
# classifier may be a better fit than a regression — confirm intent.
ln = LinearRegression(featuresCol="features", labelCol="label")

In [17]:
# Fit the estimator on the full dataset; `lr` is the resulting fitted model.
lr = ln.fit(dataset=data)

In [18]:
# Display the training residuals (20 rows is the default for `show`).
lr.summary.residuals.show(n=20)

+--------------------+
|           residuals|
+--------------------+
| 0.08265827249444202|
|  0.0385897564770008|
| 0.04818969137910112|
|-0.01260877605635...|
|  0.0761081708368967|
| -0.0568023484254212|
|-0.03762591579550...|
| 0.04455994327617674|
|-0.02070501984131...|
| 0.08130307490321231|
| 0.10172866260870894|
|-8.84875996338685...|
|  0.0886050221193293|
| 0.10183470512435377|
| 0.22699779659281427|
|0.043640590355711995|
| 0.03399820444334323|
|  0.0216688605228037|
| 0.03268545786238197|
|0.012240856320905547|
+--------------------+
only showing top 20 rows



In [19]:
# Score the fitted model on a ~10% sample drawn without replacement.
# A fixed seed makes the sampled rows, and so the predictions listed
# below, the same across re-runs.
# NOTE(review): the sample comes from the same `data` the model was fitted
# on, so this is not a held-out evaluation — confirm that is intended.
res = lr.evaluate(data.sample(False, 0.1, seed=42))
res.predictions.collect()

[Row(label=1, features=SparseVector(4, {0: 5.4, 1: 3.9, 2: 1.7, 3: 0.4}), prediction=1.0568023484254212),
 Row(label=1, features=SparseVector(4, {0: 5.0, 1: 3.0, 2: 1.6, 3: 0.2}), prediction=0.9958363736280718),
 Row(label=1, features=SparseVector(4, {0: 4.4, 1: 3.0, 2: 1.3, 3: 0.2}), prediction=0.9935808369523569),
 Row(label=1, features=SparseVector(4, {0: 5.0, 1: 3.5, 2: 1.3, 3: 0.3}), prediction=0.9666051475893149),
 Row(label=1, features=SparseVector(4, {0: 4.6, 1: 3.2, 2: 1.4, 3: 0.2}), prediction=0.9854845931673999),
 Row(label=2, features=SparseVector(4, {0: 6.4, 1: 3.2, 2: 4.5, 3: 1.5}), prediction=2.2845165996040464),
 Row(label=2, features=SparseVector(4, {0: 6.0, 1: 2.7, 2: 5.1, 3: 1.6}), prediction=2.547723649554892),
 Row(label=2, features=SparseVector(4, {0: 5.1, 1: 2.5, 2: 3.0, 3: 1.1}), prediction=1.8736890934430042),
 Row(label=3, features=SparseVector(4, {0: 6.3, 1: 3.3, 2: 6.0, 3: 2.5}), prediction=3.241462894235842),
 Row(label=3, features=SparseVector(4, {0: 5.8, 