<a href="https://colab.research.google.com/github/Pedro-hn/Pyspark/blob/main/Pyspark_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install pyspark

In [None]:
from pyspark.sql import SparkSession
# Reuse the active Spark session if one exists; otherwise start one named "ml".
spark = (
    SparkSession.builder
    .appName("ml")
    .getOrCreate()
)

In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression


In [None]:
# Load the CSV, using the first row as column names and letting Spark infer
# each column's type from the data.
df = spark.read.csv(
    "train.csv",
    header=True,
    inferSchema=True,
)

In [None]:
# Preview the first five rows of the loaded data.
df.show(5)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| NULL|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| NULL|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| NULL|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+------

In [None]:
# Print each column's inferred type and nullability.
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [None]:
# List the column names of the loaded DataFrame.
df.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [None]:
# Narrow df to six numeric columns; the ID and the string columns are dropped.
df = df.select(
    "SibSp",
    "Age",
    "Parch",
    "Survived",
    "Pclass",
    "Fare",
)

In [None]:

# Combine the SibSp and Parch columns into one vector column, which is the
# input format the regression cell below reads via `featuresCol`.
featureassembler = VectorAssembler(
    inputCols=["SibSp", "Parch"],
    outputCol="Independent Features",
)

In [None]:
# Add the assembled vector column to df; the existing columns are kept.
output = featureassembler.transform(df)

In [None]:
# Display the first rows of the assembled DataFrame.
output.show()

+-----+----+-----+--------+------+-------+--------------------+
|SibSp| Age|Parch|Survived|Pclass|   Fare|Independent Features|
+-----+----+-----+--------+------+-------+--------------------+
|    1|22.0|    0|       0|     3|   7.25|           [1.0,0.0]|
|    1|38.0|    0|       1|     1|71.2833|           [1.0,0.0]|
|    0|26.0|    0|       1|     3|  7.925|           (2,[],[])|
|    1|35.0|    0|       1|     1|   53.1|           [1.0,0.0]|
|    0|35.0|    0|       0|     3|   8.05|           (2,[],[])|
|    0|NULL|    0|       0|     3| 8.4583|           (2,[],[])|
|    0|54.0|    0|       0|     1|51.8625|           (2,[],[])|
|    3| 2.0|    1|       0|     3| 21.075|           [3.0,1.0]|
|    0|27.0|    2|       1|     3|11.1333|           [0.0,2.0]|
|    1|14.0|    0|       1|     2|30.0708|           [1.0,0.0]|
|    1| 4.0|    1|       1|     3|   16.7|           [1.0,1.0]|
|    0|58.0|    0|       1|     1|  26.55|           (2,[],[])|
|    0|20.0|    0|       0|     3|   8.0

In [None]:
# Column names after assembling; the new vector column appears last.
output.columns

['SibSp', 'Age', 'Parch', 'Survived', 'Pclass', 'Fare', 'Independent Features']

In [None]:
# Keep only the regression label (Fare) and the assembled feature vector.
finalized_data = output.select(["Fare", "Independent Features"])

In [None]:
# Preview the first three rows of the label/feature pair.
finalized_data.show(3)

+-------+--------------------+
|   Fare|Independent Features|
+-------+--------------------+
|   7.25|           [1.0,0.0]|
|71.2833|           [1.0,0.0]|
|  7.925|           (2,[],[])|
+-------+--------------------+
only showing top 3 rows



In [None]:
# Randomly split the rows roughly 70/30 into train and test sets. The fixed
# seed makes the random assignment repeatable for the same input, so the row
# counts and the metrics computed below do not drift between runs.
train_data, test_data = finalized_data.randomSplit([0.70, 0.30], seed=42)

In [None]:
# Report the training split's shape as (row count, column count).
n_train_rows = train_data.count()
n_train_cols = len(train_data.columns)
print((n_train_rows, n_train_cols))

(620, 2)


In [None]:
# Number of rows in the test split.
test_data.count()

271

In [None]:
# Configure the linear-regression estimator, then fit it on the training
# split. `regressor` ends up bound to the fitted model that later cells read.
lr_estimator = LinearRegression(
    featuresCol="Independent Features",
    labelCol="Fare",
)
regressor = lr_estimator.fit(train_data)

In [None]:
# Fitted coefficients, one per entry in the feature vector.
regressor.coefficients

DenseVector([3.7512, 13.5379])

In [None]:
# Intercept term of the fitted model.
regressor.intercept

27.190789721236627

In [None]:
# Evaluate the fitted model on the held-out split. Despite its name,
# `predicted` is the evaluation result: later cells read `.predictions` and
# the error metrics from it.
predicted = regressor.evaluate(test_data)

In [None]:
# Show the test rows alongside the model's `prediction` column.
predicted.predictions.show()

+------+--------------------+------------------+
|  Fare|Independent Features|        prediction|
+------+--------------------+------------------+
|   0.0|           (2,[],[])|27.190789721236627|
|   0.0|           (2,[],[])|27.190789721236627|
|   0.0|           (2,[],[])|27.190789721236627|
|   0.0|           (2,[],[])|27.190789721236627|
|  6.75|           (2,[],[])|27.190789721236627|
|  7.05|           (2,[],[])|27.190789721236627|
|  7.05|           (2,[],[])|27.190789721236627|
|7.0542|           (2,[],[])|27.190789721236627|
|7.0542|           [1.0,0.0]| 30.94195274578732|
| 7.125|           (2,[],[])|27.190789721236627|
| 7.125|           (2,[],[])|27.190789721236627|
| 7.225|           (2,[],[])|27.190789721236627|
| 7.225|           (2,[],[])|27.190789721236627|
| 7.225|           (2,[],[])|27.190789721236627|
|7.2292|           (2,[],[])|27.190789721236627|
|7.2292|           (2,[],[])|27.190789721236627|
|7.2292|           (2,[],[])|27.190789721236627|
|7.2292|           (

In [None]:
# Test-set error metrics, displayed as (mean absolute error, mean squared error).
mae = predicted.meanAbsoluteError
mse = predicted.meanSquaredError
mae, mse

(24.548502597296153, 1041.339930335139)