<a href="https://colab.research.google.com/github/Omjade/RegressionModel/blob/main/APACHE_SPARK.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pyspark

/bin/bash: line 1: Apip: command not found


In [None]:
from pyspark.sql import SparkSession

In [None]:
# Start the Spark session used by the cells below, or reuse one that is
# already running; the application is labelled "Basics".
spark = (
    SparkSession.builder
    .appName("Basics")
    .getOrCreate()
)

In [None]:
# Build a toy DataFrame from three (name, age) records.
columns = ["Name", "Age"]
data = [
    ("Alice", 29),
    ("Bob", 35),
    ("Cathy", 24),
]
df = spark.createDataFrame(data, schema=columns)

In [None]:
# Print the toy DataFrame as a text table.
df.show()

+-----+---+
| Name|Age|
+-----+---+
|Alice| 29|
|  Bob| 35|
|Cathy| 24|
+-----+---+



In [13]:
# Load the California housing training CSV. The first line of the file is
# treated as the header row and column types are inferred from the values.
data = spark.read.csv(
    "./sample_data/california_housing_train.csv",
    header=True,
    inferSchema=True,
)



In [14]:
# Peek at the first record; called without an argument, head() returns a single Row.
data.head()

Row(longitude=-114.31, latitude=34.19, housing_median_age=15.0, total_rooms=5612.0, total_bedrooms=1283.0, population=1015.0, households=472.0, median_income=1.4936, median_house_value=66900.0)

In [15]:
# Print the leading rows of the housing data as a text table.
data.show()

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+
|  -114.31|   34.19|              15.0|     5612.0|        1283.0|    1015.0|     472.0|       1.4936|           66900.0|
|  -114.47|    34.4|              19.0|     7650.0|        1901.0|    1129.0|     463.0|         1.82|           80100.0|
|  -114.56|   33.69|              17.0|      720.0|         174.0|     333.0|     117.0|       1.6509|           85700.0|
|  -114.57|   33.64|              14.0|     1501.0|         337.0|     515.0|     226.0|       3.1917|           73400.0|
|  -114.57|   33.57|              20.0|     1454.0|         326.0|     624.0|     262.0|        1.925|           65500.0|
|  -114.58|   33.63|    

In [16]:
# Print every column's name, inferred type and nullability as a tree.
data.printSchema()

root
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- housing_median_age: double (nullable = true)
 |-- total_rooms: double (nullable = true)
 |-- total_bedrooms: double (nullable = true)
 |-- population: double (nullable = true)
 |-- households: double (nullable = true)
 |-- median_income: double (nullable = true)
 |-- median_house_value: double (nullable = true)



In [17]:
# Summary statistics (count, mean, stddev, ...) for each column, shown as a table.
data.describe().show()

+-------+-------------------+------------------+------------------+-----------------+-----------------+------------------+-----------------+------------------+------------------+
|summary|          longitude|          latitude|housing_median_age|      total_rooms|   total_bedrooms|        population|       households|     median_income|median_house_value|
+-------+-------------------+------------------+------------------+-----------------+-----------------+------------------+-----------------+------------------+------------------+
|  count|              17000|             17000|             17000|            17000|            17000|             17000|            17000|             17000|             17000|
|   mean|-119.56210823529375|  35.6252247058827| 28.58935294117647|2643.664411764706|539.4108235294118|1429.5739411764705|501.2219411764706| 3.883578100000021|207300.91235294117|
| stddev| 2.0051664084260357|2.1373397946570867|12.586936981660406|2179.947071452777|421.4994515798648| 1

In [18]:
# Total number of rows in the housing data.
data.count()

17000

In [19]:
# Project the DataFrame down to the "Name" column only.
df.select("Name").show()

+-----+
| Name|
+-----+
|Alice|
|  Bob|
|Cathy|
+-----+



In [20]:
# Keep only the rows whose Age is strictly greater than 30.
df.filter(df["Age"] > 30).show()

+----+---+
|Name|Age|
+----+---+
| Bob| 35|
+----+---+



In [21]:
# Expose the DataFrame to Spark SQL as the temporary view "people"
# (replacing any existing view of that name), then query it with SQL.
df.createOrReplaceTempView("people")
(
    spark
    .sql("SELECT * FROM people WHERE Age > 30")
    .show()
)


+----+---+
|Name|Age|
+----+---+
| Bob| 35|
+----+---+



In [22]:
# Two small tables that share an "ID" key column.
df1 = spark.createDataFrame(
    [(1, "Alice"), (2, "Bob")],
    schema=["ID", "Name"],
)
df2 = spark.createDataFrame(
    [(1, "F"), (2, "M")],
    schema=["ID", "Gender"],
)

# Join on the shared key; passing the column name (rather than an expression)
# yields a single "ID" column in the result.
df1.join(df2, on="ID").show()


+---+-----+------+
| ID| Name|Gender|
+---+-----+------+
|  1|Alice|     F|
|  2|  Bob|     M|
+---+-----+------+



In [23]:
# Count how many rows fall under each distinct Age value.
df.groupBy("Age").count().show()

+---+-----+
|Age|count|
+---+-----+
| 29|    1|
| 35|    1|
| 24|    1|
+---+-----+



In [24]:
# Replace null values with 0 and print the result. The toy data contains no
# nulls, so the table comes out unchanged.
df.fillna(value=0).show()


+-----+---+
| Name|Age|
+-----+---+
|Alice| 29|
|  Bob| 35|
|Cathy| 24|
+-----+---+



In [33]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler

# Linear-regression walkthrough on a tiny in-memory dataset.
#
# The dataset variables in this cell carry a `training_` prefix so that the cell
# does not overwrite the `data`, `columns` and `df` variables created by
# earlier cells, which other cells in the notebook still display.

# Step 1: Create a Spark Session
# getOrCreate() reuses the session that is already active in this kernel if
# there is one, and only builds a new one otherwise.
spark = SparkSession.builder.appName("Linear Regression Example").getOrCreate()

# Step 2: Prepare the Dataset
# Sample data: Features (X1, X2) and Target (Y)
training_rows = [
    (1.0, 2.0, 4.0),
    (2.0, 1.0, 5.0),
    (3.0, 3.0, 7.0),
    (4.0, 2.0, 10.0),
    (5.0, 3.0, 12.0)
]
training_columns = ["Feature1", "Feature2", "Target"]

# Create a DataFrame
training_df = spark.createDataFrame(training_rows, training_columns)
training_df.show()

# Step 3: Feature Engineering (Combine Features)
# VectorAssembler combines multiple feature columns into a single vector column
assembler = VectorAssembler(inputCols=["Feature1", "Feature2"], outputCol="features")
vectorized_df = assembler.transform(training_df)

# View the transformed data
vectorized_df.select("features", "Target").show()

# Step 4: Create and Train the Linear Regression Model
lr = LinearRegression(featuresCol="features", labelCol="Target")
lr_model = lr.fit(vectorized_df)

# Step 5: Model Summary and Results
print("Coefficients: ", lr_model.coefficients)
print("Intercept: ", lr_model.intercept)

# Step 6: Predictions
# The model is applied to the same rows it was trained on, so this table shows
# the in-sample fit rather than performance on unseen data.
predictions = lr_model.transform(vectorized_df)
predictions.select("features", "Target", "prediction").show()

# Stop the Spark Session
# NOTE: this ends the session for the whole kernel, not just for this cell.
# Re-run the session-creation cell before running any further Spark work.
spark.stop()


+--------+--------+------+
|Feature1|Feature2|Target|
+--------+--------+------+
|     1.0|     2.0|   4.0|
|     2.0|     1.0|   5.0|
|     3.0|     3.0|   7.0|
|     4.0|     2.0|  10.0|
|     5.0|     3.0|  12.0|
+--------+--------+------+

+---------+------+
| features|Target|
+---------+------+
|[1.0,2.0]|   4.0|
|[2.0,1.0]|   5.0|
|[3.0,3.0]|   7.0|
|[4.0,2.0]|  10.0|
|[5.0,3.0]|  12.0|
+---------+------+

Coefficients:  [2.0842105263157915,0.05263157894736627]
Intercept:  1.2315789473684189
+---------+------+------------------+
| features|Target|        prediction|
+---------+------+------------------+
|[1.0,2.0]|   4.0|3.4210526315789433|
|[2.0,1.0]|   5.0|5.4526315789473685|
|[3.0,3.0]|   7.0| 7.642105263157893|
|[4.0,2.0]|  10.0| 9.673684210526318|
|[5.0,3.0]|  12.0|11.810526315789476|
+---------+------+------------------+



In [28]:
# A bare DataFrame as the last expression displays only its column names and types, not its rows.
df

DataFrame[Name: string, Age: bigint]