# Part 1 of this tutorial covers the following:


* Installation of Spark
* Reading the Dataset
* Checking the Data Types of the Columns (Schema)
* Selecting Column and Indexing
* Generating Descriptive Statistics
* Adding Columns 
* Dropping Columns
* Renaming Columns


# Old Installation Method of Spark
This manual setup is no longer needed, because PySpark can now be installed directly with pip. It is kept here for demonstration purposes only.


In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://www-us.apache.org/dist/spark/spark-3.0.2/spark-3.0.2-bin-hadoop2.7.tgz

In [None]:
!tar xf spark-3.0.2-bin-hadoop2.7.tgz
!pip install -q findspark

In [None]:
import os

# Export the JDK and Spark locations as environment variables in one step.
# SPARK_HOME assumes the archive was unpacked under /content (the usual Colab
# working directory) -- adjust both paths if running elsewhere.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.0.2-bin-hadoop2.7",
    }
)

In [None]:
# findspark is the helper package installed with pip in an earlier cell.
import findspark
# NOTE(review): init() presumably uses the SPARK_HOME set above to make
# pyspark importable -- confirm against the findspark documentation.
findspark.init()

# New Installation Method of Spark
This is the method used throughout this tutorial.

In [None]:
!pip install pyspark py4j

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.2.tar.gz (281.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.4/281.4 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py4j
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.5/200.5 KB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 KB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.2-py2.py3-none-any.whl size=281824025 sha256=b5e0e1110198e5e16b377e56e3f7c2cd3bbc958dd6ad5f302b6682a592c2ce77
 

# Building the PySpark Session

In [None]:
from pyspark.sql import SparkSession

In [None]:
# Build the session step by step: name the application, then create a new
# session or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("tutorial-1(p-1)")
    .getOrCreate()
)

In [None]:
# A bare expression on the last line of a cell is displayed as the cell's output.
spark

# Reading the Dataset

In [None]:
# Treat the first line of the file as column names and let Spark infer the
# column types instead of reading everything as strings.
# NOTE(review): the file lives under /content/drive (Google Drive); no cell
# visible here mounts it -- confirm Drive is mounted before running.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/content/drive/MyDrive/PGD_BigData/Spark/Datasets/simple.csv")
)

In [None]:
# Print the rows of the freshly loaded DataFrame as a text table.
df.show()

+-----+---+----------+
| Name|Age|Experience|
+-----+---+----------+
|  Ram| 26|         7|
|Shaym| 28|         8|
|Madhu| 32|        12|
|Krish| 38|        16|
|Sunny| 31|        11|
+-----+---+----------+



# Checking the Schema
The schema lists each column's name, its data type, and whether it may contain nulls.

In [None]:
# Print the schema as a tree: one line per column with its type and nullability.
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [None]:
# Check the Python type of the object returned by spark.read.csv:
# it is a PySpark DataFrame (pyspark.sql.dataframe.DataFrame).
type(df)

pyspark.sql.dataframe.DataFrame

In [None]:
# head(3) returns the first three rows as a list of Row objects,
# whereas show() prints a formatted table.
df.head(3)

[Row(Name='Ram', Age=26, Experience=7),
 Row(Name='Shaym', Age=28, Experience=8),
 Row(Name='Madhu', Age=32, Experience=12)]

# Selecting Columns and Indexing

In [None]:
# Project a single column; passing the name directly is equivalent to
# passing a one-element list.
df.select('Name').show()

+-----+
| Name|
+-----+
|  Ram|
|Shaym|
|Madhu|
|Krish|
|Sunny|
+-----+



In [None]:
# Project several columns by listing their names as separate arguments.
df.select('Name', 'Experience').show()

+-----+----------+
| Name|Experience|
+-----+----------+
|  Ram|         7|
|Shaym|         8|
|Madhu|        12|
|Krish|        16|
|Sunny|        11|
+-----+----------+



# Generating Descriptive Statistics

In [None]:
# describe() reports count, mean, stddev, min and max for every column.
# For the string column Name, mean and stddev are null and min/max follow
# string ordering, as the output below shows.
df.describe().show()

+-------+-----+-----------------+------------------+
|summary| Name|              Age|        Experience|
+-------+-----+-----------------+------------------+
|  count|    5|                5|                 5|
|   mean| null|             31.0|              10.8|
| stddev| null|4.582575694955839|3.5637059362410923|
|    min|Krish|               26|                 7|
|    max|Sunny|               38|                16|
+-------+-----+-----------------+------------------+



# Adding Columns

In [None]:
# Derive a new column by adding 2 to every Experience value. withColumn
# returns a new DataFrame, so the result is assigned back to df.
df = df.withColumn(
    'Experience After 2 year',
    df['Experience'] + 2,
)

In [None]:
# Display the DataFrame again to confirm the new column was added.
df.show()

+-----+---+----------+-----------------------+
| Name|Age|Experience|Experience After 2 year|
+-----+---+----------+-----------------------+
|  Ram| 26|         7|                      9|
|Shaym| 28|         8|                     10|
|Madhu| 32|        12|                     14|
|Krish| 38|        16|                     18|
|Sunny| 31|        11|                     13|
+-----+---+----------+-----------------------+



# Dropping Columns

In [None]:
# Remove the derived column again; drop returns a new DataFrame, so the
# result is assigned back to df.
df = df.drop(
    'Experience After 2 year',
)

In [None]:
# Display the DataFrame again to confirm the column was removed.
df.show()

+-----+---+----------+
| Name|Age|Experience|
+-----+---+----------+
|  Ram| 26|         7|
|Shaym| 28|         8|
|Madhu| 32|        12|
|Krish| 38|        16|
|Sunny| 31|        11|
+-----+---+----------+



# Renaming Columns

In [None]:
# Show a copy of the DataFrame with 'Name' renamed to 'New Name'. The result
# is only displayed, not assigned, so df itself keeps the original column name.
df.withColumnRenamed(
    existing='Name',
    new='New Name',
).show()

+--------+---+----------+
|New Name|Age|Experience|
+--------+---+----------+
|     Ram| 26|         7|
|   Shaym| 28|         8|
|   Madhu| 32|        12|
|   Krish| 38|        16|
|   Sunny| 31|        11|
+--------+---+----------+

