# Aula 1 - Conhecendo o Spark

In [None]:
# Left commented out -- presumably PySpark is already available in this
# environment (TODO confirm); uncomment the line below to install it.
#! pip install pyspark

In [None]:
!pip install findspark

Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl.metadata (352 bytes)
Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1


In [None]:
# findspark must be initialised before anything is imported from pyspark.
import findspark

findspark.init()

from pyspark.sql import SparkSession

# Create (or reuse) a Spark session running locally with master "local[*]".
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

In [None]:
# Smoke test: run a trivial SQL statement to confirm the session responds.
hello_query = '''select 'Sucesso Total, estamos online' as hello '''
df = spark.sql(hello_query)
df.show()

+--------------------+
|               hello|
+--------------------+
|Sucesso Total, es...|
+--------------------+



In [None]:
# Import Spark libraries used throughout the notebook
from pyspark.sql import Row, DataFrame
from pyspark.sql.types import StringType, StructType, StructField, IntegerType
from pyspark.sql.functions import col, expr, lit, substring, concat, concat_ws, when, coalesce
from pyspark.sql import functions as F # for more SQL functions
from functools import reduce

# Data Manipulation using Spark

In [None]:
import requests
import pandas as pd
import io

# Raw banklist.csv file hosted on GitHub.
url = "https://raw.githubusercontent.com/SandraRojasZ/Pos_Tech_Data_Analytics/main/Base_de_Dados/banklist.csv"
# Alternative when the CSV is available on the local filesystem:
#df = spark.read.csv('banklist.csv', sep = ',', inferSchema = True, header = True)

# Download the file. requests has no default timeout, so without one this
# cell could hang forever if the server never answers.
response = requests.get(url, timeout=30)
response.raise_for_status()  # Raise an exception for bad status codes

# Convert the data to a Pandas DataFrame
data = response.text
df_pandas = pd.read_csv(io.StringIO(data))

In [None]:
# Turn the pandas frame into a Spark DataFrame, then report its size.
df = spark.createDataFrame(df_pandas)

column_names = df.columns
print('df.count :', df.count())
print('df.col ct :', len(column_names))
print('df.columns:', column_names)

df.count : 561
df.col ct : 6
df.columns: ['Bank Name', 'City', 'ST', 'CERT', 'Acquiring Institution', 'Closing Date']


# Using SQL in PySpark

In [None]:
# Register the DataFrame as a temporary view so it can be queried with SQL.
df.createOrReplaceTempView("banklist")

# Column names that contain spaces are wrapped in backticks in the query.
banklist_query = '''select `Bank Name`, City, `Closing Date` from banklist'''
df_check = spark.sql(banklist_query)

# Default show() truncates long values; df_check.show(4, truncate=False)
# would print four rows without truncation instead.
df_check.show()

+--------------------+------------------+------------+
|           Bank Name|              City|Closing Date|
+--------------------+------------------+------------+
|The First State Bank|     Barboursville|    3-Apr-20|
|  Ericson State Bank|           Ericson|   14-Feb-20|
|City National Ban...|            Newark|    1-Nov-19|
|       Resolute Bank|            Maumee|   25-Oct-19|
|Louisa Community ...|            Louisa|   25-Oct-19|
|The Enloe State Bank|            Cooper|   31-May-19|
|Washington Federa...|           Chicago|   15-Dec-17|
|The Farmers and M...|           Argonia|   13-Oct-17|
| Fayette County Bank|        Saint Elmo|   26-May-17|
|Guaranty Bank, (d...|         Milwaukee|    5-May-17|
|      First NBC Bank|       New Orleans|   28-Apr-17|
|       Proficio Bank|Cottonwood Heights|    3-Mar-17|
|Seaway Bank and T...|           Chicago|   27-Jan-17|
|Harvest Community...|        Pennsville|   13-Jan-17|
|         Allied Bank|          Mulberry|   23-Sep-16|
|The Woodb

# DataFrame Basic Operations

In [None]:
# Summary statistics (count, mean, stddev, min, max) for every column.
stats_all = df.describe()
stats_all.show()

+-------+--------------------+-------+----+------------------+---------------------+------------+
|summary|           Bank Name|   City|  ST|              CERT|Acquiring Institution|Closing Date|
+-------+--------------------+-------+----+------------------+---------------------+------------+
|  count|                 561|    561| 561|               561|                  561|         561|
|   mean|                NULL|   NULL|NULL| 31685.68449197861|                 NULL|        NULL|
| stddev|                NULL|   NULL|NULL|16446.656593099655|                 NULL|        NULL|
|    min|1st American Stat...|Acworth|  AL|                91|      1st United Bank|    1-Aug-08|
|    max|               ebank|Wyoming|  WY|             58701|  Your Community Bank|    9-Sep-11|
+-------+--------------------+-------+----+------------------+---------------------+------------+



In [None]:
# Summary statistics restricted to the City and ST columns.
stats_city_state = df.describe('City', 'ST')
stats_city_state.show()

+-------+-------+----+
|summary|   City|  ST|
+-------+-------+----+
|  count|    561| 561|
|   mean|   NULL|NULL|
| stddev|   NULL|NULL|
|    min|Acworth|  AL|
|    max|Wyoming|  WY|
+-------+-------+----+



# Count, Columns and Schema

In [None]:
# Report row count, column count, column names, dtypes and schema,
# one labelled line each.
for label, detail in (
    ('Total de linhas:', df.count()),
    ('Total de colunas:', len(df.columns)),
    ('Colunas:', df.columns),
    ('Tipo de Dados:', df.dtypes),
    ('Schema:', df.schema),
):
    print(label, detail)

Total de linhas: 561
Total de colunas: 6
Colunas: ['Bank Name', 'City', 'ST', 'CERT', 'Acquiring Institution', 'Closing Date']
Tipo de Dados: [('Bank Name', 'string'), ('City', 'string'), ('ST', 'string'), ('CERT', 'bigint'), ('Acquiring Institution', 'string'), ('Closing Date', 'string')]
Schema: StructType([StructField('Bank Name', StringType(), True), StructField('City', StringType(), True), StructField('ST', StringType(), True), StructField('CERT', LongType(), True), StructField('Acquiring Institution', StringType(), True), StructField('Closing Date', StringType(), True)])


In [None]:
# Print the schema as a tree: one line per column with its type and nullability.
df.printSchema()

root
 |-- Bank Name: string (nullable = true)
 |-- City: string (nullable = true)
 |-- ST: string (nullable = true)
 |-- CERT: long (nullable = true)
 |-- Acquiring Institution: string (nullable = true)
 |-- Closing Date: string (nullable = true)



# Remove duplicates

In [None]:
# Remove duplicate rows, then confirm the remaining row count and the
# column list.
df = df.dropDuplicates()

row_total = df.count()
print('df.count:', row_total)
print('df.columns:', df.columns)

df.count: 561
df.columns: ['Bank Name', 'City', 'ST', 'CERT', 'Acquiring Institution', 'Closing Date']


# Select Specific Columns

In [None]:
# Keep only the bank name and city columns.
df2 = df.select('Bank Name', 'City')
df2.show(2)

+--------------------+-------------+
|           Bank Name|         City|
+--------------------+-------------+
|The First State Bank|Barboursville|
|        Coastal Bank|  Cocoa Beach|
+--------------------+-------------+
only showing top 2 rows



# Select Multiple Columns

In [None]:
# Select every column except CERT and ST. Filtering df.columns with a list
# comprehension (rather than a set difference) preserves the DataFrame's
# column order, so the selected columns come out in the same order on every run.
excluded_cols = {'CERT', 'ST'}
col_l = [name for name in df.columns if name not in excluded_cols]
df2 = df.select(*col_l)
df2.show(2)

+-------------+------------+--------------------+---------------------+
|         City|Closing Date|           Bank Name|Acquiring Institution|
+-------------+------------+--------------------+---------------------+
|Barboursville|    3-Apr-20|The First State Bank|       MVB Bank, Inc.|
|  Cocoa Beach|    6-May-11|        Coastal Bank| Florida Community...|
+-------------+------------+--------------------+---------------------+
only showing top 2 rows



# Rename Columns

In [None]:
# Old column name -> new column name, applied in this order.
column_renames = {
    'Bank Name': 'bank_name',
    'Acquiring Institution': 'acq_institution',
    'Closing Date': 'closing_date',
    'ST': 'state',
    'CERT': 'cert',
}

df2 = df
for old_name, new_name in column_renames.items():
    df2 = df2.withColumnRenamed(old_name, new_name)

df2.show(2)

+--------------------+-------------+-----+-----+--------------------+------------+
|           bank_name|         City|state| cert|     acq_institution|closing_date|
+--------------------+-------------+-----+-----+--------------------+------------+
|The First State Bank|Barboursville|   WV|14361|      MVB Bank, Inc.|    3-Apr-20|
|        Coastal Bank|  Cocoa Beach|   FL|34898|Florida Community...|    6-May-11|
+--------------------+-------------+-----+-----+--------------------+------------+
only showing top 2 rows



# Add Columns

In [None]:
# Append a 'state' column that copies the values of the existing ST column.
state_values = df['ST']
df2 = df.withColumn('state', state_values)
df2.show(2)

+--------------------+-------------+---+-----+---------------------+------------+-----+
|           Bank Name|         City| ST| CERT|Acquiring Institution|Closing Date|state|
+--------------------+-------------+---+-----+---------------------+------------+-----+
|The First State Bank|Barboursville| WV|14361|       MVB Bank, Inc.|    3-Apr-20|   WV|
|        Coastal Bank|  Cocoa Beach| FL|34898| Florida Community...|    6-May-11|   FL|
+--------------------+-------------+---+-----+---------------------+------------+-----+
only showing top 2 rows



# Add constant column

In [None]:
# Append a 'country' column holding the same literal value on every row.
country_literal = F.lit('US')
df2 = df.withColumn('country', country_literal)
df2.show(2)

+--------------------+-------------+---+-----+---------------------+------------+-------+
|           Bank Name|         City| ST| CERT|Acquiring Institution|Closing Date|country|
+--------------------+-------------+---+-----+---------------------+------------+-------+
|The First State Bank|Barboursville| WV|14361|       MVB Bank, Inc.|    3-Apr-20|     US|
|        Coastal Bank|  Cocoa Beach| FL|34898| Florida Community...|    6-May-11|     US|
+--------------------+-------------+---+-----+---------------------+------------+-------+
only showing top 2 rows



# Drop Columns

In [None]:
# Return a copy of the DataFrame without the CERT column.
column_to_drop = 'CERT'
df2 = df.drop(column_to_drop)
df2.show(2)

+--------------------+-------------+---+---------------------+------------+
|           Bank Name|         City| ST|Acquiring Institution|Closing Date|
+--------------------+-------------+---+---------------------+------------+
|The First State Bank|Barboursville| WV|       MVB Bank, Inc.|    3-Apr-20|
|        Coastal Bank|  Cocoa Beach| FL| Florida Community...|    6-May-11|
+--------------------+-------------+---+---------------------+------------+
only showing top 2 rows



# Drop Multiple Columns

In [None]:
# Drop several columns in a single call by passing each name as an argument.
df2 = df.drop('CERT', 'ST')
df2.show(2)

+--------------------+-------------+---------------------+------------+
|           Bank Name|         City|Acquiring Institution|Closing Date|
+--------------------+-------------+---------------------+------------+
|The First State Bank|Barboursville|       MVB Bank, Inc.|    3-Apr-20|
|        Coastal Bank|  Cocoa Beach| Florida Community...|    6-May-11|
+--------------------+-------------+---------------------+------------+
only showing top 2 rows



In [None]:
# Drop CERT and ST one column at a time, threading the intermediate
# DataFrame through each step.
df2 = df
for unwanted in ('CERT', 'ST'):
    df2 = df2.drop(unwanted)
df2.show(2)

+--------------------+-------------+---------------------+------------+
|           Bank Name|         City|Acquiring Institution|Closing Date|
+--------------------+-------------+---------------------+------------+
|The First State Bank|Barboursville|       MVB Bank, Inc.|    3-Apr-20|
|        Coastal Bank|  Cocoa Beach| Florida Community...|    6-May-11|
+--------------------+-------------+---------------------+------------+
only showing top 2 rows



# Filter Data

In [None]:
# Equal to values
df2 = df.where(df['ST'] == 'NE')

# Between values (both bounds inclusive). CERT is a numeric (bigint) column,
# so the bounds are passed as integers instead of strings to avoid relying
# on Spark's implicit string-to-number cast.
df3 = df.where(df['CERT'].between(1000, 2000))

# Is inside multiple values
df4 = df.where(df['ST'].isin('NE', 'IL'))

# Compare the row count of the full DataFrame with each filtered result.
print('df.count:', df.count())
print('df2.count:', df2.count())
print('df3.count:', df3.count())
print('df4.count:', df4.count())

df.count: 561
df2.count: 4
df3.count: 9
df4.count: 73


# Filter data using logical operators

In [None]:
# Build each condition separately, then combine them with a logical AND (&).
is_nebraska = df['ST'] == 'NE'
is_ericson = df['City'] == 'Ericson'

df2 = df.where(is_nebraska & is_ericson)
df2.show(3)

+------------------+-------+---+-----+---------------------+------------+
|         Bank Name|   City| ST| CERT|Acquiring Institution|Closing Date|
+------------------+-------+---+-----+---------------------+------------+
|Ericson State Bank|Ericson| NE|18265| Farmers and Merch...|   14-Feb-20|
+------------------+-------+---+-----+---------------------+------------+



# Replace values in DataFrame

In [None]:
# Before: the first two rows of the untouched DataFrame.
df.show(2)

# After: the same two rows once the value 7 has been replaced with 17.
# The result is only displayed; df itself is not reassigned.
print('Replace 7 in the above dataframe with 17 at all instances')
df_replaced = df.na.replace(7, 17)
df_replaced.show(2)

+--------------------+-------------+---+-----+---------------------+------------+
|           Bank Name|         City| ST| CERT|Acquiring Institution|Closing Date|
+--------------------+-------------+---+-----+---------------------+------------+
|The First State Bank|Barboursville| WV|14361|       MVB Bank, Inc.|    3-Apr-20|
|        Coastal Bank|  Cocoa Beach| FL|34898| Florida Community...|    6-May-11|
+--------------------+-------------+---+-----+---------------------+------------+
only showing top 2 rows

Replace 7 in the above dataframe with 17 at all instances
+--------------------+-------------+---+-----+---------------------+------------+
|           Bank Name|         City| ST| CERT|Acquiring Institution|Closing Date|
+--------------------+-------------+---+-----+---------------------+------------+
|The First State Bank|Barboursville| WV|14361|       MVB Bank, Inc.|    3-Apr-20|
|        Coastal Bank|  Cocoa Beach| FL|34898| Florida Community...|    6-May-11|
+--------------