In [1]:
!pip install pyspark faker


Collecting pyspark
  Downloading pyspark-3.5.3.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting faker
  Downloading Faker-30.1.0-py3-none-any.whl.metadata (15 kB)
Downloading Faker-30.1.0-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m41.9 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.3-py2.py3-none-any.whl size=317840625 sha256=76a6a12eef6d2757244fa030e654a9005cc8b36222838f72902ed1c0f235e2eb
  Stored in directory: /root/.cache/pip/wheels/1b/3a/92/28b93e2fbfdbb07509ca4d6f50c5e407f48dce4ddbda69a4ab
Successfully built pyspark
Installing collected packages: pyspark, faker
Successfully installed faker-30.1.0 pyspark-3.5.3


In [33]:
# importando bibliotecas
from pyspark.sql import SparkSession
from faker import Faker
import random
from datetime import datetime

# criando definições
spark = SparkSession.builder.appName('bank_balances').getOrCreate()
fake = Faker()

def generate_data(n):
    data = []
    for _ in range(n):
        name = fake.name()
        balance = round(random.uniform(1000, 10000), 2)
        account_number = fake.bban()
        city = fake.city()
        birth_date = fake.date_of_birth(minimum_age=18, maximum_age=80)  # Mantém como datetime
        age = datetime.today().year - birth_date.year  # Calcula a idade
        create_account_date = fake.date_between(start_date='-1y', end_date='today').strftime('%d-%m-%Y')
        data.append((name, balance, account_number, city, age, create_account_date, birth_date.strftime('%d-%m-%Y')))  # Converte a data de nascimento para string
    return data

# Gerando os dados
dados = generate_data(1000)

# Definindo o schema para o DataFrame
colunas = ['name', 'balance', 'account_number', 'city', 'age', 'create_account_date', 'birth_date']

# Criando o DataFrame PySpark
df = spark.createDataFrame(dados, schema=colunas)

# Exibindo os 10 primeiros registros
df.show(10)


+--------------+-------+------------------+-----------------+---+-------------------+----------+
|          name|balance|    account_number|             city|age|create_account_date|birth_date|
+--------------+-------+------------------+-----------------+---+-------------------+----------+
|   Mariah Bell| 8431.2|FVJT08570298056712|      Rhondamouth| 42|         20-04-2024|30-06-1982|
|Sheila Roberts|6635.19|ICKK54157695253346|     Lake Richard| 33|         25-02-2024|03-11-1991|
|Madison Miller|6479.11|AVAY52861583466313|       Lake Nancy| 23|         12-01-2024|19-08-2001|
|   Maria Jones| 1589.0|YVHP89564785564716|          Kimbury| 35|         29-05-2024|07-03-1989|
|  Brian Wilson|6334.86|YEQX76338957272343|       Hollymouth| 46|         08-12-2023|20-09-1978|
|Andrea Ramirez|9037.53|HIIH67285643751977| Port Krystalland| 26|         29-03-2024|25-08-1998|
|   Tina Mercer|9076.84|UWSI26959321025454|West Destinyburgh| 21|         02-04-2024|26-03-2003|
|  Brent Benson|3952.82|FNXB60

In [34]:
df.count()

1000

In [35]:
# ordenação dos dados pelo saldo

df_ordenado = df.orderBy(df.balance.desc()).limit(10)

# <.limit(10)> limita a 10 o tamanho da lista... podendo ser usado qualquer valor.

df_ordenado.show()

+-----------------+-------+------------------+-----------------+---+-------------------+----------+
|             name|balance|    account_number|             city|age|create_account_date|birth_date|
+-----------------+-------+------------------+-----------------+---+-------------------+----------+
| Shannon Jones MD| 9999.5|HBKA86235763589584|  Lake Danielfort| 29|         03-10-2024|09-06-1995|
|       Sarah Holt|9992.83|WCSH50826778343841|      South Karen| 75|         01-07-2024|20-04-1949|
|Catherine Jackson| 9976.7|LKBY82016880890963|       South Erin| 79|         10-01-2024|01-04-1945|
|     Cynthia Ryan| 9974.0|OFFS86600258127549|      Lake Alyssa| 21|         09-02-2024|07-12-2003|
|    Victor Holmes|9969.27|VRWA22089232448179|West Mariaborough| 31|         07-01-2024|19-05-1993|
| Matthew Gonzalez|9945.43|DFBV58689997439701|        Derekbury| 79|         25-01-2024|06-07-1945|
|    Kimberly Lamb|9942.51|VGKU92582111814211|   Port Kellyberg| 77|         13-08-2024|07-12-1947|


In [36]:
# Busca de contas com saldo maior que um valor específico.

valor = 5000

df_valor_acima = df.filter(df.balance > valor)

df_valor_acima.show()

+----------------+-------+------------------+-------------------+---+-------------------+----------+
|            name|balance|    account_number|               city|age|create_account_date|birth_date|
+----------------+-------+------------------+-------------------+---+-------------------+----------+
|     Mariah Bell| 8431.2|FVJT08570298056712|        Rhondamouth| 42|         20-04-2024|30-06-1982|
|  Sheila Roberts|6635.19|ICKK54157695253346|       Lake Richard| 33|         25-02-2024|03-11-1991|
|  Madison Miller|6479.11|AVAY52861583466313|         Lake Nancy| 23|         12-01-2024|19-08-2001|
|    Brian Wilson|6334.86|YEQX76338957272343|         Hollymouth| 46|         08-12-2023|20-09-1978|
|  Andrea Ramirez|9037.53|HIIH67285643751977|   Port Krystalland| 26|         29-03-2024|25-08-1998|
|     Tina Mercer|9076.84|UWSI26959321025454|  West Destinyburgh| 21|         02-04-2024|26-03-2003|
|    Joseph Smith| 9780.6|UPEJ36505828498570|     Lake Shawnland| 60|         04-05-2024|15

In [37]:
df_valor_acima.count()

574

In [38]:
# Busca de cliente com uma idade maior que uma específica.

idade = 60

df_idade_acima = df.filter(df.age > idade)

df_idade_acima.show(10)
print('-----'*50)
print('clientes acima de', idade, "=", df_idade_acima.count())

+----------------+-------+------------------+-------------------+---+-------------------+----------+
|            name|balance|    account_number|               city|age|create_account_date|birth_date|
+----------------+-------+------------------+-------------------+---+-------------------+----------+
|Kathleen Mcmahon|7070.65|PKFF08458017511469|         East Karen| 66|         21-02-2024|01-10-1958|
|    Kelsey Evans|7698.03|QXDN29692916140243|      Denisechester| 69|         28-07-2024|31-03-1955|
|    Nancy Wilson| 4056.3|FUVA56228966373843|           Ericfurt| 66|         29-08-2024|26-12-1958|
|   Victoria Shaw| 5289.5|TGNQ07890788214434|          Brianside| 79|         21-03-2024|20-01-1945|
|  Kayla Williams|4021.67|NKTD77127444065313|         Port Laura| 64|         27-05-2024|15-12-1960|
|   Lisa Fletcher|4790.93|WORU74390143181891|          Nunezland| 66|         29-02-2024|07-07-1958|
|    Craig Torres|1612.86|NBQI80685921451381|       East Christy| 68|         20-01-2024|26

In [39]:
# Definindo os limites de idade e criando a condição para saldo maior
idade_min = 30
idade_max = 50

# Filtrando os clientes com idade entre 30 e 50 anos e saldo acima de um valor alto (por exemplo, 8000)
df_clientes_filtrados = df.filter((df.age >= idade_min) & (df.age <= idade_max)).orderBy(df.balance.desc())

# Exibindo os 10 clientes com maior saldo dentro da faixa etária
df_clientes_filtrados.show(10)

# Exibindo o número total de clientes nessa condição
print('-----'*50)
print('Total de clientes entre', idade_min, 'e', idade_max, 'anos =', df_clientes_filtrados.count())



+----------------+-------+------------------+-----------------+---+-------------------+----------+
|            name|balance|    account_number|             city|age|create_account_date|birth_date|
+----------------+-------+------------------+-----------------+---+-------------------+----------+
|   Victor Holmes|9969.27|VRWA22089232448179|West Mariaborough| 31|         07-01-2024|19-05-1993|
|   Paul Martinez|9936.72|SNRY15657654648739|      West Sheila| 48|         15-10-2023|22-11-1976|
| Nicholas Murray|9911.97|OAMF45407488537824|     Rebeccaville| 35|         30-08-2024|10-01-1989|
|  David Richmond|9905.37|VPFO15643918074331|      West Carrie| 31|         23-02-2024|24-05-1993|
|Christy Martinez|9872.07|LDWC18619702737843|     Williamsstad| 40|         26-05-2024|02-12-1984|
|   Kelly Stewart|9864.23|FYFY50510403240588|         Ericstad| 30|         30-10-2023|30-10-1994|
|   Michael Moore|9838.46|DFBH12930467284233|   North Johnport| 33|         23-12-2023|13-12-1991|
|  Patrici