In [17]:
from pyspark.sql import SparkSession
import os

# Set the Hadoop user name in the environment before the session is built
# (presumably the identity used for HDFS access — confirm against cluster auth).
os.environ['HADOOP_USER_NAME'] = 'n.almazova'

# Create (or reuse) a local Spark session that runs on a single worker thread.
# NOTE(review): fs.defaultFS normally carries only scheme + authority; the path
# component given here is presumably ignored by the HDFS client — confirm.
spark = (
    SparkSession.builder
    .appName('HDFS to data')
    .master('local[1]')
    .config(
        'spark.hadoop.fs.defaultFS',
        'hdfs://172.17.0.23/user/live_project_b/data',
    )
    .getOrCreate()
)

print(spark)

<pyspark.sql.session.SparkSession object at 0x0000025576464790>


# Чистим данные

### Обработка clients

In [20]:
# Load the raw clients CSV from HDFS with a header row and inferred column types.
# Multiline parsing is switched on; the address column is later stripped of
# newlines, so records presumably span several physical lines.
clients = (
    spark.read
    .option("multiline", "true")
    .csv(
        "hdfs://172.17.0.23/user/live_project_b/data/clients",
        header=True,
        inferSchema=True,
    )
)

In [21]:
from pyspark.sql.functions import regexp_replace

# Flatten addresses onto one line: each newline in client_address becomes a space.
single_line_address = regexp_replace('client_address', '\n', ' ')
clean_clients = clients.withColumn('client_address', single_line_address)


In [22]:
# Display the cleaned clients rows with full, untruncated column values.
clean_clients.show(truncate=False)

+---------+----------------+---------------------------+---------------------+------------------------------------------------------------+
|client_id|client_name     |client_email               |client_phone         |client_address                                              |
+---------+----------------+---------------------------+---------------------+------------------------------------------------------------+
|1        |Collin Wise     |melissa65@example.com      |+1-776-515-1789x521  |713 Bethany Ridges Suite 988 Port Rebekahside, HI 84917     |
|2        |Nicole Perry    |blakebaker@example.net     |(582)838-7980        |6972 Wright Course West Jeffery, CA 97155                   |
|3        |Paul Moore      |blake60@example.net        |299-671-7332x774     |10784 Cindy Plain Apt. 185 Hamiltonton, FL 12024            |
|4        |Rebecca Giles   |deborahanderson@example.com|490.470.0682x97497   |307 Welch Road Suite 782 Lake Brianport, WY 54526           |
|5        |Alexandra

In [24]:
# Persist the cleaned clients as CSV with a header row into the relative 'clients' path.
# Overwrite mode keeps this cell re-runnable: with the default save mode the write
# raises as soon as the target directory already exists from a previous run.
clean_clients.write.format('csv').option('header', 'true').mode('overwrite').save('clients')

### Обработка clients_activities

In [27]:
# Load the raw client-activity CSV from HDFS (multi-line aware, header row,
# inferred column types).
clients_activities = (
    spark.read
    .option("multiLine", "true")
    .csv(
        "hdfs://172.17.0.23/user/live_project_b/data/clients_activities",
        header=True,
        inferSchema=True,
    )
)

# Preview the rows without truncating long values such as the device string.
clients_activities.show(truncate=False)


+---------+--------------------------+--------------+----------------------+--------------+-----------------------------------------------------------------------------------------------------------------------------------+
|client_id|activity_date             |activity_type |activity_location     |ip_address    |device                                                                                                                             |
+---------+--------------------------+--------------+----------------------+--------------+-----------------------------------------------------------------------------------------------------------------------------------+
|1        |2024-10-22 05:55:15.712244|pay_bill      |list                  |19.127.116.230|Mozilla/5.0 (Macintosh; Intel Mac OS X 10_5_4 rv:4.0; li-NL) AppleWebKit/534.25.4 (KHTML, like Gecko) Version/4.0.5 Safari/534.25.4|
|1        |2024-10-20 22:30:14.712244|view_account  |main/wp-content/search|19.127.116.230|Mozilla/5.0 (

In [28]:
# Persist the activities as CSV with a header row into the relative 'clients_activities' path.
# Overwrite mode keeps this cell re-runnable: with the default save mode the write
# raises as soon as the target directory already exists from a previous run.
clients_activities.write.format('csv').option('header', 'true').mode('overwrite').save('clients_activities')

### Обработка clients_calls_support

In [29]:
# Load the raw support-call CSV from HDFS (multi-line aware, header row,
# inferred column types).
clients_calls_support = (
    spark.read
    .option("multiLine", "true")
    .csv(
        "hdfs://172.17.0.23/user/live_project_b/data/clients_calls_support",
        header=True,
        inferSchema=True,
    )
)

# Preview the rows with untruncated column values.
clients_calls_support.show(truncate=False)


+---------+--------------------------+--------+----------+
|client_id|call_date                 |duration|result    |
+---------+--------------------------+--------+----------+
|1        |2024-06-06 05:03:42.712244|169     |resolved  |
|1        |2024-09-06 07:05:58.712244|347     |resolved  |
|1        |2024-07-19 15:11:22.712244|150     |unresolved|
|1        |2024-09-17 11:42:17.712244|542     |resolved  |
|1        |2024-08-17 08:41:03.712244|1414    |unresolved|
|1        |2024-08-09 13:07:49.712244|902     |unresolved|
|1        |2024-10-20 17:54:25.712244|1535    |resolved  |
|1        |2024-10-23 00:46:45.712244|752     |resolved  |
|1        |2024-10-08 12:25:38.712244|1505    |unresolved|
|1        |2024-10-20 09:19:33.712244|382     |resolved  |
|1        |2024-09-28 02:36:12.712244|379     |unresolved|
|1        |2024-07-01 13:03:47.712244|225     |resolved  |
|1        |2024-09-13 20:09:34.712244|313     |unresolved|
|1        |2024-08-01 20:42:10.712244|578     |resolved 

In [30]:
from pyspark.sql.functions import when, col

# Turn the textual call outcome into a boolean flag: True only when the value is
# exactly "resolved"; everything else (nulls included) falls through to False.
is_resolved = col("result") == "resolved"
clean_clients_calls_support = clients_calls_support.withColumn(
    "result",
    when(is_resolved, True).otherwise(False),
)

# Show the modified DataFrame
clean_clients_calls_support.show()

+---------+--------------------+--------+------+
|client_id|           call_date|duration|result|
+---------+--------------------+--------+------+
|        1|2024-06-06 05:03:...|     169|  true|
|        1|2024-09-06 07:05:...|     347|  true|
|        1|2024-07-19 15:11:...|     150| false|
|        1|2024-09-17 11:42:...|     542|  true|
|        1|2024-08-17 08:41:...|    1414| false|
|        1|2024-08-09 13:07:...|     902| false|
|        1|2024-10-20 17:54:...|    1535|  true|
|        1|2024-10-23 00:46:...|     752|  true|
|        1|2024-10-08 12:25:...|    1505| false|
|        1|2024-10-20 09:19:...|     382|  true|
|        1|2024-09-28 02:36:...|     379| false|
|        1|2024-07-01 13:03:...|     225|  true|
|        1|2024-09-13 20:09:...|     313| false|
|        1|2024-08-01 20:42:...|     578|  true|
|        1|2024-07-25 11:11:...|    1236|  true|
|        1|2024-05-25 18:42:...|    1120|  true|
|        2|2024-02-19 10:07:...|     315| false|
|        2|2024-09-2

In [31]:
# Print the column names, types and nullability of the cleaned support-call DataFrame.
clean_clients_calls_support.printSchema()

root
 |-- client_id: integer (nullable = true)
 |-- call_date: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- result: boolean (nullable = false)



In [32]:
# Persist the cleaned support calls as CSV with a header row into the relative
# 'clients_calls_support' path. Overwrite mode keeps this cell re-runnable: with the
# default save mode the write raises as soon as the target directory already exists.
clean_clients_calls_support.write.format('csv').option('header', 'true').mode('overwrite').save('clients_calls_support')

### Обработка clients_logins

In [33]:
# Load the raw client-login CSV from HDFS. The header flag is passed once here as a
# keyword (it was previously also set through .option, which configured the same setting).
clients_logins = spark.read.csv(
    "hdfs://172.17.0.23/user/live_project_b/data/clients_logins",
    header=True,
    inferSchema=True,
    multiLine=True,
)

# Preview the rows with untruncated column values.
clients_logins.show(truncate=False)


+---------+--------------------------+---------------+---------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------+
|client_id|login_date                |ip_address     |location                               |device                                                                                                                                             |
+---------+--------------------------+---------------+---------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------+
|1        |2024-10-02 19:14:31.712244|19.127.116.230 |-82.7035991836656, 110.03683951259939  |Mozilla/5.0 (Macintosh; Intel Mac OS X 10_5_4 rv:4.0; li-NL) AppleWebKit/534.25.4 (KHTML, like Gecko) Version/4.0.5 Safari/534.25.4                |
|1        |2024-02-24 09:24:

In [34]:
# Persist the logins as CSV with a header row into the relative 'clients_logins' path.
# Overwrite mode keeps this cell re-runnable: with the default save mode the write
# raises as soon as the target directory already exists from a previous run.
clients_logins.write.format('csv').option('header', 'true').mode('overwrite').save('clients_logins')

### Обработка clients_payments

In [35]:
# Load the raw client-payment CSV from HDFS. The header flag is passed once here as a
# keyword (it was previously also set through .option, which configured the same setting).
clients_payments = spark.read.csv(
    "hdfs://172.17.0.23/user/live_project_b/data/clients_payments",
    header=True,
    inferSchema=True,
    multiLine=True,
)

# Preview the rows with untruncated column values.
clients_payments.show(truncate=False)

+---------+----------+--------------------------+--------+---------+--------------+--------------+
|client_id|payment_id|payment_date              |currency|amount   |payment_method|transaction_id|
+---------+----------+--------------------------+--------+---------+--------------+--------------+
|1        |2240      |2024-10-23 04:56:48.712244|RUB     |583417.77|debit_card    |4190          |
|1        |8307      |2024-10-16 15:37:10.712244|USD     |8161.27  |e_wallet      |5347          |
|1        |3753      |2024-10-25 03:17:03.712244|USD     |2392.96  |bank_transfer |3664          |
|1        |9903      |2024-09-04 23:50:54.712244|RUB     |529688.47|e_wallet      |4761          |
|1        |5669      |2024-06-19 03:21:12.712244|USD     |9974.11  |bank_transfer |3063          |
|1        |9613      |2024-10-04 00:23:48.712244|RUB     |533177.55|bank_transfer |2073          |
|1        |7217      |2024-10-17 09:49:50.712244|RUB     |759007.87|e_wallet      |1094          |
|1        

In [16]:
# Stop the Spark session once the notebook's work is finished.
spark.stop()