In [1]:
import findspark
findspark.init()

In [2]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

sc = SparkContext()
spark = SparkSession(sc)

### 1. Đọc dữ liệu => data

In [3]:
data = spark.read.csv('complaints.csv',inferSchema=True, header=True)

### 2. Cho biết dữ liệu có bao nhiêu dòng, in schema. Hiện 3 dòng đầu tiên

In [4]:
data.count()

2083368

In [5]:
data.printSchema()

root
 |-- Date received: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Sub-product: string (nullable = true)
 |-- Issue: string (nullable = true)
 |-- Sub-issue: string (nullable = true)
 |-- Consumer complaint narrative: string (nullable = true)
 |-- Company public response: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- State: string (nullable = true)
 |-- ZIP code: string (nullable = true)
 |-- Tags: string (nullable = true)
 |-- Consumer consent provided?: string (nullable = true)
 |-- Submitted via: string (nullable = true)
 |-- Date sent to company: string (nullable = true)
 |-- Company response to consumer: string (nullable = true)
 |-- Timely response?: string (nullable = true)
 |-- Consumer disputed?: string (nullable = true)
 |-- Complaint ID: string (nullable = true)



In [6]:
data.show(3)

+--------------------+--------------------+----------------+--------------------+--------------------+----------------------------+-----------------------+--------------------+-----+----------+--------------------+--------------------------+-------------+--------------------+----------------------------+----------------+------------------+------------+
|       Date received|             Product|     Sub-product|               Issue|           Sub-issue|Consumer complaint narrative|Company public response|             Company|State|  ZIP code|                Tags|Consumer consent provided?|Submitted via|Date sent to company|Company response to consumer|Timely response?|Consumer disputed?|Complaint ID|
+--------------------+--------------------+----------------+--------------------+--------------------+----------------------------+-----------------------+--------------------+-----+----------+--------------------+--------------------------+-------------+--------------------+--------------

### 3. Kiểm tra dữ liệu Nan, null

In [7]:
from pyspark.sql.functions import when, count, col, isnan, isnull

In [8]:
data.select([count(when(isnan(c),c)).alias(c) for c in data.columns]).toPandas().T

Unnamed: 0,0
Date received,0
Product,0
Sub-product,0
Issue,0
Sub-issue,0
Consumer complaint narrative,0
Company public response,0
Company,0
State,0
ZIP code,0


In [9]:
data.select([count(when(isnull(c),c)).alias(c) for c in data.columns]).toPandas().T

Unnamed: 0,0
Date received,40
Product,209258
Sub-product,511176
Issue,325183
Sub-issue,931248
Consumer complaint narrative,1454171
Company public response,1412034
Company,573431
State,597994
ZIP code,673059


### 4. Kiểm tra dữ liệu trùng. Xóa dữ liệu trùng

In [10]:
#dự liệu gốc:
data.count()

2083368

In [11]:
#dữ liệu duy nhất:
data.distinct().count()

2020609

In [12]:
data = data.drop_duplicates()

### 5. Kiểm tra lại dữ liệu null. Tính tỉ lệ %

In [13]:
from pyspark.sql.functions import size,

In [14]:
total_num = data.count()

In [15]:
data.select([count(when(isnull(c),c)).alias(c) for c in data.columns]).toPandas().T

Unnamed: 0,0
Date received,39
Product,178373
Sub-product,472168
Issue,281416
Sub-issue,884470
Consumer complaint narrative,1405173
Company public response,1353196
Company,513337
State,537111
ZIP code,611785


In [17]:
null_table = data.select([(count(when(isnull(c),c))*100/total_num).alias(c) for c in data.columns]).toPandas().T

In [18]:
null_table_table

Unnamed: 0,0
Date received,0.00193
Product,8.827685
Sub-product,23.367608
Issue,13.927286
Sub-issue,43.772447
Consumer complaint narrative,69.542054
Company public response,66.969711
Company,25.405064
State,26.581639
ZIP code,30.277258


### 6. Tạo dữ liệu mới không có các cột thiếu trên 30%

In [43]:
nonull = list(null_table[null_table[0] <30].index)

In [46]:
data = data.select(nonull)

In [47]:
data.columns

['Date received',
 'Product',
 'Sub-product',
 'Issue',
 'Company',
 'State',
 'Consumer consent provided?',
 'Submitted via',
 'Date sent to company']

### 7. Xóa các dòng có 'Date received' là null hoặc Product là null

In [55]:
data = data.filter(~(col('Date received').isNull() | col('Product').isNull()))

### 8. Tạo cột "date_from_text" chứa dữ liệu yyyy-dd-mm từ "Date received" nếu có. Nếu không sẽ là ' '

In [64]:
from pyspark.sql.functions import *
from pyspark.sql.types import DateType
from datetime import datetime

In [65]:
func = udf (lambda x: datetime.strptime(x,'%Y-%M-%d'), DateType())

In [66]:
data.withColumn('date_from_text',func(col('Date received'))).show()

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "C:\spark\spark-3.1.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 604, in main
  File "C:\spark\spark-3.1.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 596, in process
  File "C:\spark\spark-3.1.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\serializers.py", line 211, in dump_stream
    self.serializer.dump_stream(self._batched(iterator), stream)
  File "C:\spark\spark-3.1.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\serializers.py", line 132, in dump_stream
    for obj in iterator:
  File "C:\spark\spark-3.1.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\serializers.py", line 200, in _batched
    for item in iterator:
  File "C:\spark\spark-3.1.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 450, in mapper
  File "C:\spark\spark-3.1.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 450, in <genexpr>
  File "C:\spark\spark-3.1.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 83, in <lambda>
  File "C:\spark\spark-3.1.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\util.py", line 73, in wrapper
    return f(*args, **kwargs)
  File "<ipython-input-65-cbb7f2f51982>", line 1, in <lambda>
  File "c:\users\nguye\appdata\local\programs\python\python38-32\lib\_strptime.py", line 568, in _strptime_datetime
    tt, fraction, gmtoff_fraction = _strptime(data_string, format)
  File "c:\users\nguye\appdata\local\programs\python\python38-32\lib\_strptime.py", line 349, in _strptime
    raise ValueError("time data %r does not match format %r" %
ValueError: time data '****United XXXXXXXX XXXX and XXXX XXXX XXXX dba XXXX XXXX clearly violated Regulation X' does not match format '%Y-%M-%d'
