## 1. candidate_master

In [62]:
import pandas as pd
import json
import requests
import numpy as np
import os

In [63]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Data Cleaning App") \
    .master("local[*]") \
    .config("spark.driver.bindAddress", "127.0.0.1") \
    .config("spark.driver.host", "127.0.0.1") \
    .getOrCreate()


In [70]:
spark.stop()

In [15]:
base_folder_path = "D:/Sohail_DE_Project/fec_env/big-data-fec-project/data/raw"
file_name = "candidate_master"
csv_path = os.path.join(base_folder_path, file_name, f"{file_name}.csv")


cand_master_df = spark.read.option('header',True).option('inferSchema','true').csv(csv_path)


In [16]:
cand_master_df.rdd.getNumPartitions()

1

In [17]:
cand_master_df = cand_master_df.repartition(4)

In [18]:
cand_master_df.rdd.getNumPartitions()

4

In [19]:
# datatype check
cand_master_df.printSchema()

root
 |-- CAND_ID: string (nullable = true)
 |-- CAND_NAME: string (nullable = true)
 |-- CAND_PTY_AFFILIATION: string (nullable = true)
 |-- CAND_ELECTION_YR: integer (nullable = true)
 |-- CAND_OFFICE_ST: string (nullable = true)
 |-- CAND_OFFICE: string (nullable = true)
 |-- CAND_OFFICE_DISTRICT: integer (nullable = true)
 |-- CAND_ICI: string (nullable = true)
 |-- CAND_STATUS: string (nullable = true)
 |-- CAND_PCC: string (nullable = true)
 |-- CAND_ST1: string (nullable = true)
 |-- CAND_ST2: string (nullable = true)
 |-- CAND_CITY: string (nullable = true)
 |-- CAND_ST: string (nullable = true)
 |-- CAND_ZIP: double (nullable = true)



In [20]:
# Find Count of Null, None, NaN of All DataFrame Columns
from pyspark.sql.functions import col,isnan, when, count
cand_master_df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in cand_master_df.columns]
   ).show()

+-------+---------+--------------------+----------------+--------------+-----------+--------------------+--------+-----------+--------+--------+--------+---------+-------+--------+
|CAND_ID|CAND_NAME|CAND_PTY_AFFILIATION|CAND_ELECTION_YR|CAND_OFFICE_ST|CAND_OFFICE|CAND_OFFICE_DISTRICT|CAND_ICI|CAND_STATUS|CAND_PCC|CAND_ST1|CAND_ST2|CAND_CITY|CAND_ST|CAND_ZIP|
+-------+---------+--------------------+----------------+--------------+-----------+--------------------+--------+-----------+--------+--------+--------+---------+-------+--------+
|      0|        0|                   7|               0|             0|          0|                   7|     322|          0|    1421|     189|    6844|        3|    179|     190|
+-------+---------+--------------------+----------------+--------------+-----------+--------------------+--------+-----------+--------+--------+--------+---------+-------+--------+



In [21]:
# replace null cand_pty_affiliation with NNE(None) code
cand_master_df = cand_master_df.fillna({"CAND_PTY_AFFILIATION": "NNE"})

In [22]:
# CAND_OFFICE_DISTRICT
from pyspark.sql.functions import col, when, upper, lit, regexp_replace
from pyspark.sql.types import IntegerType, StringType

cand_master_df = cand_master_df.withColumn(
    "CAND_OFFICE_DISTRICT",
    when(col("CAND_OFFICE_DISTRICT").isNull(), lit(0.0)).otherwise(col("CAND_OFFICE_DISTRICT")).cast(IntegerType())
)

In [23]:
# CAND_ICI fill with random choice between other three values
uniq_ici_rows = cand_master_df.select("CAND_ICI").distinct().dropna().collect()
uniq_ici_list = [row["CAND_ICI"] for row in uniq_ici_rows]
cand_master_df = cand_master_df.withColumn('CAND_ICI',when(col('CAND_ICI').isNull(),np.random.choice(uniq_ici_list)).otherwise(col('CAND_ICI')))

In [24]:
# CAND_PCC
uniq_pcc_rows = cand_master_df.select("CAND_PCC").distinct().dropna().collect()
uniq_pcc_list = [row["CAND_PCC"] for row in uniq_pcc_rows]

cand_master_df = cand_master_df.withColumn("CAND_PCC",when(col("CAND_PCC").isNull(),np.random.choice(uniq_pcc_list)).otherwise(col("CAND_PCC")))

In [25]:
# 'CAND_CITY' drop  null rows
cand_master_df = cand_master_df.na.drop(subset=["CAND_CITY"])

In [26]:
spark.stop()

In [3]:
cand_master_df = pd.read_csv("D:/DE_Project_FEC/fec_env/big-data-fec-project/data/raw/candidate_master/candidate_master.csv")
cand_master_df.head(2)

Unnamed: 0,CAND_ID,CAND_NAME,CAND_PTY_AFFILIATION,CAND_ELECTION_YR,CAND_OFFICE_ST,CAND_OFFICE,CAND_OFFICE_DISTRICT,CAND_ICI,CAND_STATUS,CAND_PCC,CAND_ST1,CAND_ST2,CAND_CITY,CAND_ST,CAND_ZIP
0,H0AK00105,"LAMB, THOMAS",NNE,2020,AK,H,0.0,C,N,C00607515,1861 W LAKE LUCILLE DR,,WASILLA,AK,99654.0
1,H0AK00113,"TUGATUK, RAY SEAN",DEM,2020,AK,H,0.0,C,N,,PO BOX 172,,MANAKOTAK,AK,99628.0


In [4]:
# datatype check
cand_master_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7758 entries, 0 to 7757
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   CAND_ID               7758 non-null   object 
 1   CAND_NAME             7758 non-null   object 
 2   CAND_PTY_AFFILIATION  7751 non-null   object 
 3   CAND_ELECTION_YR      7758 non-null   int64  
 4   CAND_OFFICE_ST        7758 non-null   object 
 5   CAND_OFFICE           7758 non-null   object 
 6   CAND_OFFICE_DISTRICT  7751 non-null   float64
 7   CAND_ICI              7436 non-null   object 
 8   CAND_STATUS           7758 non-null   object 
 9   CAND_PCC              6337 non-null   object 
 10  CAND_ST1              7569 non-null   object 
 11  CAND_ST2              914 non-null    object 
 12  CAND_CITY             7755 non-null   object 
 13  CAND_ST               7579 non-null   object 
 14  CAND_ZIP              7568 non-null   float64
dtypes: float64(2), int64(

In [5]:
# null values check
cand_master_df.isnull().sum()

CAND_ID                    0
CAND_NAME                  0
CAND_PTY_AFFILIATION       7
CAND_ELECTION_YR           0
CAND_OFFICE_ST             0
CAND_OFFICE                0
CAND_OFFICE_DISTRICT       7
CAND_ICI                 322
CAND_STATUS                0
CAND_PCC                1421
CAND_ST1                 189
CAND_ST2                6844
CAND_CITY                  3
CAND_ST                  179
CAND_ZIP                 190
dtype: int64

In [181]:
# replace null cand_pty_affiliation with NNE(None) code
cand_master_df['CAND_PTY_AFFILIATION'] = cand_master_df['CAND_PTY_AFFILIATION'].fillna('NNE')

In [None]:
# CAND_OFFICE_DISTRICT
cand_master_df['CAND_OFFICE_DISTRICT'] = cand_master_df['CAND_OFFICE_DISTRICT'].fillna(0.0)
cand_master_df['CAND_OFFICE_DISTRICT'] = cand_master_df['CAND_OFFICE_DISTRICT'].astype(int)

In [183]:
# CAND_ICI fill with random choice between other three values
unique_ici = cand_master_df['CAND_ICI'].dropna().unique()
cand_master_df['CAND_ICI'] = cand_master_df['CAND_ICI'].apply(lambda x: np.random.choice(unique_ici) if pd.isna(x) else x)

In [184]:
# CAND_PCC
uniquer_pcc = cand_master_df['CAND_PCC'].dropna().unique()
cand_master_df['CAND_PCC'] = cand_master_df['CAND_PCC'].apply(lambda x: np.random.choice(uniquer_pcc) if pd.isna(x) else x)

In [185]:
# 'CAND_CITY' drop  null rows
cand_master_df = cand_master_df.dropna(subset=['CAND_CITY'])

In [191]:
# Load local JSON file
with open("C:/Users/mdsoh/Downloads/USCities.json", "r") as f:
    data = json.load(f)

zip_df = pd.DataFrame(data)

# Format city/state to align with your main DataFrame
zip_df['city'] = zip_df['city'].str.upper()
zip_df['state'] = zip_df['state'].str.upper()
zip_df['zip_code'] = zip_df['zip_code'].astype(str).str.zfill(5)

# Drop duplicates so only the first ZIP per city/state is kept
zip_df = zip_df.drop_duplicates(subset=['city', 'state'])

def get_first_zip(city, state):
    state_df = zip_df[(zip_df['state']==state) & (zip_df['city']==city)]
    if state_df.empty:
        return 0
    return state_df['zip_code'].values[0]

cand_master_df['CAND_CITY'] = cand_master_df['CAND_CITY'].str.upper()
cand_master_df['CAND_ST'] = cand_master_df['CAND_ST'].str.upper()

# Normalize and identify null ZIPs
cand_master_df['CAND_ZIP'] = cand_master_df['CAND_ZIP'].astype(str).str.replace('.0', '', regex=False)
cand_master_df.loc[cand_master_df['CAND_ZIP'].isin(['nan', 'None', '','NAN']), 'CAND_ZIP'] = None

# Fill null ZIPs with a random one based on city/state
mask = cand_master_df['CAND_ZIP'].isnull()
cand_master_df.loc[mask, 'CAND_ZIP'] = cand_master_df[mask].apply(
    lambda row: get_first_zip(row['CAND_CITY'], row['CAND_ST']),
    axis=1
)

cand_master_df['CAND_ZIP'] = cand_master_df['CAND_ZIP'].astype(int)

In [192]:
cand_master_df.shape

(7755, 15)

## 2. committee master

In [254]:
comm_master_df = pd.read_csv("D:/DE_Project_FEC/fec_env/big-data-fec-project/data/raw/committe_master/committe_master.csv")
comm_master_df.head(2)

Unnamed: 0,CMTE_ID,CMTE_NM,TRES_NM,CMTE_ST1,CMTE_ST2,CMTE_CITY,CMTE_ST,CMTE_ZIP,CMTE_DSGN,CMTE_TP,CMTE_PTY_AFFILIATION,CMTE_FILING_FREQ,ORG_TP,CONNECTED_ORG_NM,CAND_ID
0,C00000059,HALLMARK CARDS PAC,SARAH MOE,2501 MCGEE,MD #500,KANSAS CITY,MO,64108,U,Q,UNK,M,C,,
1,C00000422,AMERICAN MEDICAL ASSOCIATION POLITICAL ACTION ...,"WALKER, KEVIN MR.","25 MASSACHUSETTS AVE, NW",SUITE 600,WASHINGTON,DC,200017400,B,Q,,M,,DELAWARE MEDICAL PAC,


In [255]:
# data type check
comm_master_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18286 entries, 0 to 18285
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   CMTE_ID               18286 non-null  object
 1   CMTE_NM               18284 non-null  object
 2   TRES_NM               17557 non-null  object
 3   CMTE_ST1              18277 non-null  object
 4   CMTE_ST2              4519 non-null   object
 5   CMTE_CITY             18277 non-null  object
 6   CMTE_ST               18278 non-null  object
 7   CMTE_ZIP              18273 non-null  object
 8   CMTE_DSGN             18281 non-null  object
 9   CMTE_TP               18281 non-null  object
 10  CMTE_PTY_AFFILIATION  7796 non-null   object
 11  CMTE_FILING_FREQ      18286 non-null  object
 12  ORG_TP                3253 non-null   object
 13  CONNECTED_ORG_NM      9273 non-null   object
 14  CAND_ID               6718 non-null   object
dtypes: object(15)
memory usage: 2.1+ MB


In [260]:
comm_master_df.isnull().sum()

CMTE_ID                     0
CMTE_NM                     0
TRES_NM                     0
CMTE_ST1                    1
CMTE_ST2                13759
CMTE_CITY                   0
CMTE_ST                     0
CMTE_ZIP                    4
CMTE_DSGN                   2
CMTE_TP                     2
CMTE_PTY_AFFILIATION        0
CMTE_FILING_FREQ            0
ORG_TP                      0
CONNECTED_ORG_NM            0
CAND_ID                 11562
dtype: int64

In [257]:
# treasured name not found
comm_master_df['TRES_NM'] = comm_master_df['TRES_NM'].fillna('Not Mention')
comm_master_df['CMTE_PTY_AFFILIATION'] = comm_master_df['CMTE_PTY_AFFILIATION'].fillna('NNE')
comm_master_df['ORG_TP'] = comm_master_df['ORG_TP'].fillna('N')
comm_master_df['CONNECTED_ORG_NM'] = comm_master_df['CONNECTED_ORG_NM'].fillna('NONE')

In [258]:
# 'CMTE_NM','CMTE_CITY' drop  null rows
comm_master_df = comm_master_df.dropna(subset=['CMTE_NM','CMTE_CITY'])

In [259]:
comm_master_df.shape

(18277, 15)

## 3. candidate committee linkage

In [261]:
cand_comm_df = pd.read_csv("D:/DE_Project_FEC/fec_env/big-data-fec-project/data/raw/candidate_committtee_linkage/candidate_committtee_linkage.csv")
cand_comm_df.head(2)

Unnamed: 0,CAND_ID,CAND_ELECTION_YR,FEC_ELECTION_YR,CMTE_ID,CMTE_TP,CMTE_DSGN,LINKAGE_ID
0,C00713602,2019,2020,C00712851,O,U,228963
1,H0AK00105,2020,2020,C00607515,H,P,229250


In [262]:
# data type check
cand_comm_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7055 entries, 0 to 7054
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   CAND_ID           7055 non-null   object
 1   CAND_ELECTION_YR  7055 non-null   int64 
 2   FEC_ELECTION_YR   7055 non-null   int64 
 3   CMTE_ID           7055 non-null   object
 4   CMTE_TP           7055 non-null   object
 5   CMTE_DSGN         7055 non-null   object
 6   LINKAGE_ID        7055 non-null   int64 
dtypes: int64(3), object(4)
memory usage: 385.9+ KB


In [263]:
# null check
cand_comm_df.isnull().sum()

CAND_ID             0
CAND_ELECTION_YR    0
FEC_ELECTION_YR     0
CMTE_ID             0
CMTE_TP             0
CMTE_DSGN           0
LINKAGE_ID          0
dtype: int64

## 4. operating expenditure

In [None]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, DateType, LongType, IntegerType
schedule_schema = StructType([
    StructField("CMTE_ID", StringType(), False),               # VARCHAR2(9)
    StructField("AMNDT_IND", StringType(), True),             # VARCHAR2(1)
    StructField("RPT_YR", IntegerType(), True),               # Number(4)
    StructField("RPT_TP", StringType(), True),                # VARCHAR2(3)
    StructField("IMAGE_NUM", StringType(), True),             # VARCHAR2(11/18)
    StructField("LINE_NUM", StringType(), True),
    StructField("FORM_TP_CD", StringType(), True),
    StructField("SCHED_TP_CD", StringType(), True),
    StructField("NAME", StringType(), True),
    StructField("CITY", StringType(), True),
    StructField("STATE", StringType(), True),
    StructField("ZIP_CODE", StringType(), True),
    StructField("TRANSACTION_DT", DateType(), True),          # DATE
    StructField("TRANSACTION_AMT", DoubleType(), True),       # NUMBER(14,2)
    StructField("TRANSACTION_PGI", StringType(), True),       # VARCHAR2(5)
    StructField("PURPOSE", StringType(), True),               # VARCHAR2(100)
    StructField("CATEGORY", StringType(), True),              # VARCHAR2(3)
    StructField("CATEGORY_DESC", StringType(), True),         # VARCHAR2(40)
    StructField("MEMO_CD", StringType(), True),               # VARCHAR2(1)
    StructField("MEMO_TEXT", StringType(), True),             # VARCHAR2(100)
    StructField("ENTITY_TP", StringType(), True),             # VARCHAR2(3)    
    StructField("SUB_ID", LongType(), False),                  # NUMBER(19)
    StructField("FILE_NUM", IntegerType(), True),             # NUMBER(7)
    StructField("TRAN_ID", StringType(), True),               # VARCHAR2(32)
    StructField("BACK_REF_TRAN_ID", StringType(), True)       # VARCHAR2(32)
])

In [66]:
oper_exp_df = spark.read.format("csv") \
    .option("header", "true") \
    .option("delimiter", "|") \
    .option("badRecordsPath", "D:/Sohail_DE_Project/fec_env/big-data-fec-project/data/raw/operating_expense/badrecords.csv") \
    .schema(schedule_schema) \
    .load("D:/Sohail_DE_Project/fec_env/big-data-fec-project/data/raw/operating_expense/oppexp.txt")

In [67]:
oper_exp_df.show(5)

+---------+---------+------+------+------------------+--------+----------+-----------+--------------------+-------------+-----+---------+--------------+---------------+---------------+--------------------+--------+--------------------+-------+---------+---------+-------------------+--------+-----------+----------------+
|  CMTE_ID|AMNDT_IND|RPT_YR|RPT_TP|         IMAGE_NUM|LINE_NUM|FORM_TP_CD|SCHED_TP_CD|                NAME|         CITY|STATE| ZIP_CODE|TRANSACTION_DT|TRANSACTION_AMT|TRANSACTION_PGI|             PURPOSE|CATEGORY|       CATEGORY_DESC|MEMO_CD|MEMO_TEXT|ENTITY_TP|             SUB_ID|FILE_NUM|    TRAN_ID|BACK_REF_TRAN_ID|
+---------+---------+------+------+------------------+--------+----------+-----------+--------------------+-------------+-----+---------+--------------+---------------+---------------+--------------------+--------+--------------------+-------+---------+---------+-------------------+--------+-----------+----------------+
|C00639872|        T|  2019|   TER

In [68]:
oper_exp_df.rdd.getNumPartitions()

20

In [69]:
oper_exp_df.count()

2310524

In [3]:
#oper_exp_df = pd.read_csv("D:/DE_Project_FEC/fec_env/big-data-fec-project/data/raw/operating_expenditure/oppexp.txt")
oper_exp_df = pd.read_csv("D:/DE_Project_FEC/fec_env/big-data-fec-project/data/raw/operating_expenditure/oppexp.txt", delimiter="|",on_bad_lines='skip')
oper_exp_df.head(2)

  oper_exp_df = pd.read_csv("D:/DE_Project_FEC/fec_env/big-data-fec-project/data/raw/operating_expenditure/oppexp.txt", delimiter="|",on_bad_lines='skip')


Unnamed: 0,CMTE_ID,AMNDT_IND,RPT_YR,RPT_TP,IMAGE_NUM,LINE_NUM,FORM_TP_CD,SCHED_TP_CD,NAME,CITY,...,CATEGORY,CATEGORY_DESC,MEMO_CD,MEMO_TEXT,ENTITY_TP,SUB_ID,FILE_NUM,TRAN_ID,BACK_REF_TRAN_ID,Unnamed: 25
0,C00639872,T,2019,TER,201901209143894009,17,F3,SB,ADMINISTRATIVE BUSINESS SERVICES,CHARLOTTE,...,,,,,ORG,4021520191639587673,1305744,VTPY09W3DX1,,
1,C00639872,T,2019,TER,201901209143894009,17,F3,SB,ADP INC,ROSELAND,...,,,,,ORG,4021520191639587675,1305744,VTPY09W2JW7,,


In [4]:
oper_exp_df.shape

(2310524, 26)

In [8]:
# data type check
oper_exp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2310524 entries, 0 to 2310523
Data columns (total 26 columns):
 #   Column            Dtype  
---  ------            -----  
 0   CMTE_ID           object 
 1   AMNDT_IND         object 
 2   RPT_YR            int64  
 3   RPT_TP            object 
 4   IMAGE_NUM         int64  
 5   LINE_NUM          object 
 6   FORM_TP_CD        object 
 7   SCHED_TP_CD       object 
 8   NAME              object 
 9   CITY              object 
 10  STATE             object 
 11  ZIP_CODE          object 
 12  TRANSACTION_DT    object 
 13  TRANSACTION_AMT   float64
 14  TRANSACTION_PGI   object 
 15  PURPOSE           object 
 16  CATEGORY          object 
 17  CATEGORY_DESC     object 
 18  MEMO_CD           object 
 19  MEMO_TEXT         object 
 20  ENTITY_TP         object 
 21  SUB_ID            int64  
 22  FILE_NUM          int64  
 23  TRAN_ID           object 
 24  BACK_REF_TRAN_ID  object 
 25  Unnamed: 25       float64
dtypes: float64(2),

In [9]:
# null check 
oper_exp_df.isnull().sum()

CMTE_ID                   0
AMNDT_IND                 0
RPT_YR                    0
RPT_TP                    0
IMAGE_NUM                 0
LINE_NUM                  0
FORM_TP_CD                0
SCHED_TP_CD               0
NAME                    158
CITY                   6255
STATE                  6463
ZIP_CODE               7803
TRANSACTION_DT           15
TRANSACTION_AMT           0
TRANSACTION_PGI      924380
PURPOSE               10154
CATEGORY            1800724
CATEGORY_DESC       1808169
MEMO_CD             1524383
MEMO_TEXT           1718434
ENTITY_TP              4593
SUB_ID                    0
FILE_NUM                  0
TRAN_ID                   0
BACK_REF_TRAN_ID    1619641
Unnamed: 25         2310524
dtype: int64

In [10]:
# Schedule type
oper_exp_df['SCHED_TP_CD'] = oper_exp_df['SCHED_TP_CD'].fillna('Not Mention')

In [11]:
# Contributor/Lender/Transfer Name
oper_exp_df['NAME'] = oper_exp_df['NAME'].fillna('Not Mention')

In [12]:
#  TRANSACTION_DT
oper_exp_df['TRANSACTION_DT'].value_counts()

TRANSACTION_DT
02/14/2020    17462
01/31/2020    16982
02/28/2020    14728
09/30/2020    13754
03/13/2020    12150
              ...  
01/06/2018        1
03/20/2015        1
01/15/2015        1
10/05/2022        1
12/15/2021        1
Name: count, Length: 1134, dtype: int64

## 5. individual contribution

In [None]:
import pandas as pd

# File path
file_path = "D:/DE_Project_FEC/fec_env/big-data-fec-project/data/raw/individual_contribution/itcont.txt"

# Load headers from Excel
headers_excel_path = "D:/DE_Project_FEC/fec_env/big-data-fec-project/data/raw/individual_contribution/indiv_header_file.csv"
headers = pd.read_csv(headers_excel_path).columns.tolist()

# Read the txt file (pipe-delimited) without headers
df = pd.read_csv(file_path, delimiter="|", header=None, names=headers,on_bad_lines='skip')

# Sample 10% of the rows
sample_df = df.sample(frac=0.1, random_state=42)

# Show sample
sample_df.head(2)


In [None]:

# Define paths
txt_file_path = "D:/DE_Project_FEC/fec_env/big-data-fec-project/data/raw/individual_contribution/itcont.txt"
headers_csv_path = "D:/DE_Project_FEC/fec_env/big-data-fec-project/data/raw/individual_contribution/indiv_header_file.csv"

# Load headers from CSV using pandas (Spark cannot directly extract just headers from a CSV)
import pandas as pd
headers = [
    "CMTE_ID",
    "AMNDT_IND",
    "RPT_TP",
    "TRANSACTION_PGI",
    "IMAGE_NUM",
    "TRANSACTION_TP",
    "ENTITY_TP",
    "NAME",
    "CITY",
    "STATE",
    "ZIP_CODE",
    "EMPLOYER",
    "OCCUPATION",
    "TRANSACTION_DT",
    "TRANSACTION_AMT",
    "OTHER_ID",
    "TRAN_ID",
    "FILE_NUM",
    "MEMO_CD",
    "MEMO_TEXT",
    "SUB_ID"
]

# Read the text file using Spark
df = spark.read.option("delimiter", "|") \
    .option("header", "false") \
    .csv(txt_file_path)

# Assign headers manually
df = df.toDF(*headers)

# Sample 10% of the rows with a fixed random seed
sample_df = df.sample(withReplacement=False, fraction=0.1, seed=42)

# Show sample
sample_df.show(2)


## 6. PAC Summary

In [37]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, DateType

pac_summary_schema = StructType([
    StructField("CMTE_ID", StringType(), True),
    StructField("CMTE_NM", StringType(), True),
    StructField("CMTE_TP", StringType(), True),
    StructField("CMTE_DSGN", StringType(), True),
    StructField("CMTE_FILING_FREQ", StringType(), True),
    StructField("TTL_RECEIPTS", DoubleType(), True),
    StructField("TRANS_FROM_AFF", DoubleType(), True),
    StructField("INDV_CONTRIB", DoubleType(), True),
    StructField("OTHER_POL_CMTE_CONTRIB", DoubleType(), True),
    StructField("CAND_CONTRIB", DoubleType(), True),
    StructField("CAND_LOANS", DoubleType(), True),
    StructField("TTL_LOANS_RECEIVED", DoubleType(), True),
    StructField("TTL_DISB", DoubleType(), True),
    StructField("TRANF_TO_AFF", DoubleType(), True),
    StructField("INDV_REFUNDS", DoubleType(), True),
    StructField("OTHER_POL_CMTE_REFUNDS", DoubleType(), True),
    StructField("CAND_LOAN_REPAY", DoubleType(), True),
    StructField("LOAN_REPAY", DoubleType(), True),
    StructField("COH_BOP", DoubleType(), True),
    StructField("COH_COP", DoubleType(), True),
    StructField("DEBTS_OWED_BY", DoubleType(), True),
    StructField("NONFED_TRANS_RECEIVED", DoubleType(), True),
    StructField("CONTRIB_TO_OTHER_CMTE", DoubleType(), True),
    StructField("IND_EXP", DoubleType(), True),
    StructField("PTY_COORD_EXP", DoubleType(), True),
    StructField("NONFED_SHARE_EXP", DoubleType(), True),
    StructField("CVG_END_DT", StringType(), True)  # Use StringType initially, can convert to DateType later
])


In [None]:
pac_summ_df = spark.read.format('csv') \
    .option('header', 'true') \
    .option('delimiter', '|') \
    .schema(pac_summary_schema) \
    .load("D:/Sohail_DE_Project/fec_env/big-data-fec-project/data/raw/pac_summary/webk20.txt")


In [41]:
pac_summ_df.printSchema()

root
 |-- CMTE_ID: string (nullable = true)
 |-- CMTE_NM: string (nullable = true)
 |-- CMTE_TP: string (nullable = true)
 |-- CMTE_DSGN: string (nullable = true)
 |-- CMTE_FILING_FREQ: string (nullable = true)
 |-- TTL_RECEIPTS: double (nullable = true)
 |-- TRANS_FROM_AFF: double (nullable = true)
 |-- INDV_CONTRIB: double (nullable = true)
 |-- OTHER_POL_CMTE_CONTRIB: double (nullable = true)
 |-- CAND_CONTRIB: double (nullable = true)
 |-- CAND_LOANS: double (nullable = true)
 |-- TTL_LOANS_RECEIVED: double (nullable = true)
 |-- TTL_DISB: double (nullable = true)
 |-- TRANF_TO_AFF: double (nullable = true)
 |-- INDV_REFUNDS: double (nullable = true)
 |-- OTHER_POL_CMTE_REFUNDS: double (nullable = true)
 |-- CAND_LOAN_REPAY: double (nullable = true)
 |-- LOAN_REPAY: double (nullable = true)
 |-- COH_BOP: double (nullable = true)
 |-- COH_COP: double (nullable = true)
 |-- DEBTS_OWED_BY: double (nullable = true)
 |-- NONFED_TRANS_RECEIVED: double (nullable = true)
 |-- CONTRIB_TO_OT

In [40]:
from pyspark.sql.functions import to_date

pac_summ_df = pac_summ_df.withColumn("CVG_END_DT", to_date("CVG_END_DT", "MM/dd/yyyy"))


In [60]:
# Find Count of Null, None, NaN of All DataFrame Columns
from pyspark.sql.functions import col,isnan, when, count
pac_summ_df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in pac_summ_df.columns]
   ).show()

AnalysisException: [DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE] Cannot resolve "isnan(CVG_END_DT)" due to data type mismatch: Parameter 1 requires the ("DOUBLE" or "FLOAT") type, however "CVG_END_DT" has the type "DATE".;
'Aggregate [count(CASE WHEN (isnan(cast(CMTE_ID#1243 as double)) OR isnull(CMTE_ID#1243)) THEN CMTE_ID END) AS CMTE_ID#2305L, count(CASE WHEN (isnan(cast(CMTE_NM#1244 as double)) OR isnull(CMTE_NM#1244)) THEN CMTE_NM END) AS CMTE_NM#2307L, count(CASE WHEN (isnan(cast(CMTE_TP#1245 as double)) OR isnull(CMTE_TP#1245)) THEN CMTE_TP END) AS CMTE_TP#2309L, count(CASE WHEN (isnan(cast(CMTE_DSGN#1246 as double)) OR isnull(CMTE_DSGN#1246)) THEN CMTE_DSGN END) AS CMTE_DSGN#2311L, count(CASE WHEN (isnan(cast(CMTE_FILING_FREQ#1247 as double)) OR isnull(CMTE_FILING_FREQ#1247)) THEN CMTE_FILING_FREQ END) AS CMTE_FILING_FREQ#2313L, count(CASE WHEN (isnan(TTL_RECEIPTS#1248) OR isnull(TTL_RECEIPTS#1248)) THEN TTL_RECEIPTS END) AS TTL_RECEIPTS#2315L, count(CASE WHEN (isnan(TRANS_FROM_AFF#1249) OR isnull(TRANS_FROM_AFF#1249)) THEN TRANS_FROM_AFF END) AS TRANS_FROM_AFF#2317L, count(CASE WHEN (isnan(INDV_CONTRIB#1250) OR isnull(INDV_CONTRIB#1250)) THEN INDV_CONTRIB END) AS INDV_CONTRIB#2319L, count(CASE WHEN (isnan(OTHER_POL_CMTE_CONTRIB#1251) OR isnull(OTHER_POL_CMTE_CONTRIB#1251)) THEN OTHER_POL_CMTE_CONTRIB END) AS OTHER_POL_CMTE_CONTRIB#2321L, count(CASE WHEN (isnan(CAND_CONTRIB#1252) OR isnull(CAND_CONTRIB#1252)) THEN CAND_CONTRIB END) AS CAND_CONTRIB#2323L, count(CASE WHEN (isnan(CAND_LOANS#1253) OR isnull(CAND_LOANS#1253)) THEN CAND_LOANS END) AS CAND_LOANS#2325L, count(CASE WHEN (isnan(TTL_LOANS_RECEIVED#1254) OR isnull(TTL_LOANS_RECEIVED#1254)) THEN TTL_LOANS_RECEIVED END) AS TTL_LOANS_RECEIVED#2327L, count(CASE WHEN (isnan(TTL_DISB#1255) OR isnull(TTL_DISB#1255)) THEN TTL_DISB END) AS TTL_DISB#2329L, count(CASE WHEN (isnan(TRANF_TO_AFF#1256) OR isnull(TRANF_TO_AFF#1256)) THEN TRANF_TO_AFF END) AS TRANF_TO_AFF#2331L, count(CASE WHEN (isnan(INDV_REFUNDS#1257) OR isnull(INDV_REFUNDS#1257)) THEN INDV_REFUNDS END) AS INDV_REFUNDS#2333L, count(CASE WHEN (isnan(OTHER_POL_CMTE_REFUNDS#1258) OR isnull(OTHER_POL_CMTE_REFUNDS#1258)) THEN OTHER_POL_CMTE_REFUNDS END) AS OTHER_POL_CMTE_REFUNDS#2335L, count(CASE WHEN (isnan(CAND_LOAN_REPAY#1259) OR isnull(CAND_LOAN_REPAY#1259)) THEN CAND_LOAN_REPAY END) AS CAND_LOAN_REPAY#2337L, count(CASE WHEN (isnan(LOAN_REPAY#1260) OR isnull(LOAN_REPAY#1260)) THEN LOAN_REPAY END) AS LOAN_REPAY#2339L, count(CASE WHEN (isnan(COH_BOP#1261) OR isnull(COH_BOP#1261)) THEN COH_BOP END) AS COH_BOP#2341L, count(CASE WHEN (isnan(COH_COP#1262) OR isnull(COH_COP#1262)) THEN COH_COP END) AS COH_COP#2343L, count(CASE WHEN (isnan(DEBTS_OWED_BY#1263) OR isnull(DEBTS_OWED_BY#1263)) THEN DEBTS_OWED_BY END) AS DEBTS_OWED_BY#2345L, count(CASE WHEN (isnan(NONFED_TRANS_RECEIVED#1264) OR isnull(NONFED_TRANS_RECEIVED#1264)) THEN NONFED_TRANS_RECEIVED END) AS NONFED_TRANS_RECEIVED#2347L, count(CASE WHEN (isnan(CONTRIB_TO_OTHER_CMTE#1265) OR isnull(CONTRIB_TO_OTHER_CMTE#1265)) THEN CONTRIB_TO_OTHER_CMTE END) AS CONTRIB_TO_OTHER_CMTE#2349L, count(CASE WHEN (isnan(IND_EXP#1266) OR isnull(IND_EXP#1266)) THEN IND_EXP END) AS IND_EXP#2351L, ... 3 more fields]
+- Project [CMTE_ID#1243, CMTE_NM#1244, CMTE_TP#1245, CMTE_DSGN#1246, CMTE_FILING_FREQ#1247, TTL_RECEIPTS#1248, TRANS_FROM_AFF#1249, INDV_CONTRIB#1250, OTHER_POL_CMTE_CONTRIB#1251, CAND_CONTRIB#1252, CAND_LOANS#1253, TTL_LOANS_RECEIVED#1254, TTL_DISB#1255, TRANF_TO_AFF#1256, INDV_REFUNDS#1257, OTHER_POL_CMTE_REFUNDS#1258, CAND_LOAN_REPAY#1259, LOAN_REPAY#1260, COH_BOP#1261, COH_COP#1262, DEBTS_OWED_BY#1263, NONFED_TRANS_RECEIVED#1264, CONTRIB_TO_OTHER_CMTE#1265, IND_EXP#1266, ... 3 more fields]
   +- Relation [CMTE_ID#1243,CMTE_NM#1244,CMTE_TP#1245,CMTE_DSGN#1246,CMTE_FILING_FREQ#1247,TTL_RECEIPTS#1248,TRANS_FROM_AFF#1249,INDV_CONTRIB#1250,OTHER_POL_CMTE_CONTRIB#1251,CAND_CONTRIB#1252,CAND_LOANS#1253,TTL_LOANS_RECEIVED#1254,TTL_DISB#1255,TRANF_TO_AFF#1256,INDV_REFUNDS#1257,OTHER_POL_CMTE_REFUNDS#1258,CAND_LOAN_REPAY#1259,LOAN_REPAY#1260,COH_BOP#1261,COH_COP#1262,DEBTS_OWED_BY#1263,NONFED_TRANS_RECEIVED#1264,CONTRIB_TO_OTHER_CMTE#1265,IND_EXP#1266,... 3 more fields] csv


In [61]:
spark.stop()