In [1]:
import pandas as pd
import json
import requests
import numpy as np
import os

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Data Cleaning App") \
    .master("local[*]") \
    .config("spark.driver.bindAddress", "127.0.0.1") \
    .config("spark.driver.host", "127.0.0.1") \
    .getOrCreate()


In [3]:
spark

In [None]:
spark.stop()

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

spark = SparkSession.builder.appName("FECAnalytics").getOrCreate()

# Load datasets with correct column names
candidate_master = spark.read.csv("path/to/candidate_master.csv", header=True, inferSchema=True)
committee_master = spark.read.csv("path/to/committee_master.csv", header=True, inferSchema=True)
committee_trans = spark.read.csv("path/to/committee_to_committee_trans.csv", header=True, inferSchema=True)
operating_exp = spark.read.csv("path/to/operating_expenditures.csv", header=True, inferSchema=True)
cand_cmte_link = spark.read.csv("path/to/candidate_committee_linkage.csv", header=True, inferSchema=True)
pac_summary = spark.read.csv("path/to/pac_summary.csv", header=True, inferSchema=True)
individual_contrib = spark.read.csv("path/to/Individual_contribution.csv", header=True, inferSchema=True)

In [19]:
# reading candidate master
cand_master_schema = StructType([
            StructField("CAND_ID", StringType(), False),           # Not nullable
            StructField("CAND_NAME", StringType(), True),
            StructField("CAND_PTY_AFFILIATION", StringType(), True),
            StructField("CAND_ELECTION_YR", IntegerType(), True),
            StructField("CAND_OFFICE_ST", StringType(), True),
            StructField("CAND_OFFICE", StringType(), True),
            StructField("CAND_OFFICE_DISTRICT", StringType(), True),
            StructField("CAND_ICI", StringType(), True),
            StructField("CAND_STATUS", StringType(), True),
            StructField("CAND_PCC", StringType(), True),
            StructField("CAND_ST1", StringType(), True),
            StructField("CAND_ST2", StringType(), True),
            StructField("CAND_CITY", StringType(), True),
            StructField("CAND_ST", StringType(), True),
            StructField("CAND_ZIP", StringType(), True)
                ])

candidate_master = spark.read.format('csv')\
                .option('header','true')\
                .option("delimiter","|")\
                .schema(cand_master_schema)\
                .load("D:/Sohail_DE_Project/fec_env/big-data-fec-project/data/raw/candidate_master/cn.txt")

In [24]:
# read committe master

comm_master_schema = StructType([
    StructField("CMTE_ID", StringType(), False),                # Not nullable
    StructField("CMTE_NM", StringType(), True),
    StructField("TRES_NM", StringType(), True),
    StructField("CMTE_ST1", StringType(), True),
    StructField("CMTE_ST2", StringType(), True),
    StructField("CMTE_CITY", StringType(), True),
    StructField("CMTE_ST", StringType(), True),
    StructField("CMTE_ZIP", StringType(), True),
    StructField("CMTE_DSGN", StringType(), True),
    StructField("CMTE_TP", StringType(), True),
    StructField("CMTE_PTY_AFFILIATION", StringType(), True),
    StructField("CMTE_FILING_FREQ", StringType(), True),
    StructField("ORG_TP", StringType(), True),
    StructField("CONNECTED_ORG_NM", StringType(), True),
    StructField("CAND_ID", StringType(), True)
])

committee_master = spark.read.format('csv')\
                .option('header','true')\
                .option("delimiter","|")\
                .schema(cand_master_schema)\
                .load("D:/Sohail_DE_Project/fec_env/big-data-fec-project/data/raw/committe_master/cm.txt")


In [25]:
committee_master.show(5)

+---------+--------------------+--------------------+----------------+--------------+-------------+--------------------+---------+-----------+--------+--------+--------+---------+--------------------+--------+
|  CAND_ID|           CAND_NAME|CAND_PTY_AFFILIATION|CAND_ELECTION_YR|CAND_OFFICE_ST|  CAND_OFFICE|CAND_OFFICE_DISTRICT| CAND_ICI|CAND_STATUS|CAND_PCC|CAND_ST1|CAND_ST2|CAND_CITY|             CAND_ST|CAND_ZIP|
+---------+--------------------+--------------------+----------------+--------------+-------------+--------------------+---------+-----------+--------+--------+--------+---------+--------------------+--------+
|C00000059|  HALLMARK CARDS PAC|           SARAH MOE|            NULL|       MD #500|  KANSAS CITY|                  MO|    64108|          U|       Q|     UNK|       M|        C|                NULL|    NULL|
|C00000422|AMERICAN MEDICAL ...|   WALKER, KEVIN MR.|            NULL|     SUITE 600|   WASHINGTON|                  DC|200017400|          B|       Q|    NULL|

In [28]:
# reading committee transaction
comm_trans_schema = StructType([
    StructField("CMTE_ID", StringType(), False),               # Not nullable
    StructField("AMNDT_IND", StringType(), True),
    StructField("RPT_TP", StringType(), True),
    StructField("TRANSACTION_PGI", StringType(), True),
    StructField("IMAGE_NUM", StringType(), True),              # VARCHAR2(11) or (18) → String
    StructField("TRANSACTION_TP", StringType(), True),
    StructField("ENTITY_TP", StringType(), True),
    StructField("NAME", StringType(), True),
    StructField("CITY", StringType(), True),
    StructField("STATE", StringType(), True),
    StructField("ZIP_CODE", StringType(), True),
    StructField("EMPLOYER", StringType(), True),
    StructField("OCCUPATION", StringType(), True),
    StructField("TRANSACTION_DT", DateType(), True),           # MMDDYYYY - will need date parsing
    StructField("TRANSACTION_AMT", DoubleType(), True),        # NUMBER(14,2)
    StructField("OTHER_ID", StringType(), True),
    StructField("TRAN_ID", StringType(), True),
    StructField("FILE_NUM", LongType(), True),                 # NUMBER(22) → Long
    StructField("MEMO_CD", StringType(), True),
    StructField("MEMO_TEXT", StringType(), True),
    StructField("SUB_ID", LongType(), False)                   # Not nullable
])

committee_trans = spark.read.format('csv')\
                .option('header','true')\
                .option("delimiter","|")\
                .schema(comm_trans_schema)\
                .load("D:/Sohail_DE_Project/fec_env/big-data-fec-project/data/raw/committee_to_committee_transaction/itoth.txt")

In [29]:
committee_trans.show(5)

+---------+---------+------+---------------+------------------+--------------+---------+--------------------+----------+-----+---------+--------------------+----------+--------------+---------------+--------+--------------------+--------+-------+--------------------+-------------------+
|  CMTE_ID|AMNDT_IND|RPT_TP|TRANSACTION_PGI|         IMAGE_NUM|TRANSACTION_TP|ENTITY_TP|                NAME|      CITY|STATE| ZIP_CODE|            EMPLOYER|OCCUPATION|TRANSACTION_DT|TRANSACTION_AMT|OTHER_ID|             TRAN_ID|FILE_NUM|MEMO_CD|           MEMO_TEXT|             SUB_ID|
+---------+---------+------+---------------+------------------+--------------+---------+--------------------+----------+-----+---------+--------------------+----------+--------------+---------------+--------+--------------------+--------+-------+--------------------+-------------------+
|C00161810|        A|    M3|              P|202004209219753657|           10J|      ORG|THE CHICKASAW NATION|       ADA|   OK|748209255|

In [30]:
# reading operating expense
oper_exp_schema = StructType([
    StructField("CMTE_ID", StringType(), False),               # VARCHAR2(9)
    StructField("AMNDT_IND", StringType(), True),             # VARCHAR2(1)
    StructField("RPT_YR", IntegerType(), True),               # Number(4)
    StructField("RPT_TP", StringType(), True),                # VARCHAR2(3)
    StructField("IMAGE_NUM", StringType(), True),             # VARCHAR2(11/18)
    StructField("LINE_NUM", StringType(), True),
    StructField("FORM_TP_CD", StringType(), True),
    StructField("SCHED_TP_CD", StringType(), True),
    StructField("NAME", StringType(), True),
    StructField("CITY", StringType(), True),
    StructField("STATE", StringType(), True),
    StructField("ZIP_CODE", StringType(), True),
    StructField("TRANSACTION_DT", DateType(), True),          # DATE
    StructField("TRANSACTION_AMT", DoubleType(), True),       # NUMBER(14,2)
    StructField("TRANSACTION_PGI", StringType(), True),       # VARCHAR2(5)
    StructField("PURPOSE", StringType(), True),               # VARCHAR2(100)
    StructField("CATEGORY", StringType(), True),              # VARCHAR2(3)
    StructField("CATEGORY_DESC", StringType(), True),         # VARCHAR2(40)
    StructField("MEMO_CD", StringType(), True),               # VARCHAR2(1)
    StructField("MEMO_TEXT", StringType(), True),             # VARCHAR2(100)
    StructField("ENTITY_TP", StringType(), True),             # VARCHAR2(3)    
    StructField("SUB_ID", LongType(), False),                  # NUMBER(19)
    StructField("FILE_NUM", IntegerType(), True),             # NUMBER(7)
    StructField("TRAN_ID", StringType(), True),               # VARCHAR2(32)
    StructField("BACK_REF_TRAN_ID", StringType(), True)       # VARCHAR2(32)
])

operating_exp = spark.read.format('csv')\
                .option('header','true')\
                .option("delimiter","|")\
                .schema(oper_exp_schema)\
                .load("D:/Sohail_DE_Project/fec_env/big-data-fec-project/data/raw/operating_expense/oppexp.txt")

In [31]:
operating_exp.show(5)

+---------+---------+------+------+------------------+--------+----------+-----------+--------------------+-------------+-----+---------+--------------+---------------+---------------+--------------------+--------+--------------------+-------+---------+---------+-------------------+--------+-----------+----------------+
|  CMTE_ID|AMNDT_IND|RPT_YR|RPT_TP|         IMAGE_NUM|LINE_NUM|FORM_TP_CD|SCHED_TP_CD|                NAME|         CITY|STATE| ZIP_CODE|TRANSACTION_DT|TRANSACTION_AMT|TRANSACTION_PGI|             PURPOSE|CATEGORY|       CATEGORY_DESC|MEMO_CD|MEMO_TEXT|ENTITY_TP|             SUB_ID|FILE_NUM|    TRAN_ID|BACK_REF_TRAN_ID|
+---------+---------+------+------+------------------+--------+----------+-----------+--------------------+-------------+-----+---------+--------------+---------------+---------------+--------------------+--------+--------------------+-------+---------+---------+-------------------+--------+-----------+----------------+
|C00639872|        T|  2019|   TER

In [38]:
# reading candidate committee linkage
cand_cmte_link_schema = StructType([
    StructField("CAND_ID",StringType(),False),
    StructField("CAND_ELECTION_YR",IntegerType(),False),
    StructField("FEC_ELECTION_YR",IntegerType(),False),
    StructField("CMTE_ID",StringType(),True),
    StructField("CMTE_TP",StringType(),True),
    StructField("CMTE_DSGN",StringType(),True),
    StructField("LINKAGE_ID",IntegerType(),False)

])


cand_cmte_link = spark.read.format('csv')\
                .option('header','true')\
                .option("delimiter","|")\
                .schema(cand_cmte_link_schema)\
                .load("D:/Sohail_DE_Project/fec_env/big-data-fec-project/data/raw/candidate_committtee_linkage/ccl.txt")

In [39]:
cand_cmte_link.show(5)

+---------+----------------+---------------+---------+-------+---------+----------+
|  CAND_ID|CAND_ELECTION_YR|FEC_ELECTION_YR|  CMTE_ID|CMTE_TP|CMTE_DSGN|LINKAGE_ID|
+---------+----------------+---------------+---------+-------+---------+----------+
|C00713602|            2019|           2020|C00712851|      O|        U|    228963|
|H0AK00105|            2020|           2020|C00607515|      H|        P|    229250|
|H0AL01055|            2020|           2020|C00697789|      H|        P|    226125|
|H0AL01063|            2020|           2020|C00701557|      H|        P|    227053|
|H0AL01071|            2020|           2020|C00701409|      H|        P|    227054|
+---------+----------------+---------------+---------+-------+---------+----------+
only showing top 5 rows



In [44]:
pac_summary_schema = StructType([
    StructField("CMTE_ID", StringType(), False),                          # Not nullable
    StructField("CMTE_NM", StringType(), True),
    StructField("CMTE_TP", StringType(), True),
    StructField("CMTE_DSGN", StringType(), True),
    StructField("CMTE_FILING_FREQ", StringType(), True),
    StructField("TTL_RECEIPTS", DoubleType(), True),
    StructField("TRANS_FROM_AFF", DoubleType(), True),
    StructField("INDV_CONTRIB", DoubleType(), True),
    StructField("OTHER_POL_CMTE_CONTRIB", DoubleType(), True),
    StructField("CAND_CONTRIB", DoubleType(), True),
    StructField("CAND_LOANS", DoubleType(), True),
    StructField("TTL_LOANS_RECEIVED", DoubleType(), True),
    StructField("TTL_DISB", DoubleType(), True),
    StructField("TRANF_TO_AFF", DoubleType(), True),
    StructField("INDV_REFUNDS", DoubleType(), True),
    StructField("OTHER_POL_CMTE_REFUNDS", DoubleType(), True),
    StructField("CAND_LOAN_REPAY", DoubleType(), True),
    StructField("LOAN_REPAY", DoubleType(), True),
    StructField("COH_BOP", DoubleType(), True),
    StructField("COH_COP", DoubleType(), True),
    StructField("DEBTS_OWED_BY", DoubleType(), True),
    StructField("NONFED_TRANS_RECEIVED", DoubleType(), True),
    StructField("CONTRIB_TO_OTHER_CMTE", DoubleType(), True),
    StructField("IND_EXP", DoubleType(), True),
    StructField("PTY_COORD_EXP", DoubleType(), True),
    StructField("NONFED_SHARE_EXP", DoubleType(), True),
    StructField("CVG_END_DT", StringType(), True)                           # Needs conversion from MM/DD/YYYY
])

pac_summary= spark.read.format('csv')\
                .option('header','true')\
                .option("delimiter","|")\
                .schema(pac_summary_schema)\
                .load("D:/Sohail_DE_Project/fec_env/big-data-fec-project/data/raw/pac_summary/webk20.txt")

In [46]:
from pyspark.sql.functions import to_date

pac_summary = pac_summary.withColumn("CVG_END_DT", to_date("CVG_END_DT", "MM/dd/yyyy"))

In [47]:
pac_summary.show(5)

+---------+--------------------+-------+---------+----------------+------------+--------------+------------+----------------------+------------+----------+------------------+----------+------------+------------+----------------------+---------------+----------+----------+----------+-------------+---------------------+---------------------+--------+-------------+----------------+----------+
|  CMTE_ID|             CMTE_NM|CMTE_TP|CMTE_DSGN|CMTE_FILING_FREQ|TTL_RECEIPTS|TRANS_FROM_AFF|INDV_CONTRIB|OTHER_POL_CMTE_CONTRIB|CAND_CONTRIB|CAND_LOANS|TTL_LOANS_RECEIVED|  TTL_DISB|TRANF_TO_AFF|INDV_REFUNDS|OTHER_POL_CMTE_REFUNDS|CAND_LOAN_REPAY|LOAN_REPAY|   COH_BOP|   COH_COP|DEBTS_OWED_BY|NONFED_TRANS_RECEIVED|CONTRIB_TO_OTHER_CMTE| IND_EXP|PTY_COORD_EXP|NONFED_SHARE_EXP|CVG_END_DT|
+---------+--------------------+-------+---------+----------------+------------+--------------+------------+----------------------+------------+----------+------------------+----------+------------+------------+---

In [48]:
# reading individual contributor
ind_contr_schema =StructType([
    StructField("CMTE_ID", StringType(), False),             # Not nullable
    StructField("AMNDT_IND", StringType(), True),
    StructField("RPT_TP", StringType(), True),
    StructField("TRANSACTION_PGI", StringType(), True),
    StructField("IMAGE_NUM", StringType(), True),            # Covers VARCHAR2(11) or (18)
    StructField("TRANSACTION_TP", StringType(), True),
    StructField("ENTITY_TP", StringType(), True),
    StructField("NAME", StringType(), True),
    StructField("CITY", StringType(), True),
    StructField("STATE", StringType(), True),
    StructField("ZIP_CODE", StringType(), True),
    StructField("EMPLOYER", StringType(), True),
    StructField("OCCUPATION", StringType(), True),
    StructField("TRANSACTION_DT", DateType(), True),         # Will need parsing if in string format
    StructField("TRANSACTION_AMT", DoubleType(), True),
    StructField("OTHER_ID", StringType(), True),
    StructField("TRAN_ID", StringType(), True),
    StructField("FILE_NUM", LongType(), True),
    StructField("MEMO_CD", StringType(), True),
    StructField("MEMO_TEXT", StringType(), True),
    StructField("SUB_ID", LongType(), False)                 # Not nullable
])

import pandas as pd
headers = [
    "CMTE_ID",
    "AMNDT_IND",
    "RPT_TP",
    "TRANSACTION_PGI",
    "IMAGE_NUM",
    "TRANSACTION_TP",
    "ENTITY_TP",
    "NAME",
    "CITY",
    "STATE",
    "ZIP_CODE",
    "EMPLOYER",
    "OCCUPATION",
    "TRANSACTION_DT",
    "TRANSACTION_AMT",
    "OTHER_ID",
    "TRAN_ID",
    "FILE_NUM",
    "MEMO_CD",
    "MEMO_TEXT",
    "SUB_ID"
]


from pyspark.sql.types import StructType  # Make sure you've already defined ind_contr_schema

# Read 10% sample of the data
individual_contrib = spark.read.format("csv") \
    .option("header", "false") \
    .option("delimiter", "|") \
    .schema(ind_contr_schema) \
    .load("D:/Sohail_DE_Project/fec_env/big-data-fec-project/data/raw/individual_contribution/itcont.txt") \
    .sample(withReplacement=False, fraction=0.1, seed=42)


In [52]:
individual_contrib.rdd.getNumPartitions()

103

In [49]:
individual_contrib.show(5)

+---------+---------+------+---------------+------------------+--------------+---------+---------------+------------+-----+--------+------------------+----------+--------------+---------------+--------+-------------+--------+-------+---------+-------------------+
|  CMTE_ID|AMNDT_IND|RPT_TP|TRANSACTION_PGI|         IMAGE_NUM|TRANSACTION_TP|ENTITY_TP|           NAME|        CITY|STATE|ZIP_CODE|          EMPLOYER|OCCUPATION|TRANSACTION_DT|TRANSACTION_AMT|OTHER_ID|      TRAN_ID|FILE_NUM|MEMO_CD|MEMO_TEXT|             SUB_ID|
+---------+---------+------+---------------+------------------+--------------+---------+---------------+------------+-----+--------+------------------+----------+--------------+---------------+--------+-------------+--------+-------+---------+-------------------+
|C00618371|        A|    Q3|              P|202102099427331574|           15E|      IND| FLANNER, GREGG|     TRENTON|   NJ|   08601|FLANNER ASSOCIATES|   MFG REP|          NULL|           25.0|    NULL|SA11AI

In [50]:
individual_contrib = individual_contrib.withColumn("TRANSACTION_DT", to_date("TRANSACTION_DT", "MMddyyyy"))

## 1. Candidate Funding Analysis

### 1. Compare funding source across candidate

In [None]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, DateType, LongType, IntegerType

In [None]:
# reading candidate committee linkage
cand_comm_schema = StructType([
    StructField("CAND_ID",StringType(),False),
    StructField("CAND_ELECTION_YR",IntegerType(),False),
    StructField("FEC_ELECTION_YR",IntegerType(),False),
    StructField("CMTE_ID",StringType(),True),
    StructField("CMTE_DSGN",StringType(),True),
    StructField("LINKAGE_ID",IntegerType(),False)

])

cand_cmte_link = spark.read.format('csv')\
                .option('header','true')\
                .schema(cand_comm_schema)\
                .load("D:/Sohail_DE_Project/fec_env/big-data-fec-project/data/raw/candidate_committtee_linkage/candidate_committtee_linkage.csv")

In [None]:
cand_cmte_link.show(5)

+---------+----------------+---------------+---------+---------+----------+
|  CAND_ID|CAND_ELECTION_YR|FEC_ELECTION_YR|  CMTE_ID|CMTE_DSGN|LINKAGE_ID|
+---------+----------------+---------------+---------+---------+----------+
|C00713602|            2019|           2020|C00712851|        O|      NULL|
|H0AK00105|            2020|           2020|C00607515|        H|      NULL|
|H0AL01055|            2020|           2020|C00697789|        H|      NULL|
|H0AL01063|            2020|           2020|C00701557|        H|      NULL|
|H0AL01071|            2020|           2020|C00701409|        H|      NULL|
+---------+----------------+---------------+---------+---------+----------+
only showing top 5 rows



### 2. Analyze the geographic distribution of support

### 3. Track Funding momentum over time