Loading the Dataset

In [None]:
import csv

# Location of the raw pay-level export.
DATA_PATH = '/paylevel_texas.csv'

# Parse the file with the csv module rather than splitting each line on ','.
# A plain split breaks any quoted field that itself contains a comma, which
# shifts every following value in that record into the wrong column.
# newline='' is what the csv module expects so that it can handle line endings
# (including newlines embedded in quoted fields) itself.
with open(DATA_PATH, 'r', newline='') as file:
    # Blank lines come back from csv.reader as empty lists; drop them so they
    # do not turn into empty pseudo-records.
    rows = [row for row in csv.reader(file) if row]

if not rows:
    raise ValueError(f"No header row found in {DATA_PATH!r}")

# Extract column names (first row), trimming stray whitespace around each name
column_names = [name.strip() for name in rows[0]]

# Every remaining row is one data record. Field values are kept exactly as
# read here, including any padding.
data = rows[1:]

# Display column names and the first few data records
print("Column Names:", column_names)
print("First few data records:", data[:5])


Column Names: ['AGY', 'NAME', 'JOBCLASS', 'JC TITLE', 'RACE', 'SEX', 'EMPTYPE', 'HIREDT', 'RATE', 'HRSWKD', 'MONTHLY', 'ANNUAL']
First few data records: [['537', 'DEPARTMENT OF STATE HEALTH SERVICES               ', '152', 'ADMINISTRATIVE ASSISTANT II                       ', 'HISPANIC       ', 'FEMALE         ', 'CRF - CLASSIFIED REGULAR FULL-TIME      ', '8/8/2022', '0', '40', '2820.29', '33843.48'], ['101', 'SENATE                                            ', '7103', 'LEG. SERVICE/MAINTENANCE                          ', 'HISPANIC       ', 'FEMALE         ', 'URF - UNCLASSIFIED REGULAR FULL-TIME    ', '1/2/2019', '0', '41', '3250.72', '39008.64'], ['529', 'HEALTH AND HUMAN SERVICES COMMISSION              ', '1860', 'MANAGEMENT ANALYST I                              ', 'WHITE          ', 'FEMALE         ', 'CRF - CLASSIFIED REGULAR FULL-TIME      ', '6/13/2022', '0', '40', '4625', '55500'], ['104', 'LEGISLATIVE BUDGET BOARD                          ', 'P070    ', 'ANALYST           

Cleaning the Dataset

In [None]:
# Build the cleaned records in a single pass over `data`:
#   * every field has its leading/trailing whitespace removed
#   * a field that is empty after trimming is stored as None (missing value)
cleaned_data = []
for raw_record in data:
    tidy_record = []
    for field in raw_record:
        trimmed = field.strip()
        tidy_record.append(trimmed if trimmed != '' else None)
    cleaned_data.append(tidy_record)

# Show a small sample of the cleaned records
print("Cleaned Data:", cleaned_data[:5])


Cleaned Data: [['537', 'DEPARTMENT OF STATE HEALTH SERVICES', '152', 'ADMINISTRATIVE ASSISTANT II', 'HISPANIC', 'FEMALE', 'CRF - CLASSIFIED REGULAR FULL-TIME', '8/8/2022', '0', '40', '2820.29', '33843.48'], ['101', 'SENATE', '7103', 'LEG. SERVICE/MAINTENANCE', 'HISPANIC', 'FEMALE', 'URF - UNCLASSIFIED REGULAR FULL-TIME', '1/2/2019', '0', '41', '3250.72', '39008.64'], ['529', 'HEALTH AND HUMAN SERVICES COMMISSION', '1860', 'MANAGEMENT ANALYST I', 'WHITE', 'FEMALE', 'CRF - CLASSIFIED REGULAR FULL-TIME', '6/13/2022', '0', '40', '4625', '55500'], ['104', 'LEGISLATIVE BUDGET BOARD', 'P070', 'ANALYST', 'WHITE', 'FEMALE', 'URP - UNCLASSIFIED REGULAR PART-TIME', '8/1/2021', '0', '30', '6000', '72000'], ['101', 'SENATE', '7104', 'LEGISLATIVE PROFESSIONAL', 'WHITE', 'FEMALE', 'URP - UNCLASSIFIED REGULAR PART-TIME', '8/1/2020', '0', '10', '2666.67', '32000.04']]


Type Conversion

In [None]:
# Convert numeric columns to float (in place, on the records in cleaned_data).
#
# Positions are 0-based indexes into the 12-column header:
#   8 = RATE, 9 = HRSWKD, 10 = MONTHLY, 11 = ANNUAL
# The previous list [9, 10, 11, 12] was shifted by one: RATE was never
# converted, and index 12 does not exist, so a warning was printed for every
# single record.
numeric_columns = [8, 9, 10, 11]

# Only the first few problems are printed in full; the rest are counted and
# summarised so that a systematic issue cannot flood the notebook output.
MAX_REPORTED_PROBLEMS = 10
problem_count = 0

for record in cleaned_data:
    for idx in numeric_columns:
        # Check if the index is within the bounds of the list
        if 0 <= idx < len(record):
            try:
                record[idx] = float(record[idx])
                continue
            except (ValueError, TypeError):
                # Non-numeric text (ValueError) or a missing value stored as
                # None (TypeError): the original value is left unchanged.
                message = f"Unable to convert '{record[idx]}' to float in column {idx}."
        else:
            message = f"Index {idx} is out of range for record {record}."

        problem_count += 1
        if problem_count <= MAX_REPORTED_PROBLEMS:
            print(message)

if problem_count > MAX_REPORTED_PROBLEMS:
    print(f"... {problem_count - MAX_REPORTED_PROBLEMS} further conversion problems not shown "
          f"({problem_count} in total).")

# Display first few data records to verify changes
print("First few data records after conversion:")
for record in cleaned_data[:5]:
    print(record)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Index 12 is out of range for record ['696', 'TEXAS DEPARTMENT OF CRIMINAL JUSTICE', '4505', 'CORREC  OFFICER V', 'WHITE', 'FEMALE', 'CRF - CLASSIFIED REGULAR FULL-TIME', '5/22/2014', '0', 40.0, 4278.2, 51338.4].
Index 12 is out of range for record ['529', 'HEALTH AND HUMAN SERVICES COMMISSION', '1575', 'PROGRAM SPECIALIST VI', 'WHITE', 'FEMALE', 'CRF - CLASSIFIED REGULAR FULL-TIME', '5/6/2019', '0', 40.0, 5050.63, 60607.56].
Index 12 is out of range for record ['529', 'HEALTH AND HUMAN SERVICES COMMISSION', '5053', 'REHAB THERAPY TECHNICIAN IV', 'WHITE', 'FEMALE', 'CRF - CLASSIFIED REGULAR FULL-TIME', '9/1/2017', '0', 40.0, 2959.09, 35509.08].
Index 12 is out of range for record ['802', 'PARKS AND WILDLIFE DEPARTMENT', '1552', 'STAFF SRVCS OFFCR III', 'AM INDIAN', 'FEMALE', 'CRF - CLASSIFIED REGULAR FULL-TIME', '8/7/2000', '0', 40.0, 4262.61, 51151.32].
Index 12 is out of range for record ['696', 'TEXAS DEPARTMENT OF CRIM

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Index 12 is out of range for record ['696', 'TEXAS DEPARTMENT OF CRIMINAL JUSTICE', '4540', 'PAROLE OFFCR I', 'BLACK', 'FEMALE', 'CRF - CLASSIFIED REGULAR FULL-TIME', '7/13/2022', '0', 40.0, 3475.35, 41704.2].
Index 12 is out of range for record ['530', 'DEPARTMENT OF FAMILY AND PROTECTIVE SERVICES', '5016', 'FAMILY & PROTECT SCVS SUPR I', 'HISPANIC', 'FEMALE', 'CRF - CLASSIFIED REGULAR FULL-TIME', '5/26/2015', '0', 40.0, 5207.24, 62486.88].
Index 12 is out of range for record ['530', 'DEPARTMENT OF FAMILY AND PROTECTIVE SERVICES', '5505', 'HUMAN SERVICES TECHNICIAN III', 'HISPANIC', 'FEMALE', 'CRF - CLASSIFIED REGULAR FULL-TIME', '6/1/2021', '0', 40.0, 2272.41, 27268.92].
Index 12 is out of range for record ['582', 'TEXAS COMMISSION ON ENVIRONMENTAL QUALITY', '2685', 'NATURAL RESOURCES SPEC IV', 'HISPANIC', 'FEMALE', 'CRF - CLASSIFIED REGULAR FULL-TIME', '1/22/2013', '0', 40.0, 4575.94, 54911.28].
Index 12 is out of range for record ['304', 'COMPTROLLER OF PUBLIC ACCOUNTS', '1281', 'T

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Index 12 is out of range for record ['802', 'PARKS AND WILDLIFE DEPARTMENT', '2684', 'NATURAL RESOURCES SPEC III', 'WHITE', 'MALE', 'CRF - CLASSIFIED REGULAR FULL-TIME', '6/1/2004', '0', 40.0, 5282.67, 63392.04].
Index 12 is out of range for record ['601', 'TEXAS DEPARTMENT OF TRANSPORTATION', '9307', 'TRANS MAINT SPEC III', 'HISPANIC', 'MALE', 'CRF - CLASSIFIED REGULAR FULL-TIME', '10/8/2018', '0', 40.0, 3984.75, 47817.0].
Index 12 is out of range for record ['529', 'HEALTH AND HUMAN SERVICES COMMISSION', '1574', 'PROGRAM SPECIALIST V', 'WHITE', 'MALE', 'CRF - CLASSIFIED REGULAR FULL-TIME', '11/30/2020', '0', 40.0, 4304.78, 51657.36].
Index 12 is out of range for record ['529', 'HEALTH AND HUMAN SERVICES COMMISSION', '256', 'SYSTEMS ANALYST V', 'WHITE', 'MALE', 'CRF - CLASSIFIED REGULAR FULL-TIME', '12/14/2020', '0', 40.0, 8049.99, 96599.88].
Index 12 is out of range for record ['529', 'HEALTH AND HUMAN SERVICES COMMISSI

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)




Index 12 is out of range for record ['529', 'HEALTH AND HUMAN SERVICES COMMISSION', '5624', 'TEXAS WORKS ADVISOR III', 'HISPANIC', 'MALE', 'CRF - CLASSIFIED REGULAR FULL-TIME', '9/1/2004', '0', 40.0, 4094.5, 49134.0].
Index 12 is out of range for record ['530', 'DEPARTMENT OF FAMILY AND PROTECTIVE SERVICES', '1412', 'QUALITY ASSURANCE SPEC III', 'WHITE', 'MALE', 'CRF - CLASSIFIED REGULAR FULL-TIME', '1/1/2002', '0', 40.0, 5278.29, 63339.48].
Index 12 is out of range for record ['809', 'STATE PRESERVATION BOARD', '1621', 'DIRECTOR II', 'OTHER', 'MALE', 'CRF - CLASSIFIED REGULAR FULL-TIME', '3/29/2004', '0', 40.0, 8890.28, 106683.36].
Index 12 is out of range for record ['802', 'PARKS AND WILDLIFE DEPARTMENT', '2686', 'NATURAL RESOURCES SPEC V', 'WHITE', 'MALE', 'CRF - CLASSIFIED REGULAR FULL-TIME', '4/1/2007', '0', 40.0, 6646.74, 79760.88].
Index 12 is out of range for record ['701', 'TEXAS EDUCATION AGENCY', '1606', 'MANAGER VII', 'BLACK', 'MALE', 'CRF - CLASSIFIED REGULAR FULL-TIME',

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Index 12 is out of range for record ['530', 'DEPARTMENT OF FAMILY AND PROTECTIVE SERVICES', '5233', 'VOLUNTEER SERVICES COORD II', 'HISPANIC', 'FEMALE', 'CRF - CLASSIFIED REGULAR FULL-TIME', '3/1/2009', '0', 40.0, 3102.61, 37231.32].
Index 12 is out of range for record ['696', 'TEXAS DEPARTMENT OF CRIMINAL JUSTICE', '4502', 'CORREC OFFCR II', 'WHITE', 'FEMALE', 'CRF - CLASSIFIED REGULAR FULL-TIME', '9/7/2022', '0', 40.0, 3472.82, 41673.84].
Index 12 is out of range for record ['302', 'OFFICE OF THE ATTORNEY GENERAL', '1582', 'PROGRAM SUPERVISOR III', 'HISPANIC', 'FEMALE', 'CRF - CLASSIFIED REGULAR FULL-TIME', '2/1/2005', '0', 40.0, 5192.88, 62314.56].
Index 12 is out of range for record ['696', 'TEXAS DEPARTMENT OF CRIMINAL JUSTICE', '1915', 'INVENTORY AND STORE SPEC V', 'AM INDIAN', 'FEMALE', 'CRF - CLASSIFIED REGULAR FULL-TIME', '6/23/2014', '0', 40.0, 3604.2, 43250.4].
Index 12 is out of range for record ['537', 'DEPARTMENT OF STATE HEALTH SERVICES', '1572', 'PROGRAM SPECIALIST III'

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Index 12 is out of range for record ['696', 'TEXAS DEPARTMENT OF CRIMINAL JUSTICE', '4504', 'CORREC  OFFICER IV', 'HISPANIC', 'FEMALE', 'CRF - CLASSIFIED REGULAR FULL-TIME', '10/9/2017', '0', 40.0, 4002.89, 48034.68].
Index 12 is out of range for record ['320', 'TEXAS WORKFORCE COMMISSION', '5702', 'H/SRVC SPEC III', 'HISPANIC', 'FEMALE', 'CRF - CLASSIFIED REGULAR FULL-TIME', '9/1/2016', '0', 40.0, 3864.37, 46372.44].
Index 12 is out of range for record ['529', 'HEALTH AND HUMAN SERVICES COMMISSION', '1570', 'PROGRAM SPECIALIST I', 'HISPANIC', 'FEMALE', 'CRF - CLASSIFIED REGULAR FULL-TIME', '10/1/2018', '0', 40.0, 3081.33, 36975.96].
Index 12 is out of range for record ['529', 'HEALTH AND HUMAN SERVICES COMMISSION', '1202', 'INTERNAL AUDITOR III', 'HISPANIC', 'FEMALE', 'CRF - CLASSIFIED REGULAR FULL-TIME', '12/9/2013', '0', 40.0, 5794.6, 69535.2].
Index 12 is out of range for record ['302', 'OFFICE OF THE ATTORNEY GENERAL

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Saving the Cleaned Data

In [None]:
import csv

# Write cleaned data to a new CSV file.
#
# csv.writer is used instead of ','.join(...) for two reasons:
#   * fields that contain a comma, quote or newline are quoted, so the file
#     can be read back with the same number of columns per record
#   * None (a missing value) is written as an empty field rather than the
#     literal text "None"
# newline='' lets the csv module control line endings; lineterminator='\n'
# keeps the one-'\n'-per-record layout this cell has always produced.
with open('cleaned_dataset.csv', 'w', newline='') as file:
    writer = csv.writer(file, lineterminator='\n')
    # Write column names
    writer.writerow(column_names)
    # Write data records
    writer.writerows(cleaned_data)

print("Cleaned dataset saved successfully.")


Cleaned dataset saved successfully.
