## Extracting the required entities from the raw data

In [6]:
import pandas as pd
import os

# Field labels whose transcripts are extracted from every annotation file.
values_to_keep = [
    # Employer details
    "employerName",
    "employerAddressStreet_name",
    "employerAddressCity",
    "employerAddressState",
    "employerAddressZip",
    "einEmployerIdentificationNumber",
    # Employee details
    "employeeName",
    "ssnOfEmployee",
    # Numbered boxes
    "box1WagesTipsAndOtherCompensations",
    "box2FederalIncomeTaxWithheld",
    "box3SocialSecurityWages",
    "box4SocialSecurityTaxWithheld",
    "box16StateWagesTips",
    "box17StateIncomeTax",
    # Tax year
    "taxYear",
]

# Where the input TSV files live, and where the per-file Excel workbooks go.
folder_path = r'C:\Users\HP\Downloads\proj\boxes_transcripts_labels'
output_folder_path = r'C:\Users\HP\Downloads\proj\individual'

# Make sure the output folder is present; an existing folder is not an error.
os.makedirs(output_folder_path, exist_ok=True)

# Column layout expected in every annotation file (the files have no header row).
expected_columns = ['start_index', 'end_index', 'x_top_left', 'y_top_left',
                    'x_bottom_right', 'y_bottom_right', 'transcript', 'field']


def _collapse_transcripts(rows, wanted_fields):
    """Merge the transcript fragments of each wanted field into a single string.

    Parameters
    ----------
    rows : pandas.DataFrame
        Frame with at least the 'field' and 'transcript' columns.
    wanted_fields : list of str
        Field labels to keep; this list also fixes the row order of the result.

    Returns
    -------
    pandas.DataFrame
        Columns 'field' and 'transcript', one row per wanted field that occurs
        in `rows`, ordered as in `wanted_fields`. Empty when none occurs.
    """
    kept = rows.loc[rows['field'].isin(wanted_fields), ['field', 'transcript']]
    if kept.empty:
        return kept.reset_index(drop=True)

    # A missing or non-text transcript would make str.join raise TypeError.
    kept = kept.assign(transcript=kept['transcript'].fillna('').astype(str))

    # Fragments are joined and then every space is removed, so multi-word
    # values lose their word breaks (unchanged from the previous behaviour).
    merged = kept.groupby('field', sort=False)['transcript'].agg(
        lambda parts: ' '.join(parts).replace(' ', '').strip()
    )

    # groupby yields first-appearance order; restore the order of wanted_fields.
    present = [name for name in wanted_fields if name in merged.index]
    return merged.reindex(present).reset_index()


# Process every .tsv file in the input folder (sorted for a reproducible order).
for file in sorted(os.listdir(folder_path)):
    if not file.endswith('.tsv'):
        continue

    print(f"Processing file: {file}")  # Debugging statement

    filePath = os.path.join(folder_path, file)

    # NOTE(review): the files carry a .tsv extension but are parsed as
    # comma-separated, exactly as before - confirm this matches the data.
    # Everything is read as text, and empty cells stay '' instead of NaN,
    # so transcripts are never re-typed by pandas.
    try:
        df = pd.read_csv(filePath, sep=',', header=None, dtype=str,
                         keep_default_na=False)
    except (pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
        # One unreadable file should not abort the whole batch.
        print(f"Skipping {file} because it could not be parsed: {exc}")
        continue

    if df.shape[1] != len(expected_columns):
        print(f"Skipping {file} due to unexpected column count: {df.shape[1]}")
        continue

    df.columns = expected_columns

    df_combined = _collapse_transcripts(df, values_to_keep)

    if df_combined.empty:
        print(f"No relevant data to save for {file}.")
        continue

    # One workbook per input file, named after the input file's stem.
    excel_file_path = os.path.join(output_folder_path, f"{os.path.splitext(file)[0]}.xlsx")
    df_combined.to_excel(excel_file_path, index=False)
    print(f"Excel file created: {excel_file_path}")

print("Processing complete.")

# -----------------------------------------------
# PART 2: Combine extracted Excel files into one
# -----------------------------------------------

# Output columns: one per extracted field, taken from the list used in Part 1
# so the two stages cannot drift apart.
columns = list(values_to_keep)

# Folder holding the per-file workbooks; the combined workbook is written here too.
path = output_folder_path
result_filename = 'result.xlsx'

# One dict per input workbook; the DataFrame is built once after the loop
# instead of growing it with pd.concat on every iteration.
records = []

for filename in sorted(os.listdir(path)):
    if not filename.endswith('.xlsx'):
        continue
    # Skip the combined workbook left by an earlier run and Excel's "~$" lock files.
    if filename == result_filename or filename.startswith('~$'):
        continue

    file_path = os.path.join(path, filename)
    df = pd.read_excel(file_path)

    if 'field' not in df.columns or 'transcript' not in df.columns:
        print(f"Skipping {filename} due to missing columns.")
        continue

    # One output row per workbook; fields absent from the workbook stay None.
    new_row = dict.fromkeys(columns)

    for currField, currTranscript in zip(df['field'], df['transcript']):
        # Only text values are trimmed; other cell types are stored unchanged.
        if isinstance(currTranscript, str):
            currTranscript = currTranscript.strip()

        if currField in new_row:
            new_row[currField] = currTranscript

    records.append(new_row)

result_df = pd.DataFrame(records, columns=columns)

# Columns that are converted to numbers.
numeric_columns = [
    "box1WagesTipsAndOtherCompensations", "box2FederalIncomeTaxWithheld",
    "box3SocialSecurityWages", "box4SocialSecurityTaxWithheld",
    "box16StateWagesTips", "box17StateIncomeTax"
]

for col in numeric_columns:
    # Strip dollar signs, commas and spaces. The pattern is a raw string so
    # that "\$" reaches the regex engine as an escaped literal dollar sign.
    result_df[col] = result_df[col].replace({r'\$': '', ',': '', ' ': ''}, regex=True)
    # Anything that still is not a number becomes NaN.
    result_df[col] = pd.to_numeric(result_df[col], errors='coerce')

# Save the combined table.
output_file_path = os.path.join(path, result_filename)
result_df.to_excel(output_file_path, index=False)

print(f"Processing complete. File saved to {output_file_path}.")


Processing file: number1.tsv
Excel file created: C:\Users\HP\Downloads\proj\individual\number1.xlsx
Processing file: number10.tsv
Excel file created: C:\Users\HP\Downloads\proj\individual\number10.xlsx
Processing file: number100.tsv
Excel file created: C:\Users\HP\Downloads\proj\individual\number100.xlsx
Processing file: number101.tsv
Excel file created: C:\Users\HP\Downloads\proj\individual\number101.xlsx
Processing file: number102.tsv
Excel file created: C:\Users\HP\Downloads\proj\individual\number102.xlsx
Processing file: number103.tsv
Excel file created: C:\Users\HP\Downloads\proj\individual\number103.xlsx
Processing file: number104.tsv
Excel file created: C:\Users\HP\Downloads\proj\individual\number104.xlsx
Processing file: number105.tsv
Excel file created: C:\Users\HP\Downloads\proj\individual\number105.xlsx
Processing file: number106.tsv
Excel file created: C:\Users\HP\Downloads\proj\individual\number106.xlsx
Processing file: number107.tsv
Excel file created: C:\Users\HP\Downloa

In [7]:
import pandas as pd

In [11]:
# Load the combined workbook back from disk and display it.
result_path = r'C:\Users\HP\Downloads\proj\individual\result.xlsx'
df = pd.read_excel(result_path)
df

Unnamed: 0,employerName,employerAddressStreet_name,employerAddressCity,employerAddressState,employerAddressZip,einEmployerIdentificationNumber,employeeName,ssnOfEmployee,box1WagesTipsAndOtherCompensations,box2FederalIncomeTaxWithheld,box3SocialSecurityWages,box4SocialSecurityTaxWithheld,box16StateWagesTips,box17StateIncomeTax,taxYear
0,Collins-SaundersandSons,9652ScottGrovesApt.116,Rodriguezmouth,NE,70838-1080,37-3493491,StephanieDawson,720-74-9502,41669.07,11182.930000,53826.13,4117.70,2.028785e+04,1690.44,2019.0
1,CabreraGroupLtd,5553ThomasPassageApt.705,WestTheodore,Co,56221-5487,50-4141334,MelissaAnderson,,214731.04,47393.740000,211281.72,16163.05,1.210590e+09,,2018.0
2,George-HortonandSons,55527ShannonCorner,Alisonfort,LA,24369-8354,43-1414035,RodneyPerez,475-90-1539,240614.35,79673.630000,266460.93,20384.26,1.208543e+05,8990.12,2019.0
3,Munoz-JohnsonLtd,286SmithCircle,NorthJuliaview,VA,85011-4757,71-8393024,GaryFisher,638-62-9924,241992.09,30996.510000,270761.69,20713.27,1.224533e+08,13763.03,2019.0
4,"Clark,BrownandRiveraLLC",9446SamanthaSprings,Williamsstad,NC,25420-6106,90-9375060,NicholeSutton,475-13-3309,175675.50,38786.940000,124567.41,9529.41,9.300751e+04,4515.21,2019.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,WalkerGroupPLC,88429AnnetteFields,SouthJoshuafort,PA,34492-5445,42-1551597,MeredithLee,,86860.24,10064.600000,64063.69,4900.87,4.756493e+04,2880.29,2018.0
596,Decker-SmithPLC,784PriceBrookApt.000,Marksstad,ME,92936-1367,30-9755097,Mrs.Samantha,numDog-2077,19087958.00,0.537333,,,8.378236e+06,,2018.0
597,,35657JamesCourts,WestJasmine,NM,04093-1257,68-7884298,SaraDean,708-85-6052,62278.33,,,,,-283239.00,2018.0
598,"Tyler,NelsonandBenderInc",5654ParkerView,NorthSara,AZ,50949-7371,98-3866411,TiffanySimmons,0,110503.15,15540.510000,82512.35,6312.19,5.617204e+04,5554.06,


In [12]:
# Show the dtype pandas inferred for each column of the loaded frame.
column_dtypes = df.dtypes
print(column_dtypes)


employerName                           object
employerAddressStreet_name             object
employerAddressCity                    object
employerAddressState                   object
employerAddressZip                     object
einEmployerIdentificationNumber        object
employeeName                           object
ssnOfEmployee                          object
box1WagesTipsAndOtherCompensations    float64
box2FederalIncomeTaxWithheld          float64
box3SocialSecurityWages               float64
box4SocialSecurityTaxWithheld         float64
box16StateWagesTips                   float64
box17StateIncomeTax                   float64
taxYear                               float64
dtype: object
