# Data Collection

In [None]:
!pip install tabula-py


In [None]:
!pip freeze

In [2]:
# prompt: To convert PDF files to CSV files in Python, you can use the tabula-py library for extracting tables from PDFs and the pandas library for working with dataframes.

import tabula
import pandas as pd

# Path of the single PDF to convert (hard-coded for this run)
pdf_file = '/content/1.pdf'

# Ask tabula to extract tables from every page of the PDF
# NOTE(review): presumably this returns a list of DataFrames, one per
# detected table - confirm against the installed tabula-py version
tables = tabula.read_pdf(pdf_file, pages='all')

# Stack the extracted tables into one DataFrame
# NOTE(review): pd.concat raises ValueError when given an empty list, so a
# PDF in which no tables are detected would fail on this line
df = pd.concat(tables)

# Write the result to 'output.csv' (relative path), omitting the row index
df.to_csv('output.csv', index=False)


In [None]:
import os
import pandas as pd
from tabula import read_pdf

def convert_pdf_to_csv(pdf_path, csv_path):
    """Extract the tables of one PDF and write them to a single CSV file.

    Args:
        pdf_path: Path of the PDF file to read.
        csv_path: Path of the CSV file to write.

    Returns:
        None. When no tables are found in the PDF, no CSV is written and a
        "Skipped" message is printed instead of raising.
    """
    # Use tabula to extract tables from all pages of the PDF
    tables = read_pdf(pdf_path, pages='all', multiple_tables=True)

    # pd.concat raises ValueError on an empty list; skip such PDFs so that a
    # single table-less file does not abort a whole batch conversion
    if not tables:
        print(f"Skipped: {pdf_path} (no tables found)")
        return

    # All tables go into one frame; renumber the rows so labels are unique
    # (the index is not written to the CSV either way)
    combined_df = pd.concat(tables, ignore_index=True)

    # Save the combined dataframe to a CSV file
    combined_df.to_csv(csv_path, index=False)
    print(f"Converted: {pdf_path} to {csv_path}")

def convert_all_pdfs_to_csv(folder_path):
    """Convert each ``.pdf`` file directly inside *folder_path* to a CSV.

    Every CSV is written into the same folder as its PDF, using the PDF's
    base name with a ``.csv`` extension.
    """
    for entry in os.listdir(folder_path):
        # Anything not named *.pdf is left alone
        if not entry.endswith('.pdf'):
            continue

        # Build the source and destination paths from the entry name
        stem, _ = os.path.splitext(entry)
        source_pdf = os.path.join(folder_path, entry)
        target_csv = os.path.join(folder_path, stem + '.csv')

        convert_pdf_to_csv(source_pdf, target_csv)

# Folder whose PDF files will be converted (hard-coded for this run)
folder_path = '/content/sample_data/PDF'

# Convert every '.pdf' file in that folder; each CSV is written into the
# same folder as its source PDF
convert_all_pdfs_to_csv(folder_path)


In [None]:
# prompt: To concatenate all CSV files into a single CSV file in Python

import pandas as pd
import os

# Specify the folder path containing the CSV files
folder_path = '/content/sample_data/CSV'

# Keep only the CSV files (anything else in the folder would be mis-parsed
# by read_csv), in sorted order because os.listdir order is arbitrary and
# would otherwise make the row order of the result vary between runs
files = sorted(name for name in os.listdir(folder_path) if name.endswith('.csv'))

# Read each CSV file exactly once
frames = [pd.read_csv(os.path.join(folder_path, name)) for name in files]

# Concatenate in a single call instead of growing a DataFrame inside the
# loop (which re-copies all accumulated rows on every iteration).
# pd.concat rejects an empty list, so fall back to an empty DataFrame.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Save the concatenated DataFrame to a new CSV file
df.to_csv('output.csv', index=False)


In [9]:
import os
import pandas as pd

def concatenate_csv_files(folder_path, output_csv):
    """Combine every ``.csv`` file in *folder_path* into one CSV file.

    Files are read in sorted filename order so the row order of the result
    is reproducible. If *output_csv* itself lives inside *folder_path* it is
    not read as an input, so re-running does not duplicate rows.

    Args:
        folder_path: Directory whose ``.csv`` files are read.
        output_csv: Path of the combined CSV file to write.

    Returns:
        None. The combined data is written to *output_csv* (an empty frame
        is written when there is nothing to combine).
    """
    output_abs = os.path.abspath(output_csv)

    frames = []
    # os.listdir returns names in arbitrary order; sort for stable output
    for file in sorted(os.listdir(folder_path)):
        if not file.endswith('.csv'):
            continue

        csv_path = os.path.join(folder_path, file)

        # A previous run's output must not be fed back in as an input
        if os.path.abspath(csv_path) == output_abs:
            continue

        frames.append(pd.read_csv(csv_path))

    # Concatenate once rather than growing a DataFrame per file; pd.concat
    # rejects an empty list, so fall back to an empty DataFrame
    combined_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    # Save the combined DataFrame to a single CSV file
    combined_df.to_csv(output_csv, index=False)
    print(f"Combined all CSV files into: {output_csv}")

# Folder whose CSV files will be combined (hard-coded for this run)
folder_path = '/content/sample_data/CSV'

# Combined output file; a relative path, so it is resolved against the
# current working directory
output_csv = 'output_combined.csv'

# Combine every '.csv' file in the folder into the single output file
concatenate_csv_files(folder_path, output_csv)


Combined all CSV files into: output_combined.csv


In [10]:
import os
import pandas as pd

def add_table_heading(csv_path):
    """Ensure a CSV file has the ``sr_no``, ``College_name`` and ``cutoff`` columns.

    Any of these columns that are absent are appended after the existing
    columns, filled with NaN, and the file is rewritten in place. A file
    that already has all three columns is left untouched.

    Args:
        csv_path: Path of the CSV file to check and, if needed, update.

    Returns:
        None.
    """
    # Read the CSV file into a DataFrame
    df = pd.read_csv(csv_path)

    required_columns = ["sr_no", "College_name", "cutoff"]

    # Walk the required list (not a set difference) so that missing columns
    # are always appended in the same order on every run
    missing_columns = [name for name in required_columns if name not in df.columns]

    # Nothing to add: avoid a needless rewrite of the input file
    if not missing_columns:
        return

    # Add missing columns with NaN values
    for column in missing_columns:
        df[column] = float('nan')

    # Save the updated DataFrame back to the CSV file
    df.to_csv(csv_path, index=False)

def concatenate_csv_files(folder_path, output_csv):
    """Normalise and combine every ``.csv`` file in *folder_path*.

    Each input file is first passed to ``add_table_heading`` (which may
    rewrite it in place) and is then read and appended to the combined
    result. Files are processed in sorted filename order so the row order
    of the result is reproducible. If *output_csv* itself lives inside
    *folder_path* it is neither modified nor read as an input.

    Args:
        folder_path: Directory whose ``.csv`` files are processed.
        output_csv: Path of the combined CSV file to write.

    Returns:
        None. The combined data is written to *output_csv* (an empty frame
        is written when there is nothing to combine).
    """
    output_abs = os.path.abspath(output_csv)

    frames = []
    # os.listdir returns names in arbitrary order; sort for stable output
    for file in sorted(os.listdir(folder_path)):
        if not file.endswith('.csv'):
            continue

        csv_path = os.path.join(folder_path, file)

        # A previous run's output must not be altered or fed back in
        if os.path.abspath(csv_path) == output_abs:
            continue

        # Add table heading/features to each CSV file before reading it
        add_table_heading(csv_path)
        frames.append(pd.read_csv(csv_path))

    # Concatenate once rather than growing a DataFrame per file; pd.concat
    # rejects an empty list, so fall back to an empty DataFrame
    combined_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    # Save the combined DataFrame to a single CSV file
    combined_df.to_csv(output_csv, index=False)
    print(f"Combined all CSV files into: {output_csv}")

# Folder whose CSV files will be normalised and combined (hard-coded for
# this run)
folder_path = '/content/sample_data/CSV'

# Combined output file; a relative path, so it is resolved against the
# current working directory
output_csv = 'output_combined.csv'

# Add any missing heading columns to each CSV in the folder, then combine
# them all into the single output file
concatenate_csv_files(folder_path, output_csv)


Combined all CSV files into: output_combined.csv
