In [3]:
# CODE TO CONSOLIDATE ALL EXTRACTED PRIDS INTO ONE .csv FILE

import pandas as pd
import os

# Directory containing the per-batch CSV files to consolidate
directory = '/home/safi/sanjay/PIB/scraped_prids'

# List all CSV files in the directory.
# os.listdir() returns entries in arbitrary order; sort them so that the
# 'first' aggregation in the pivot below picks the same value on every run
# when a (Main_PRID, Extracted_Language) pair occurs in more than one file.
csv_files = sorted(file for file in os.listdir(directory) if file.endswith('.csv'))

# Read every parseable CSV file into its own DataFrame
dfs = []
for file in csv_files:
    file_path = os.path.join(directory, file)

    # Zero-byte files cannot be parsed at all, so skip them up front
    if os.path.getsize(file_path) == 0:
        print(f"Skipped empty file: {file_path}")
        continue

    try:
        df = pd.read_csv(file_path)
    except pd.errors.EmptyDataError:
        # Non-zero size but nothing to parse (e.g. whitespace only)
        print(f"Skipped empty file: {file_path}")
        continue

    dfs.append(df)

if dfs:
    # Stack all files into a single long table
    merged_df = pd.concat(dfs, ignore_index=True)

    # One row per 'Main_PRID', one column per 'Extracted_Language'; when a pair
    # occurs more than once the first 'Extracted_PRID' seen is kept.
    # reset_index() turns 'Main_PRID' back into a regular column.
    pivot_table = merged_df.pivot_table(
        index='Main_PRID',
        columns='Extracted_Language',
        values='Extracted_PRID',
        aggfunc='first',
    ).reset_index()

    # Save the pivoted data to a new CSV file, creating the target directory
    # if needed so the work above is not lost on a missing folder
    output_file = '/home/safi/sanjay/PIB/mapper/combined_scraped_prids.csv'
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    pivot_table.to_csv(output_file, index=False)

    print(f"Pivoted data saved to {output_file}")
else:
    print("No data to process.")

Pivoted data saved to /home/safi/sanjay/PIB/mapping/combined_scraped_prids.csv


In [8]:
# CODE TO MAP THE PRIDS WITH THE FILE PATHS

import pandas as pd

# Read the CSV files
combined_pib_data_path = '/home/safi/sanjay/PIB/mapper/combined_scraped_prids.csv'
path_prid_path = '/home/safi/sanjay/PIB/scraper/path_prid.csv'

combined_pib_data = pd.read_csv(combined_pib_data_path)
path_prid = pd.read_csv(path_prid_path)

# Attach the main file path to every row by matching 'Main_PRID' against 'PRID'
merged_data = pd.merge(combined_pib_data, path_prid, left_on='Main_PRID', right_on='PRID', how='left')

# Language columns are selected by name rather than by position. The pivoted
# CSV written by the consolidation cell has the layout
# 'Main_PRID', <language 1>, <language 2>, ... so a positional slice starting
# at 2 silently drops the first language. Any 'Unnamed: N' column (a saved
# index) is ignored as well.
language_columns = [
    column
    for column in combined_pib_data.columns
    if column != 'Main_PRID' and not str(column).startswith('Unnamed:')
]

# PRID -> file path lookup built once (first occurrence wins, matching the
# "take the first matching path" rule) instead of scanning the whole
# path_prid frame for every language cell.
prid_to_file_path = (
    path_prid.dropna(subset=['PRID'])
    .drop_duplicates(subset='PRID', keep='first')
    .set_index('PRID')['File Path']
    .to_dict()
)

# One mapping record per (main PRID, language PRID) pair that has a known file path
mappings = []
for index, row in merged_data.iterrows():
    prid = row['Main_PRID']
    main_file_path = row['File Path']

    for lang in language_columns:
        lang_prid = row[lang]

        # Skip languages with no PRID for this release
        if pd.isnull(lang_prid):
            continue

        # Skip language PRIDs that have no file path on record
        if lang_prid not in prid_to_file_path:
            continue

        mappings.append({
            'Main_PRID': prid,
            'Main_File_Path': main_file_path,
            'Language': lang,
            'Language_PRID': lang_prid,
            'Language_File_Path': prid_to_file_path[lang_prid],
        })

# Convert the mappings list to a pandas DataFrame; the explicit column list
# keeps the CSV header intact even when no mapping was found
mappings_df = pd.DataFrame(
    mappings,
    columns=['Main_PRID', 'Main_File_Path', 'Language', 'Language_PRID', 'Language_File_Path'],
)

# Write the mappings to a new CSV file
output_csv_path = '/home/safi/sanjay/PIB/mapper/initial_map.csv'
mappings_df.to_csv(output_csv_path, index=False)

print(f"CSV data saved to {output_csv_path}")

CSV data saved to /home/safi/sanjay/PIB/mapping/initial_map.csv


In [None]:
# Run the .py version of this cell for bigger ranges
# FINAL CODE TO EXTRACT TEXT FROM THE FILE PATH AND STORE IN SEPARATE JSON

import pandas as pd
import json
import os
import zipfile
import shutil

# Function to extract text from JSON file
def extract_text(file_path):
    """Return the value of the 'text' key from the JSON file at ``file_path``.

    Parameters
    ----------
    file_path : str or NaN/None
        Path to a JSON file whose top-level value is an object. Missing values
        (None / NaN, as produced by pandas for empty cells) are accepted.

    Returns
    -------
    The value stored under 'text', or '' when the path is missing, the file
    does not exist, or the object has no 'text' key.

    Raises
    ------
    json.JSONDecodeError
        If the file exists but does not contain valid JSON.
    """
    # Missing cell in the CSV -> nothing to read
    if pd.isnull(file_path):
        return ''

    path = str(file_path)
    if not os.path.exists(path):
        return ''

    # Read explicitly as UTF-8: the files hold multilingual text and must not
    # depend on the platform's default encoding.
    with open(path, 'r', encoding='utf-8') as json_file:
        content = json.load(json_file)

    return content.get('text', '')

# Input mapping CSV and output locations
csv_file_path = '/home/mtech_22/sanjay/PIB/initial_map_modified.csv'
output_directory = '/home/mtech_22/sanjay/pib_final'  # NOTE(review): not used anywhere in this cell
zip_file_name = '/home/mtech_22/sanjay/pib_final.zip'

# Read the entire CSV file
data = pd.read_csv(csv_file_path)

# Staging directory for the per-PRID JSON files. It is emptied first so that
# files left behind by an earlier, aborted run cannot end up in the new archive
# (the directory is deleted at the end of this cell anyway).
temp_directory = '/home/mtech_22/sanjay/temp_json'
shutil.rmtree(temp_directory, ignore_errors=True)
os.makedirs(temp_directory, exist_ok=True)


def _json_default(value):
    """Convert NumPy scalars (e.g. int64 PRIDs) to built-in types for json.dump.

    json only calls this hook for objects it cannot serialise itself, so
    values that already serialise are written exactly as before.
    """
    if hasattr(value, 'item'):
        return value.item()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


# Group data by 'Main_PRID' column
grouped_data = data.groupby('Main_PRID')

# Create separate JSON files for each 'Main_PRID' containing all language texts
for group_name, group_df in grouped_data:
    # First row for each distinct Language_PRID, in order of appearance.
    # Rows without a Language_PRID are dropped: NaN never compares equal to
    # itself, so looking such a row up by value would find nothing.
    language_rows = (
        group_df.dropna(subset=['Language_PRID'])
        .drop_duplicates(subset='Language_PRID', keep='first')
    )

    json_data = {
        'Main_PRID': group_name,
        # Left empty here
        'Main_PRID_Language': '',
        # Text of the main release, read from the first row's main file path
        'Main_PRID_Text': extract_text(group_df['Main_File_Path'].iloc[0]),
        'Languages': [
            {
                'Language': language_row['Language'],
                'Language_PRID': language_row['Language_PRID'],
                'Text': extract_text(language_row['Language_File_Path']),
            }
            for _, language_row in language_rows.iterrows()
        ],
    }

    # Write the JSON data to a separate file for the Main_PRID in the staging directory
    output_file_path = os.path.join(temp_directory, f"data_{group_name}.json")
    with open(output_file_path, 'w') as json_file:
        json.dump(json_data, json_file, indent=4, default=_json_default)

# Create a ZIP file containing all JSON files, stored relative to the staging directory
with zipfile.ZipFile(zip_file_name, 'w') as zipf:
    for foldername, subfolders, filenames in os.walk(temp_directory):
        for filename in filenames:
            file_path = os.path.join(foldername, filename)
            zipf.write(file_path, os.path.relpath(file_path, temp_directory))

# Remove the staging directory and its contents
shutil.rmtree(temp_directory)

In [None]:
# CODE TO UNZIP THE FILE FOR "langdetect"
import zipfile
import os

def unzip_file(zip_path, extract_path):
    """Extract every member of the ZIP archive at ``zip_path`` into ``extract_path``."""
    archive = zipfile.ZipFile(zip_path, mode='r')
    try:
        archive.extractall(path=extract_path)
    finally:
        # Always release the archive handle, even if extraction fails
        archive.close()

if __name__ == "__main__":
    # Archive to unpack and the folder that receives its contents.
    # NOTE(review): the archive-building cell writes 'pib_final.zip'; confirm
    # that 'finally.zip' is the intended input here.
    zip_path = "/home/mtech_22/sanjay/finally.zip"
    extract_path = "/home/mtech_22/sanjay/unzipped_files"

    # Make sure the destination folder exists before extracting into it
    os.makedirs(extract_path, exist_ok=True)

    unzip_file(zip_path=zip_path, extract_path=extract_path)

In [None]:
# USING "langdetect" TO GET THE MAIN_PRID LANGUAGE AND UPDATE IN THE SAME .json FILE

import os
import json
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

def process_json_file(json_data):
    """Detect the language of ``Main_PRID_Text`` and record it in ``json_data``.

    The dictionary is modified in place and only when it already has a
    "Main_PRID_Language" key; nothing is returned.
    """
    detected = detect_language(json_data.get("Main_PRID_Text", ""))

    # Leave documents without the language slot untouched
    if "Main_PRID_Language" not in json_data:
        return

    json_data["Main_PRID_Language"] = detected

def detect_language(text):
    """Return the language name detected for ``text``.

    Parameters
    ----------
    text : str
        Text to classify. Non-string values (e.g. a JSON null) and blank
        strings are accepted and reported as "Unknown".

    Returns
    -------
    str
        The name produced by ``lang_code_to_name`` for the detected code, or
        "Unknown" when there is nothing to detect or langdetect raises
        ``LangDetectException``.
    """
    # Nothing to analyse: avoids a TypeError inside langdetect for non-string
    # input and skips a detection call that can only fail for blank text
    if not isinstance(text, str) or not text.strip():
        return "Unknown"

    # langdetect is non-deterministic by default; fixing the seed makes
    # repeated runs over the same files produce the same language.
    from langdetect import DetectorFactory
    DetectorFactory.seed = 0

    try:
        # Detect the language code, then map it to a readable name
        return lang_code_to_name(detect(text))
    except LangDetectException:
        # Raised e.g. when the text has no usable features
        return "Unknown"

def lang_code_to_name(code):
    """Translate a language code into its English name.

    Codes outside the table below yield "Unknown".
    """
    # Language codes handled by this notebook and their display names
    names_by_code = dict(
        en="English",
        bn="Bengali",
        gu="Gujarati",
        hi="Hindi",
        kn="Kannada",
        ml="Malayalam",
        mr="Marathi",
        ne="Nepali",
        pa="Punjabi",
        ta="Tamil",
        te="Telugu",
        ur="Urdu",
    )

    if code in names_by_code:
        return names_by_code[code]
    return "Unknown"


def process_directory(directory_path):
    """Run language detection on every ``.json`` file directly inside ``directory_path``.

    Each file is loaded, passed to ``process_json_file`` (which updates the
    dictionary in place) and written back as UTF-8 with non-ASCII characters
    preserved. Files that do not contain valid JSON are reported and skipped.

    The updated document is written to a sibling ``.tmp`` file and then moved
    over the original, so a failure while serialising never leaves a
    truncated JSON file behind.
    """
    for filename in os.listdir(directory_path):
        if not filename.endswith(".json"):
            continue

        file_path = os.path.join(directory_path, filename)

        # Read and close the file before it is rewritten
        try:
            with open(file_path, 'r', encoding='utf-8') as json_file:
                json_data = json.load(json_file)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON in {file_path}: {e}")
            continue

        process_json_file(json_data)

        # Save the modified JSON: write to a temporary file first, then
        # replace the original in one step
        temp_path = file_path + ".tmp"
        try:
            with open(temp_path, 'w', encoding='utf-8') as updated_json_file:
                json.dump(json_data, updated_json_file, indent=4, ensure_ascii=False)
            os.replace(temp_path, file_path)
        finally:
            # Only still present when the write or the replace failed
            if os.path.exists(temp_path):
                os.remove(temp_path)

if __name__ == "__main__":
    # Folder holding the unzipped JSON files to annotate
    directory_path = "/home/mtech_22/sanjay/unzipped_files"

    process_directory(directory_path=directory_path)
    print("Language detection and modification completed.")