In [6]:
import os
import pandas as pd
import numpy as np
import folium

In [17]:
import os

def count_files_in_folder(folder_path):
    if not os.path.isdir(folder_path):
        print("Invalid folder path.")
        return
    
    file_count = 0
    for _, _, files in os.walk(folder_path):
        file_count += len(files)
    
    print(f"Number of files in {folder_path}: {file_count}")

# Example usage:
folder_path = '/Users/gayathri/Desktop/Format_0'
count_files_in_folder(folder_path)


Number of files in /Users/gayathri/Desktop/Format_0: 119


In [18]:
#This is to generate sub_bart_label files for each domain
def process_csv(csv_file, output_folder):
    data = pd.read_csv(csv_file)
    grouped = data.groupby('bart_label')
    for bart_label, group in grouped:
        filename = os.path.join(output_folder, f'{os.path.splitext(os.path.basename(csv_file))[0]}_{bart_label.lower()}.csv')  
        group.to_csv(filename, index=False)

folder_path = '/Users/gayathri/Desktop/Format_0'
output_folder = '/Users/gayathri/Desktop/Format_1'
os.makedirs(output_folder, exist_ok=True)

# Iterate through each file in the folder
for file_name in os.listdir(folder_path):
    if file_name.endswith(".csv"):
        csv_file = os.path.join(folder_path, file_name)
        process_csv(csv_file, output_folder)

In [None]:
###This is to put all the files of same domain to one folder###
import os
import shutil

def organize_csv_files(folder_path):
    # Create a dictionary to store folder paths for each prefix
    folder_paths = {}
    
    for file_name in os.listdir(folder_path):
        if file_name.endswith(".csv"):
            prefix = file_name.split('_')[0]
            
            if prefix not in folder_paths:
                folder_paths[prefix] = os.path.join(folder_path, prefix)
                folder_path_suffix = 0
                while os.path.exists(folder_paths[prefix]):
                    folder_path_suffix += 1
                    folder_paths[prefix] = os.path.join(folder_path, f"{prefix}_{folder_path_suffix}")
                os.makedirs(folder_paths[prefix], exist_ok=True)
            
            src_file = os.path.join(folder_path, file_name)
            dst_file = os.path.join(folder_paths[prefix], file_name)
            shutil.move(src_file, dst_file)

folder_path = '/Users/gayathri/Desktop/Format_1'

organize_csv_files(folder_path)

In [22]:
##This step also acts like a verification
## 1. To add all data files accordingly to each sub-domain folder, this also ensures that there are sub-folders existing for each of the domains.

import os
import shutil

def copy_files_to_matching_subfolder(folder_1, folder_2):
    for filename in os.listdir(folder_1):
        if filename.endswith(".csv"):
            csv_file_path = os.path.join(folder_1, filename)
            subfolder_name = os.path.splitext(filename)[0]  
            subfolder_path = os.path.join(folder_2, subfolder_name)

            # Check if the subfolder exists in folder_2
            if os.path.isdir(subfolder_path):
                new_filename = os.path.splitext(filename)[0] + "_all.csv"
                destination_path = os.path.join(subfolder_path, new_filename)
                shutil.copyfile(csv_file_path, destination_path)
                print(f"File {filename} copied to {destination_path}")
            else:
                print(f"No matching subfolder found for {filename}")

folder_1 = "/Users/gayathri/Desktop/Format_0"
folder_2 = "/Users/gayathri/Desktop/Format_1"
copy_files_to_matching_subfolder(folder_1, folder_2)


File 920thejersey.csv copied to /Users/gayathri/Desktop/Format_1/920thejersey/920thejersey_all.csv
File thepakistaninewspaper.csv copied to /Users/gayathri/Desktop/Format_1/thepakistaninewspaper/thepakistaninewspaper_all.csv
File mcccvoice.csv copied to /Users/gayathri/Desktop/Format_1/mcccvoice/mcccvoice_all.csv
File stocktonargo.csv copied to /Users/gayathri/Desktop/Format_1/stocktonargo/stocktonargo_all.csv
File koreadailyus.csv copied to /Users/gayathri/Desktop/Format_1/koreadailyus/koreadailyus_all.csv
No matching subfolder found for 987thecoast.csv
File wurdradio.csv copied to /Users/gayathri/Desktop/Format_1/wurdradio/wurdradio_all.csv
File wqxr.csv copied to /Users/gayathri/Desktop/Format_1/wqxr/wqxr_all.csv
File civicstory.csv copied to /Users/gayathri/Desktop/Format_1/civicstory/civicstory_all.csv
File jerseyvoices.csv copied to /Users/gayathri/Desktop/Format_1/jerseyvoices/jerseyvoices_all.csv
File wpst.csv copied to /Users/gayathri/Desktop/Format_1/wpst/wpst_all.csv
File ho

In [None]:
## 987thecoast has no data in it and hence no folder formed

In [26]:
### Let us count the number of different sub-bart files present for each domain

import os
import pandas as pd

def get_unique_bart_labels(folder_path):
    data = []
    for root, dirs, files in os.walk(folder_path):
        unique_labels = set()
        domain_name = os.path.basename(root)

        for file_name in files:
            if file_name.endswith(".csv"):
                bart_label = os.path.splitext(file_name)[0].replace(domain_name + "_", "")
                unique_labels.add(bart_label)

        data.append({'domain_name': domain_name, 'unique_bart_labels': len(unique_labels), 'bart_labels': ', '.join(unique_labels)})

    return data

def write_bart_labels_to_csv(folder_path, output_csv):
    data = get_unique_bart_labels(folder_path)
    df = pd.DataFrame(data)
    df.to_csv(output_csv, index=False)

folder_path = '/Users/gayathri/Desktop/Format_1'  
output_csv = '/Users/gayathri/Desktop/domain_label_details.csv'  
write_bart_labels_to_csv(folder_path, output_csv)

In [31]:
import os
import pandas as pd
import numpy as np
import folium

def process_csv_file(csv_file, color):


        for index, row in df.iterrows():
            location = row["gpe"]
            latitude = row["gpe_latitude"]
            longitude = row["gpe_longitude"]
            count = row["gpe_sum"]
            log_count = row["gpe_sum_log"]
            bart_label = row["bart_label"]
            popup_text = f"Location: {location}\nCount: {count}\nLabel: {bart_label}"
            marker_size = get_marker_size(log_count)
            folium.CircleMarker(
                [latitude, longitude],
                popup=popup_text,
                radius=marker_size,
                color=color,
                fill=True,
                fill_color=color
            ).add_to(mymap)

        folder_path = os.path.dirname(csv_file)
        filename = os.path.splitext(os.path.basename(csv_file))[0] + ".html"
        output_file = os.path.join(folder_path, filename)
        mymap.save(output_file)

def process_subfolders(folder_path, color_palette):
    color_index = 0
    for root, dirs, files in os.walk(folder_path):
        for dir_name in dirs:
            subfolder_path = os.path.join(root, dir_name)
            for filename in os.listdir(subfolder_path):
                if filename.endswith(".csv"):
                    csv_file = os.path.join(subfolder_path, filename)
                    color = color_palette[color_index % len(color_palette)]
                    process_csv_file(csv_file, color)
                    color_index += 1

# Example usage:
folder_path = "/Users/gayathri/Desktop/Format_1"
color_palette = ['#e6194b', '#3cb44b', '#194fff', '#873ec7', '#de6109', 
                 '#911eb4', '#f7890a', '#f032e6', '#ce7e00', '#6a329f']

process_subfolders(folder_path, color_palette)

  return 5 + (log_count / max_count) * scale_factor
  return 5 + (log_count / max_count) * scale_factor
  return 5 + (log_count / max_count) * scale_factor
  return 5 + (log_count / max_count) * scale_factor
  return 5 + (log_count / max_count) * scale_factor
  return 5 + (log_count / max_count) * scale_factor
  return 5 + (log_count / max_count) * scale_factor
  return 5 + (log_count / max_count) * scale_factor
  return 5 + (log_count / max_count) * scale_factor
  return 5 + (log_count / max_count) * scale_factor
  return 5 + (log_count / max_count) * scale_factor
  return 5 + (log_count / max_count) * scale_factor
  return 5 + (log_count / max_count) * scale_factor
  return 5 + (log_count / max_count) * scale_factor
  return 5 + (log_count / max_count) * scale_factor
  return 5 + (log_count / max_count) * scale_factor
  return 5 + (log_count / max_count) * scale_factor
  return 5 + (log_count / max_count) * scale_factor
  return 5 + (log_count / max_count) * scale_factor
  return 5 +

In [32]:
import os
import pandas as pd
import numpy as np
import folium

def generate_map(csv_file):
    df = pd.read_csv(csv_file)
    # Filter out rows where 'gpe' is in the specified exclusion list
    exclusion_list = ['New Jersey', 'NJ', 'The Garden State', 'Nj', 'New Jerseyans',' US','U.S', 'U.S.', 'USA', 'U.S.A', 'United States', 'United States of America', 'The United States']
    df = df[~df['gpe'].isin(exclusion_list)]
    
    # Proceed if df is not empty after filtering
    if not df.empty:
        df['gpe_sum_log'] = np.log(df['gpe_sum']) / np.log(np.e)  
        map_zoom = 10
        max_occurrences_row = df.loc[df['gpe_sum'].idxmax()]
        highest_occurrences_latitude = max_occurrences_row['gpe_latitude']
        highest_occurrences_longitude = max_occurrences_row['gpe_longitude']
        highest_occurrence = [highest_occurrences_latitude, highest_occurrences_longitude]
        mymap = folium.Map(location=highest_occurrence, zoom_start=map_zoom)
        max_count = df["gpe_sum_log"].max()

        def get_marker_size(log_count):
            scale_factor = 10  
            return 5 + (log_count / max_count) * scale_factor

        colors = ['#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4', '#46f0f0', '#f032e6', '#bcf60c', '#fabebe', 
                  '#008080', '#e6beff', '#9a6324', '#fffac8', '#800000', '#aaffc3', '#808000', '#ffd8b1', '#000075', '#808080']

        bart_label_color_map = dict(zip(df['bart_label'].unique(), colors))

        for index, row in df.iterrows():
            location = row["gpe"]
            latitude = row["gpe_latitude"]
            longitude = row["gpe_longitude"]
            count = row["gpe_sum"]
            log_count = row["gpe_sum_log"]
            bart_label = row["bart_label"]
            popup_text = f"Location: {location}\nCount: {count}\nLabel: {bart_label}"
            marker_size = get_marker_size(log_count)
            color = bart_label_color_map.get(bart_label, "#FF0000")  
            folium.CircleMarker(
                [latitude, longitude],
                popup=popup_text,
                radius=marker_size,
                color=color,
                fill=True,
                fill_color=color
            ).add_to(mymap)

        legend_html = """
             <div style="position: fixed; 
                    bottom: 50px; left: 50px; width: 200px; min-height: 150px; 
                    border:2px solid grey; z-index:9999; font-size:14px;
                    background-color: white; padding: 5px;
                        ">
             &nbsp; <strong>Legend</strong> <br>
             """
        for label, color in bart_label_color_map.items():
            legend_html += f"&nbsp; <i class='fa fa-circle' style='color:{color}'></i> {label}<br>"
        legend_html += """
             </div>
             """
        mymap.get_root().html.add_child(folium.Element(legend_html))
        filename = os.path.splitext(os.path.basename(csv_file))[0] + ".html"
        output_path = os.path.join(os.path.dirname(csv_file), filename)
        mymap.save(output_path)

def generate_maps_in_subfolders(folder_path):
    for root, dirs, files in os.walk(folder_path):
        for dir_name in dirs:
            subfolder_path = os.path.join(root, dir_name)
            csv_files = [f for f in os.listdir(subfolder_path) if f.endswith('.csv') and '_all' in f]
            for csv_file in csv_files:
                csv_file_path = os.path.join(subfolder_path, csv_file)
                generate_map(csv_file_path)

# Example usage:
folder_path = "/Users/gayathri/Desktop/Format_1"
generate_maps_in_subfolders(folder_path)

  return 5 + (log_count / max_count) * scale_factor
  return 5 + (log_count / max_count) * scale_factor
  return 5 + (log_count / max_count) * scale_factor
  return 5 + (log_count / max_count) * scale_factor
  return 5 + (log_count / max_count) * scale_factor
  return 5 + (log_count / max_count) * scale_factor
  return 5 + (log_count / max_count) * scale_factor
  return 5 + (log_count / max_count) * scale_factor


In [34]:
import pandas as pd

def generate_domain_bart_dict(csv_file):
    # Read the CSV file into a DataFrame
    df = pd.read_csv(csv_file)
    
    # Initialize an empty dictionary to store domain names and corresponding BART labels
    domain_bart_dict = {}
    
    # Iterate through each row in the DataFrame
    for index, row in df.iterrows():
        domain_name = row['domain_name']
        bart_labels = str(row['bart_labels'])  # Convert bart_labels to string
        
        # If domain_name already exists in the dictionary, append the new BART label
        if domain_name in domain_bart_dict:
            domain_bart_dict[domain_name].append(bart_labels)
        # If domain_name is new, create a new entry with the BART label
        else:
            domain_bart_dict[domain_name] = [bart_labels]
    
    # Convert the dictionary values to strings
    for domain, labels in domain_bart_dict.items():
        domain_bart_dict[domain] = ', '.join(labels)
    
    return domain_bart_dict

# Example usage:
csv_file = '/Users/gayathri/Desktop/domain_label_details.csv'
domain_bart_dict = generate_domain_bart_dict(csv_file)
print(domain_bart_dict)


{'Format_1': 'nan', 'thecoaster': 'business, education, all, world, environment, politics, automobile, crime, health, sports', '1057thehawk': 'business, education, all, world, environment, politics, automobile, crime, health, sports', 'telemundo47': 'all, politics', 'westmilfordmessenger': 'business, education, all, environment, world, politics, crime, health, sports', 'tristatevoice': 'all, world, politics, sports', 'wlvt': 'business, all', '943thepoint': 'business, education, miscellaneous, world, environment, all, politics, automobile, crime, health, sports', 'thedrewacorn': 'business, education, all, world, environment, politics, health, sports', 'townshipjournal': 'business, education, all, world, environment, politics, crime, health, sports', 'princetoninfo': 'business, all', 'lavocedinewyork': 'business, education, all, world, environment, politics, crime, health, sports', 'northjersey': 'business, education, miscellaneous, environment, world, all, politics, automobile, crime, h