In [1]:
import pandas as pd
import geopandas as gpd
import folium
from folium.plugins import HeatMap
from shapely import wkt
import re
import spacy
import csv
import random
from spacy.lang.en.examples import sentences 
from geopy.distance import great_circle
from geopy.exc import GeocoderTimedOut, GeocoderServiceError
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import random
import string
from difflib import SequenceMatcher



In [4]:
def process_csv(csv_file, output_folder):
    data = pd.read_csv(csv_file)
    grouped = data.groupby('bart_label')
    for bart_label, group in grouped:
        filename = os.path.join(output_folder, f'{os.path.splitext(os.path.basename(csv_file))[0]}_{bart_label.lower()}.csv')  
        group.to_csv(filename, index=False)

folder_path = '/Users/gayathri/Desktop/Domain_Label'
output_folder = '/Users/gayathri/Desktop/Domain_Label'
os.makedirs(output_folder, exist_ok=True)

# Iterate through each file in the folder
for file_name in os.listdir(folder_path):
    if file_name.endswith(".csv"):
        csv_file = os.path.join(folder_path, file_name)
        process_csv(csv_file, output_folder)


In [5]:
import os

def count_files_by_prefix(folder_path):
    file_counts = {}
    for file_name in os.listdir(folder_path):
        prefix = file_name.split('_')[0]
        file_counts[prefix] = file_counts.get(prefix, 0) + 1
    for prefix, count in file_counts.items():
        print(f"{prefix}: {count} files")

folder_path = '/Users/gayathri/Desktop/Domain_Label'

count_files_by_prefix(folder_path)

920thejersey.csv: 1 files
6abc: 10 files
943thepoint: 10 files
1057thehawk: 9 files
hellenicnews: 5 files
dailyvoice: 9 files
literock969: 10 files
baristanet: 8 files
943thepoint.csv: 1 files
.DS: 1 files
dailyvoice.csv: 1 files
am970theanswer: 8 files
forward.csv: 1 files
920thejersey: 7 files
forward: 6 files
6abc.csv: 1 files
hellenicnews.csv: 1 files
baristanet.csv: 1 files
literock969.csv: 1 files
am970theanswer.csv: 1 files
1057thehawk.csv: 1 files


In [6]:
###This is a verication code to check if we could successfully generate the sub bart_label files #####
def print_unique_bart_labels(folder_path):
    for file_name in os.listdir(folder_path):
        if file_name.endswith(".csv"):
            csv_file = os.path.join(folder_path, file_name)
            df = pd.read_csv(csv_file)
            unique_labels = df['bart_label'].unique()
            print(f"File: {file_name}, Total unique bart_labels: {len(unique_labels)}")
            print("Unique bart_labels:")
            for label in unique_labels:
                print(label)
            print()  

folder_path = '/Users/gayathri/Desktop/Domain_Label/main_data'

print_unique_bart_labels(folder_path)

File: 920thejersey.csv, Total unique bart_labels: 7
Unique bart_labels:
Business
World
Sports
Automobile
Environment
Crime
Health

File: 943thepoint.csv, Total unique bart_labels: 10
Unique bart_labels:
World
Business
Crime
Automobile
Environment
Health
Miscellaneous
Sports
Politics
Education

File: dailyvoice.csv, Total unique bart_labels: 9
Unique bart_labels:
Business
Automobile
Crime
World
Politics
Health
Environment
Sports
Education

File: forward.csv, Total unique bart_labels: 6
Unique bart_labels:
World
Business
Health
Politics
Crime
Sports

File: 6abc.csv, Total unique bart_labels: 10
Unique bart_labels:
World
Sports
Crime
Miscellaneous
Health
Business
Environment
Politics
Automobile
Education

File: hellenicnews.csv, Total unique bart_labels: 5
Unique bart_labels:
World
Sports
Business
Politics
Health

File: baristanet.csv, Total unique bart_labels: 8
Unique bart_labels:
World
Politics
Business
Education
Sports
Health
Environment
Crime

File: literock969.csv, Total unique bart

In [8]:
###This is to put all the files of same domain to one folder###
import os
import shutil

def organize_csv_files(folder_path):
    # Create a dictionary to store folder paths for each prefix
    folder_paths = {}
    
    for file_name in os.listdir(folder_path):
        if file_name.endswith(".csv"):
            prefix = file_name.split('_')[0]
            
            if prefix not in folder_paths:
                folder_paths[prefix] = os.path.join(folder_path, prefix)
                folder_path_suffix = 0
                while os.path.exists(folder_paths[prefix]):
                    folder_path_suffix += 1
                    folder_paths[prefix] = os.path.join(folder_path, f"{prefix}_{folder_path_suffix}")
                os.makedirs(folder_paths[prefix], exist_ok=True)
            
            src_file = os.path.join(folder_path, file_name)
            dst_file = os.path.join(folder_paths[prefix], file_name)
            shutil.move(src_file, dst_file)

folder_path = '/Users/gayathri/Desktop/Domain_Label'

organize_csv_files(folder_path)

In [9]:
###Now, let's generate the pinpoint map for all the sub files of multiple domains###

In [20]:
def process_csv_file(csv_file, color):
    df = pd.read_csv(csv_file)    
    df['gpe_sum_log'] = np.log(df['gpe_sum']) / np.log(np.e)  # Calculate gpe_sum_log
    map_zoom = 10
    max_occurrences_row = df.loc[df['gpe_sum'].idxmax()]
    highest_occurrence = [highest_occurrences_latitude, highest_occurrences_longitude]
    mymap = folium.Map(location=highest_occurrence, zoom_start=map_zoom)

    max_count = df["gpe_sum_log"].max()

    def get_marker_size(log_count):
        scale_factor = 10  
        return 5 + (log_count / max_count) * scale_factor

    for index, row in df.iterrows():
        location = row["gpe"]
        latitude = row["gpe_latitude"]
        longitude = row["gpe_longitude"]
        count = row["gpe_sum"]
        log_count = row["gpe_sum_log"]
        popup_text = f"Location: {location}\nCount: {count}"
        marker_size = get_marker_size(log_count)
        folium.CircleMarker(
            [latitude, longitude],
            popup=popup_text,
            radius=marker_size,
            color=color,
            fill=True,
            fill_color=color
        ).add_to(mymap)

    filename = os.path.splitext(os.path.basename(csv_file))[0] + ".html"
    output_file = os.path.join(os.path.dirname(csv_file), filename)
    mymap.save(output_file)

folder_path = "/Users/gayathri/Desktop/Domain_Label/literock969"

color_palette = ['#e6194b', '#3cb44b', '#194fff', '#873ec7', '#de6109', 
                 '#911eb4', '#f7890a', '#f032e6', '#bcf60c', '#e62e2e']

color_index = 0
for filename in os.listdir(folder_path):
    if filename.endswith(".csv"):
        csv_file = os.path.join(folder_path, filename)
        color = color_palette[color_index % len(color_palette)]
        process_csv_file(csv_file, color)
        color_index += 1