# GLBL 6060: Final Project

## Research Questions

How have the patterns of political contributions from the agricultural sector in India shifted since 2010, specifically towards the two major national political parties (Indian National Congress and Bharatiya Janata Party)? What trends might emerge in the future regarding the nature and extent of these donations?

## Setup

In [None]:
#!pip install numpy pandas scikit-learn tensorflow scikeras matplotlib requests beautifulsoup4 tqdm folium geopandas sentinelhub

In [None]:
# Standard-library module for reading and writing CSV files
import csv

# Standard-library access to environment variables (used to supply API credentials)
import os

# Concurrent execution module to run code in parallel, used for web scraping
from concurrent.futures import ThreadPoolExecutor, as_completed

# Library for encoding URL parameters
from urllib.parse import quote

# Fundamental package for scientific computing with Python
import numpy as np

# Data analysis and manipulation tool
import pandas as pd

# Library for creating static, animated, and interactive visualizations in Python
import matplotlib.pyplot as plt

# Library to send HTTP requests for pulling data from web servers
import requests

# Module to generate a new feature matrix consisting of all polynomial combinations of the features
from sklearn.preprocessing import PolynomialFeatures

# Standardize features by removing the mean and scaling to unit variance
from sklearn.preprocessing import StandardScaler

# Utilities for cross-validation and hyperparameter tuning
from sklearn.model_selection import KFold, train_test_split, GridSearchCV

# Convert a collection of text documents to a matrix of token counts
from sklearn.feature_extraction.text import CountVectorizer

# Compute cosine similarity between samples in two sets of data
from sklearn.metrics.pairwise import cosine_similarity

# TensorFlow and Keras are used for building and training neural network models
from tensorflow.keras.models import Sequential  # Base model class for sequential layers
from tensorflow.keras.layers import Dense, Input, Dropout, BatchNormalization  # Different types of neural network layers
from tensorflow.keras.callbacks import EarlyStopping  # Callback to stop training when a monitored metric has stopped improving

# Scikit-learn-style wrapper for Keras models
from scikeras.wrappers import KerasRegressor

# Library for parsing HTML and XML documents
from bs4 import BeautifulSoup

# A library for showing progress bars in loops during execution
from tqdm import tqdm

# A package that simplifies working with geographic data
import geopandas as gpd

# A package creates interactive maps
import folium

# Library that includes advance features such as heatmaps, minimaps, timestamped layers, and more
import folium.plugins as plugins

## Data Acquisition - Web Scraping

In [None]:
def fetch_all_donations(base_url, total_pages, timeout=30):
    """
    Fetches all donations from multiple pages of a website and returns them as a list of dictionaries.

    Each page is requested as ``f"{base_url}&page={page}"`` for pages 1..total_pages.
    A page that cannot be downloaded (network error, timeout, non-2xx status) is
    reported and skipped so that one bad page does not discard the pages already
    collected.

    Parameters:
        base_url (str): The base URL of the website to fetch donations from.
        total_pages (int): The total number of pages containing donation information.
        timeout (float): Seconds to wait for each HTTP response before giving up on that page.

    Returns:
        list: A list of dictionaries containing donation information. Each dictionary represents a donation and contains the following keys:
            - 'donor' (str): The name of the donor.
            - 'address' (str): The address of the donor. If the address is not available, it is an empty string.
            - 'amount' (str): The amount of donation.
            - 'year' (str): The year in which the donation was made.
    """
    all_donations = []  # List to store all donations

    # One session for the whole crawl so the underlying connection is reused
    with requests.Session() as session:
        for page in tqdm(range(1, total_pages + 1), desc="Fetching pages"):
            url = f"{base_url}&page={page}"  # Construct URL for current page

            try:
                response = session.get(url, timeout=timeout)
                response.raise_for_status()
            except requests.exceptions.RequestException as err:
                # Without a timeout a stalled server would hang the crawl forever;
                # without the status check an error page would be parsed as data.
                print(f"Skipping page {page}: {err}")
                continue

            soup = BeautifulSoup(response.text, 'html.parser')  # Parse HTML using BeautifulSoup

            # The first <tr> is skipped (treated as the header row)
            for row in soup.find_all('tr')[1:]:
                cols = row.find_all('td')

                # Rows with fewer than 7 cells cannot hold the fields read below
                if len(cols) < 7:
                    continue

                address = cols[2].text.strip()
                all_donations.append({
                    'donor': cols[1].text.strip(),
                    'address': '' if address == 'Not Available' else address,
                    'amount': cols[3].text.strip(),
                    'year': cols[6].text.strip(),
                })
    return all_donations

In [None]:
# The base URL for fetching donation information for BJP
base_url = "https://myneta.info/party/index.php?action=all_donors&id=3"
total_pages = 194

# Fetch donations from the specified URL and total pages
donations = fetch_all_donations(base_url, total_pages)

# Write donations to a CSV file.
# The header is spelled out (same keys and order that fetch_all_donations emits)
# instead of being read from donations[0], so an empty scrape still produces a
# valid header-only file rather than raising IndexError.
csv_file = "BJP_donations.csv"
with open(csv_file, 'w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=['donor', 'address', 'amount', 'year'])
    writer.writeheader()  # Write header row with column names
    writer.writerows(donations)  # Write each donation as a row

# Print confirmation message
print(f"Data written to {csv_file}")

In [None]:
# The base URL for fetching donation information for INC
base_url = "https://myneta.info/party/index.php?action=all_donors&id=1"
total_pages = 54

# Fetch donations from the specified URL and total pages
donations = fetch_all_donations(base_url, total_pages)

# Write donations to a CSV file.
# The header is spelled out (same keys and order that fetch_all_donations emits)
# instead of being read from donations[0], so an empty scrape still produces a
# valid header-only file rather than raising IndexError.
csv_file = "INC_donations.csv"
with open(csv_file, 'w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=['donor', 'address', 'amount', 'year'])
    writer.writeheader()  # Write header row with column names
    writer.writerows(donations)  # Write each donation as a row

# Print confirmation message
print(f"Data written to {csv_file}")

## Data Cleaning and Preparation

In [None]:
# NOTE(review): the scraping cells write BJP_donations.csv / INC_donations.csv with a
# lowercase 'year' column, while this cell reads *_data.csv and filters on 'Year'.
# Presumably the files were renamed/edited by hand in between — confirm.

# Read BJP data
df_bjp = pd.read_csv('/Users/rakkshetsinghaal/Desktop/Yale University/GLBL 6060/GLBL6060/Final Project - Code and Dataset/BJP_data.csv')

# Read INC data
df_inc = pd.read_csv('/Users/rakkshetsinghaal/Desktop/Yale University/GLBL 6060/GLBL6060/Final Project - Code and Dataset/INC_data.csv')

# Add 'party' column with respective party names
df_bjp['party'] = 'BJP'
df_inc['party'] = 'INC'

# Filter observations within the year 2010-2022
df_bjp = df_bjp[(df_bjp['Year'] >= 2010) & (df_bjp['Year'] <= 2022)]
df_inc = df_inc[(df_inc['Year'] >= 2010) & (df_inc['Year'] <= 2022)]

# Concatenate DataFrames
df_combined = pd.concat([df_bjp, df_inc], ignore_index=True)

# Split 'amount' column into separate columns based on space for combined DataFrame.
# The pieces land in integer-named columns 0, 1, 2, ...; piece 1 is kept as the amount.
# (Presumably the raw text looks like "Rs 5,00,000 5 Lacs+" — verify against the data.)
split_columns = df_combined['amount'].str.split(' ', expand=True)

# Concatenate split columns with the original DataFrame
df_combined = pd.concat([df_combined, split_columns], axis=1)

# Columns to drop from the DataFrame: the raw text plus pieces 0 and 2
columns_to_drop = ['amount', 0, 2]

# Drop specified columns from the DataFrame
df_combined = df_combined.drop(columns=columns_to_drop)

# Rename column '1' to 'amount'
df_combined.rename(columns={1: 'amount'}, inplace=True)

# Convert the amount to a number. Left as text (e.g. with thousands separators),
# the later groupby(...)['amount'].sum() calls would concatenate strings instead
# of adding values. Anything that is not a number becomes NaN and is dropped below.
df_combined['amount'] = pd.to_numeric(
    df_combined['amount'].str.replace(',', '', regex=False),
    errors='coerce',
)

# Drop rows with NaN values in 'amount' column
df_combined.dropna(subset=['amount'], inplace=True)

# Write combined DataFrame to a new CSV file
df_combined.to_csv('/Users/rakkshetsinghaal/Desktop/Yale University/GLBL 6060/GLBL6060/Final Project - Code and Dataset/combined_data.csv', index=False)

## Identification of Donations from Agriculture Sector

### Web Scraping Donor Name

In [None]:
def load_data(filepath):
    """
    Read a CSV file from disk and hand it back as a DataFrame.

    Parameters:
        filepath (str): Location of the CSV file to read.

    Returns:
        pandas.DataFrame: The parsed contents of the file.
    """
    frame = pd.read_csv(filepath)
    return frame

def search_donor_info(donor_name, api_key):
    """
    Look a donor up on Google through the ScrapingBee API.

    The donor name is wrapped in double quotes (exact-phrase search), URL-encoded,
    and requested via ScrapingBee with a 10 second timeout. On success the
    response body is cut into consecutive pieces of at most 20000 characters.
    On any ``requests`` failure a single-element list holding an error message
    is returned instead of raising.

    Parameters:
        donor_name (str): The name of the donor to search for.
        api_key (str): The API key for accessing the ScrapingBee API.

    Returns:
        list: Chunks of the response text, or one error string.
    """
    phrase = quote(f'"{donor_name}"')
    google_url = f"https://www.google.com/search?q={phrase}&num=5"
    query_params = {
        "api_key": api_key,
        "url": google_url,
        "custom_google": "true"
    }
    try:
        response = requests.get(
            url="https://app.scrapingbee.com/api/v1/",
            params=query_params,
            timeout=10
        )
        response.raise_for_status()
        body = response.text
        return [body[start:start + 20000] for start in range(0, len(body), 20000)]
    except requests.exceptions.RequestException as err:
        # Checked most-specific first, in the same precedence as separate
        # except clauses: HTTP error, then connection error, then timeout.
        if isinstance(err, requests.exceptions.HTTPError):
            return [f"HTTP Error: {str(err)}"]
        if isinstance(err, requests.exceptions.ConnectionError):
            return [f"Connection Error: {str(err)}"]
        if isinstance(err, requests.exceptions.Timeout):
            return [f"Timeout Error: {str(err)}"]
        return [f"Error: {str(err)}"]

def vector_analysis(df):
    """
    Perform vector analysis on donor information to identify agriculture-related content.

    For every row, the values of all columns whose name starts with 'donor_info'
    are joined into one text. Keyword occurrences in that text are counted and
    compared (cosine similarity) with the count vector of the keyword list itself.
    A row with no keyword hit scores 0.

    Parameters:
        df (pandas.DataFrame): The DataFrame containing donor information.

    Returns:
        pandas.Series: Series containing similarity scores of each donor's information to agriculture-related keywords.
    """
    info_columns = [col for col in df.columns if col.startswith('donor_info')]
    text = [
        ' '.join(str(value) for value in row_values)
        for row_values in df[info_columns].to_numpy(dtype=object).tolist()
    ]
    keywords = [
        'agriculture', 'farm', 'fertilizer', 'chemical', 'pesticide', 'herbicide', 'crop', 'irrigation', 'harvest',
        'tractor', 'agronomy', 'horticulture', 'harvester', 'agricultural machinery', 'farm tools', 'organic farming',
        'sustainable agriculture', 'soil health', 'soil nutrition', 'agri-tech', 'livestock', 'poultry', 'manure',
        'agribusiness', 'agrochemical', 'farm management', 'crop rotation', 'crop protection'
    ]
    # ngram_range=(1, 2) makes the vectorizer emit word pairs as well as single
    # words. With the default (1, 1) only single words are produced, so the
    # two-word vocabulary entries ('farm tools', 'crop rotation', ...) could
    # never be counted.
    # NOTE(review): 'agri-tech' still cannot match because the default tokenizer
    # splits on the hyphen; left as-is to avoid changing tokenization of the text.
    vectorizer = CountVectorizer(vocabulary=keywords, ngram_range=(1, 2))
    X = vectorizer.fit_transform(text)
    reference = vectorizer.transform([' '.join(keywords)]).toarray()
    similarity_scores = cosine_similarity(X, reference)
    return pd.Series(similarity_scores.ravel(), index=df.index)

def process_donors_concurrently(df, api_key, savepath):
    """
    Look up every donor in parallel, score the result, and append the scores to a CSV.

    One search task per entry of df['donor'] is submitted to a pool of 10 worker
    threads. As each task finishes, its text is scored with vector_analysis, the
    raw 'donor_info' text is discarded, and the row is buffered. The buffer is
    appended to ``savepath`` every 100 rows and once more at the end.

    The output therefore holds only the columns 'donor' and 'agricontentcheck',
    in completion order (not input order).
    NOTE(review): later cells read columns such as 'address' from files derived
    from this output — confirm how those columns are joined back in.

    Parameters:
        df (pandas.DataFrame): The DataFrame containing donor information.
        api_key (str): The API key for accessing the ScrapingBee API.
        savepath (str): The file path to save the processed data.

    Returns:
        None
    """
    batch_size = 100
    batch_data = []

    def flush(handle):
        # Write the header only when the file is still empty, then reset the buffer
        pd.concat(batch_data).to_csv(handle, header=handle.tell() == 0, index=False)
        batch_data.clear()

    # ThreadPoolExecutor / as_completed are the names this file imports; the
    # dotted form concurrent.futures.* would raise NameError because the
    # package itself is never imported.
    # newline='' stops the text layer from translating the line endings that
    # pandas already writes; the encoding is pinned so output is platform-independent.
    with ThreadPoolExecutor(max_workers=10) as executor, \
            open(savepath, 'a', newline='', encoding='utf-8') as f:
        future_to_donor = {
            executor.submit(search_donor_info, donor, api_key): donor
            for donor in df['donor']
        }
        for future in tqdm(as_completed(future_to_donor), total=len(future_to_donor), desc="Processing Donors"):
            donor_name = future_to_donor[future]
            try:
                result = future.result()
                donor_data = {'donor': donor_name}
                for i, info in enumerate(result, start=1):
                    donor_data[f'donor_info {i}'] = info
                temp_df = pd.DataFrame([donor_data])
                temp_df['agricontentcheck'] = vector_analysis(temp_df)
                # Drop 'donor_info' columns: only the score is kept
                temp_df.drop([col for col in temp_df.columns if col.startswith('donor_info')], axis=1, inplace=True)
                batch_data.append(temp_df)

                if len(batch_data) >= batch_size:
                    flush(f)
            except Exception as e:
                # Best effort: a failure for one donor must not stop the rest
                print(f"Error processing donor {donor_name}: {e}")

        if batch_data:  # Save any remaining data that didn't fill the last batch
            flush(f)

def main(filepath, savepath, api_key):
    """
    Entry point for the donor look-up step: load the donors, process them, report completion.

    Parameters:
        filepath (str): CSV file holding the donor data to process.
        savepath (str): CSV file the processed rows are appended to.
        api_key (str): ScrapingBee API key passed through to the look-ups.

    Returns:
        None
    """
    donors = load_data(filepath)
    process_donors_concurrently(donors, api_key, savepath)
    print("Processing completed and data saved.")

if __name__ == "__main__":
    # The ScrapingBee key is read from the environment instead of being written
    # into the notebook, so it is not leaked when the file is shared or committed.
    # NOTE(review): the key that was previously embedded here should be treated
    # as exposed and rotated.
    scrapingbee_api_key = os.environ.get('SCRAPINGBEE_API_KEY')
    if not scrapingbee_api_key:
        raise RuntimeError("Set the SCRAPINGBEE_API_KEY environment variable before running this cell.")
    main('/Users/rakkshetsinghaal/Desktop/Yale University/GLBL 6060/GLBL6060/Final Project - Code and Dataset/combined_data.csv', '/Users/rakkshetsinghaal/Desktop/Yale University/GLBL 6060/GLBL6060/Final Project - Code and Dataset/donor_combined_data.csv', scrapingbee_api_key)

In [None]:
# Load the per-donor similarity scores produced by the look-up step
df = pd.read_csv('/Users/rakkshetsinghaal/Desktop/Yale University/GLBL 6060/GLBL6060/Final Project - Code and Dataset/donor_combined_data.csv')

# Binary flag: 1 when the similarity score is strictly positive, otherwise 0
has_agri_keywords = df['agricontentcheck'] > 0
df['agri_content'] = has_agri_keywords.astype(int)

# Save the flagged data under a new file name
df.to_csv('/Users/rakkshetsinghaal/Desktop/Yale University/GLBL 6060/GLBL6060/Final Project - Code and Dataset/updated_donor_combined_data.csv', index=False)

### Geocoding

In [None]:
def get_coordinates(address, api_key):
    """
    Get latitude and longitude coordinates for a given address using the Geoapify Geocoding API.

    Parameters:
        address (str): The address to geocode.
        api_key (str): The API key for accessing the Geoapify Geocoding API.

    Returns:
        tuple: A tuple containing latitude and longitude coordinates (latitude, longitude).
               Returns (None, None) if the address is missing/blank, the request
               fails, or no coordinates are found.
    """
    # A missing address arrives from pandas as NaN (a float); geocoding the text
    # "nan" would waste a request and could return a bogus location.
    if not isinstance(address, str) or not address.strip():
        return None, None

    try:
        # params= lets requests URL-encode the address; interpolating it into the
        # URL breaks on characters such as '&' or '#'.
        response = requests.get(
            "https://api.geoapify.com/v1/geocode/search",
            params={"text": address, "limit": 1, "apiKey": api_key},
            timeout=10,
        )
    except requests.exceptions.RequestException:
        return None, None

    if response.status_code != 200:
        return None, None

    try:
        features = response.json().get("features")
    except ValueError:  # body was not valid JSON
        return None, None
    if not features:
        return None, None

    # The code reads coordinates as [longitude, latitude]
    coordinates = features[0]["geometry"]["coordinates"]
    latitude = coordinates[1]
    longitude = coordinates[0]
    return latitude, longitude

def process_addresses(filepath, savepath, api_key):
    """
    Geocode the 'address' column of a CSV and write the result to a new CSV.

    Two columns, 'latitude' and 'longitude', are added and filled row by row.
    The whole table is re-saved whenever the row's index label is a non-zero
    multiple of 100, and once more after the last row.

    Parameters:
        filepath (str): The file path to the input CSV file.
        savepath (str): The file path to save the modified CSV file.
        api_key (str): The API key for accessing the Geoapify Geocoding API.

    Returns:
        None
    """
    frame = pd.read_csv(filepath)
    for column in ('latitude', 'longitude'):
        frame[column] = None  # placeholder until the row is geocoded

    checkpoint_every = 100
    progress = tqdm(frame.iterrows(), total=frame.shape[0], desc="Geocoding in Process")
    for idx, record in progress:
        lat, lon = get_coordinates(record['address'], api_key)
        frame.at[idx, 'latitude'] = lat
        frame.at[idx, 'longitude'] = lon

        # Periodic checkpoint so a crash does not lose all finished rows
        reached_checkpoint = idx != 0 and idx % checkpoint_every == 0
        if reached_checkpoint:
            frame.to_csv(savepath, index=False)
            print(f"Partial CSV saved at {savepath}")

    # Final write with every row processed
    frame.to_csv(savepath, index=False)
    print("Final CSV saved.")

if __name__ == "__main__":
    # The Geoapify key is read from the environment instead of being written
    # into the notebook, so it is not leaked when the file is shared or committed.
    # NOTE(review): the key that was previously embedded here should be treated
    # as exposed and rotated.
    geoapify_api_key = os.environ.get('GEOAPIFY_API_KEY')
    if not geoapify_api_key:
        raise RuntimeError("Set the GEOAPIFY_API_KEY environment variable before running this cell.")

    # Run the process_addresses function with input and output file paths and API key
    process_addresses('/Users/rakkshetsinghaal/Desktop/Yale University/GLBL 6060/GLBL6060/Final Project - Code and Dataset/updated_donor_combined_data.csv', '/Users/rakkshetsinghaal/Desktop/Yale University/GLBL 6060/GLBL6060/Final Project - Code and Dataset/address_donor_combined_data.csv', geoapify_api_key)

### Identification of Farm Land using Satellite Imagery

In [None]:
def get_satellite_image(lat, lng, config):
    """
    Fetches a satellite image from the Sentinel-2 satellite data for a given latitude and longitude.

    The request covers a box of +/-0.009 degrees around the point, at 10 m
    resolution, for the interval 2020-06-01 to 2020-06-30, and asks for two
    bands (B04 and B08).

    NOTE(review): BBox, CRS, bbox_to_dimensions, SentinelHubRequest,
    DataCollection and MimeType are not imported in the visible part of this
    file — presumably `from sentinelhub import ...` is needed; confirm.

    Parameters:
        lat (float): Latitude of the location.
        lng (float): Longitude of the location.
        config (SHConfig): Configuration object for Sentinel Hub requests.

    Returns:
        array: A numpy array representing the fetched satellite image, or None if
               the coordinates are missing, the request raises ValueError, or no
               images are available.
    """
    # Rows whose geocoding failed carry None/NaN coordinates; there is nothing
    # to request for them, so skip the API call entirely.
    if pd.isna(lat) or pd.isna(lng):
        return None

    try:
        # Define the bounding box around the specified latitude and longitude
        half_side = 0.009
        bbox_coords = [lng - half_side, lat - half_side, lng + half_side, lat + half_side]
        resolution = 10  # Spatial resolution in meters
        bbox = BBox(bbox=bbox_coords, crs=CRS.WGS84)
        size = bbox_to_dimensions(bbox, resolution=resolution)

        # Sentinel Hub evalscript to fetch the Red and NIR bands
        evalscript_all_bands = """
            //VERSION=3
            function setup() {
                return {
                    input: ["B04", "B08"],  // Red and NIR bands
                    output: { bands: 2 }
                };
            }
            function evaluatePixel(sample) {
                return [sample.B04, sample.B08];
            }
        """

        # Create a Sentinel Hub request for the specified bounding box, time interval, and bands
        request = SentinelHubRequest(
            data_folder='/mnt/data',
            evalscript=evalscript_all_bands,
            input_data=[
                SentinelHubRequest.input_data(
                    data_collection=DataCollection.SENTINEL2_L2A,
                    time_interval=('2020-06-01', '2020-06-30'),
                )
            ],
            responses=[
                SentinelHubRequest.output_response('default', MimeType.TIFF)
            ],
            bbox=bbox,
            size=size,
            config=config
        )

        # Fetch the data and return the first image, if available
        images = request.get_data()
        return images[0] if images else None
    except ValueError:
        return None

def calculate_ndvi(image):
    """
    Compute the mean NDVI, (NIR - Red) / (NIR + Red), over an image.

    Channel 0 of the last axis is read as Red and channel 1 as NIR. Pixels
    where the ratio is undefined (NaN) are left out of the mean.

    Parameters:
        image (array): A numpy array indexed as [row, column, band] with Red and NIR bands.

    Returns:
        float: The average NDVI value for the image.
    """
    # Work in floating point so integer inputs cannot wrap or truncate
    bands = image.astype(float)
    red_band = bands[:, :, 0]
    nir_band = bands[:, :, 1]

    difference = nir_band - red_band
    total = nir_band + red_band

    # Silence the warnings for 0/0 pixels; they become NaN and are skipped below
    with np.errstate(divide='ignore', invalid='ignore'):
        ndvi_map = difference / total

    return np.nanmean(ndvi_map)

def process_dataset(filepath, savepath, config):
    """
    Label every row of a dataset as farm / not farm from satellite NDVI.

    For each row, an image is requested for its 'latitude'/'longitude'. The
    'farmland' column is set to 'Farm' when the mean NDVI exceeds 0.3,
    'Not Farm' otherwise, or 'Image Not Available' when no image came back.
    The table is re-saved whenever the row's index label is a non-zero multiple
    of 100, and once more at the end.

    Parameters:
        filepath (str): The file path to the input dataset file.
        savepath (str): The file path to save the modified dataset file.
        config (SHConfig): Configuration object for Sentinel Hub requests.

    Returns:
        None
    """
    frame = pd.read_csv(filepath)

    # Every row starts out unevaluated and is overwritten in the loop below
    frame['farmland'] = 'Not Evaluated'

    checkpoint_every = 100
    progress = tqdm(frame.iterrows(), total=frame.shape[0], desc="Processing")
    for idx, record in progress:
        image = get_satellite_image(record['latitude'], record['longitude'], config)

        if image is None:
            label = 'Image Not Available'
        else:
            # Mean NDVI above 0.3 is taken to indicate farmland
            mean_ndvi = calculate_ndvi(image)
            label = 'Farm' if mean_ndvi > 0.3 else 'Not Farm'
        frame.at[idx, 'farmland'] = label

        # Periodic checkpoint so a crash does not lose all finished rows
        reached_checkpoint = idx != 0 and idx % checkpoint_every == 0
        if reached_checkpoint:
            frame.to_csv(savepath, index=False)
            print(f"Partial dataset saved at {savepath}")

    # Final write with every row processed
    frame.to_csv(savepath, index=False)
    print("Final dataset saved.")

if __name__ == "__main__":
    # Sentinel Hub OAuth credentials are read from the environment instead of
    # being written into the notebook, so they are not leaked when the file is
    # shared or committed.
    # NOTE(review): the client id/secret previously embedded here should be
    # treated as exposed and rotated.
    sh_client_id = os.environ.get('SH_CLIENT_ID')
    sh_client_secret = os.environ.get('SH_CLIENT_SECRET')
    if not sh_client_id or not sh_client_secret:
        raise RuntimeError("Set the SH_CLIENT_ID and SH_CLIENT_SECRET environment variables before running this cell.")

    # Configuration for accessing the Sentinel Hub API
    # NOTE(review): SHConfig is not imported in the visible part of this file — confirm.
    config = SHConfig()
    config.sh_client_id = sh_client_id
    config.sh_client_secret = sh_client_secret

    # Run the process_dataset function with input and output file paths and configuration
    process_dataset('/Users/rakkshetsinghaal/Desktop/Yale University/GLBL 6060/GLBL6060/Final Project - Code and Dataset/address_donor_combined_data.csv', '/Users/rakkshetsinghaal/Desktop/Yale University/GLBL 6060/GLBL6060/Final Project - Code and Dataset/updated_address_donor_combined_data.csv', config)

In [None]:
# Read the updated combined dataset.
# The path points at the same directory the satellite step writes to and the
# next cell reads from; the earlier '.../GLBL 6060/Final Project/...' location
# is not produced or consumed by any other cell.
df = pd.read_csv('/Users/rakkshetsinghaal/Desktop/Yale University/GLBL 6060/GLBL6060/Final Project - Code and Dataset/updated_address_donor_combined_data.csv')

# Create a new column 'agri_land': 1 when the row was labelled "Farm", otherwise 0
# ('Not Farm', 'Image Not Available' and 'Not Evaluated' all map to 0)
df['agri_land'] = (df['farmland'] == "Farm").astype(int)

# Write the modified DataFrame to a new CSV file
df.to_csv('/Users/rakkshetsinghaal/Desktop/Yale University/GLBL 6060/GLBL6060/Final Project - Code and Dataset/complete_combined_data.csv', index=False)

In [None]:
# Load the dataset that carries both the 'agri_content' and 'agri_land' flags
df_combined = pd.read_csv(
    filepath_or_buffer='/Users/rakkshetsinghaal/Desktop/Yale University/GLBL 6060/GLBL6060/Final Project - Code and Dataset/complete_combined_data.csv'
)

# Combine the two agriculture indicators into a single flag
def set_agri_sector(row):
    """Return 1 when either 'agri_content' or 'agri_land' equals 1 for this row, else 0."""
    flagged = row['agri_content'] == 1 or row['agri_land'] == 1
    return 1 if flagged else 0

# Apply the function to create the agri_sector column
df_combined['agri_sector'] = df_combined.apply(set_agri_sector, axis=1)

# Write the updated DataFrame to a new CSV file.
# The name is 'final_donations_dataset.csv' (plural) because that is the file
# every later cell (plots, map, model) reads.
df_combined.to_csv('/Users/rakkshetsinghaal/Desktop/Yale University/GLBL 6060/GLBL6060/Final Project - Code and Dataset/final_donations_dataset.csv', index=False)

## Data Visualization

In [None]:
# Load the dataset from the specified file path
file_path = '/Users/rakkshetsinghaal/Desktop/Yale University/GLBL 6060/GLBL6060/Final Project - Code and Dataset/final_donations_dataset.csv'
data = pd.read_csv(file_path)

# Filtering data for the agriculture sector only
agri_data = data[data['agri_sector'] == 1]

# Grouping data by year and party, and summing donation amounts
party_yearly_agri = agri_data.groupby(['year', 'party'])['amount'].sum().unstack(fill_value=0)

# Plotting the first graph: Aggregate Donated Amount From Agriculture Sector Party-wise Over the Years.
# The axes are created explicitly and passed to DataFrame.plot: without ax=,
# DataFrame.plot opens its own figure, leaving the sized figure empty and the
# requested figsize unused.
fig, ax = plt.subplots(figsize=(10, 6))
party_yearly_agri.plot(kind='line', ax=ax)
ax.set_title('Aggregate Donated Amount From Agriculture Sector Party-wise Over the Years')
ax.set_xlabel('Year')
ax.set_ylabel('Total Donation Amount')
ax.grid(True)
ax.legend(title='Party')
fig.savefig('agri_donations_partywise.png')
plt.show()

# Readable names for the two values of agri_sector. Renaming by key (after
# forcing both columns to exist) stays correct even if one sector has no rows,
# whereas assigning a two-item list to .columns would raise in that case.
sector_labels = {0: 'Non-Agriculture Sector', 1: 'Agriculture Sector'}

# Grouping data by year and agriculture sector status, and summing donation amounts
sector_yearly_agri = (
    data.groupby(['year', 'agri_sector'])['amount'].sum()
    .unstack(fill_value=0)
    .reindex(columns=[0, 1], fill_value=0)
    .rename(columns=sector_labels)
)

# Plotting the second graph: Aggregate Donated Amount From Agriculture and Non-Agriculture Sectors Over the Years
fig, ax = plt.subplots(figsize=(10, 6))
sector_yearly_agri.plot(kind='line', ax=ax)
ax.set_title('Aggregate Donated Amount From Agriculture and Non-Agriculture Sectors Over the Years')
ax.set_xlabel('Year')
ax.set_ylabel('Total Donation Amount')
ax.grid(True)
ax.legend(title='Sector')
fig.savefig('agri_vs_non_agri_donations.png')
plt.show()

# Grouping data by year, party, and agriculture sector status, and summing donation amounts
party_sector_yearly = (
    data.groupby(['year', 'party', 'agri_sector'])['amount'].sum()
    .unstack(fill_value=0)
    .reindex(columns=[0, 1], fill_value=0)
    .rename(columns=sector_labels)
)

# Plotting the third graph: Aggregate Donated Amount From Agriculture and Non-Agriculture Sectors Party-wise Over the Years
fig, ax = plt.subplots(figsize=(12, 8))
for party in party_sector_yearly.index.get_level_values('party').unique():
    party_data = party_sector_yearly.xs(party, level='party')
    ax.plot(party_data.index, party_data['Agriculture Sector'], label=f'{party} - Agriculture')
    ax.plot(party_data.index, party_data['Non-Agriculture Sector'], label=f'{party} - Non-Agriculture', linestyle='--')

ax.set_title('Aggregate Donated Amount From Agriculture and Non-Agriculture Sectors Party-wise Over the Years')
ax.set_xlabel('Year')
ax.set_ylabel('Total Donation Amount')
ax.grid(True)
ax.legend(title='Party and Sector', bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
fig.savefig('agri_vs_non_agri_partywise_donations.png')
plt.show()

## Mapping Donations Over the Years

In [None]:
# Load the data
file_path = '/Users/rakkshetsinghaal/Desktop/Yale University/GLBL 6060/GLBL6060/Final Project - Code and Dataset/final_donations_dataset.csv'
df = pd.read_csv(file_path)

# Filter out any rows that may have NaN values in 'latitude', 'longitude', or 'year'
df = df.dropna(subset=['latitude', 'longitude', 'year'])

# Create a GeoDataFrame
gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.longitude, df.latitude))
gdf.set_crs('EPSG:4326', inplace=True)  # Set the CRS for latitude and longitude

# Load the boundary of India
india = gpd.read_file("https://naturalearth.s3.amazonaws.com/110m_cultural/ne_110m_admin_0_countries.zip")
india = india[india.NAME == "India"]

# Clip the points to only those within India
gdf = gpd.clip(gdf, india)

# Initialize the map
india_map = folium.Map(location=[20.5937, 78.9629], zoom_start=5)

# One FeatureCollection per year, in chronological order
time_indexed_data = []

unique_years = sorted(gdf['year'].unique())

for year in unique_years:
    # A 'year' column that ever contained NaN is read as float, so str(year)
    # would give e.g. "2014.0"; cast to int first to get a plain "2014" timestamp.
    time_label = str(int(year))
    year_rows = gdf[gdf['year'] == year]
    features = year_rows.apply(
        lambda row: {
            'type': 'Feature',
            'geometry': {
                'type': 'Point',
                'coordinates': [row['longitude'], row['latitude']]
            },
            'properties': {
                'time': time_label,
                'style': {
                    'color': 'blue' if row['agri_sector'] == 1 else 'yellow',
                    'radius': 10,
                },
                'icon': 'circle',
                'popup': 'Sector: Agriculture' if row['agri_sector'] == 1 else 'Sector: Non-Agriculture',
            }
        }, axis=1
    ).tolist()
    time_indexed_data.append({'type': 'FeatureCollection', 'features': features})

# Flatten the per-year collections into one list (linear time; sum(lists, [])
# re-copies the accumulated list on every addition)
all_features = [
    feature
    for collection in time_indexed_data
    for feature in collection['features']
]

# Create the timestamped geoJSON
plugins.TimestampedGeoJson({
    'type': 'FeatureCollection',
    'features': all_features
}, period='P1Y', add_last_point=True, auto_play=True).add_to(india_map)

# Save the map to an HTML file
india_map.save('/Users/rakkshetsinghaal/Desktop/Yale University/GLBL 6060/GLBL6060/Final Project - Code and Dataset/india_animated_map.html')
india_map

## Neural Network Prediction of Agricultural Sector Donations

In [None]:
# Global variable for polynomial features transformer.
# Shared on purpose: load_and_preprocess_data() fits it on the training years and
# predict_future_donations_by_party() reuses the same instance via transform().
# degree=2 with include_bias=False yields the input plus its square, no constant column.
poly_transformer = PolynomialFeatures(degree=2, include_bias=False)

def load_and_preprocess_data(filepath, party):
    """
    Load and preprocess data from a CSV file, adding features for analysis.

    Only agriculture-sector rows (agri_sector == 1) belonging to ``party`` are
    used. Donation amounts are summed per year, and one feature row is built per
    year with the columns, in this order:
    'year', 'policy_year', 'party_in_power', 'party', 'year^2',
    'power_policy_interaction'.
    predict_future_donations_by_party() assembles its inputs in the same order.

    NOTE(review): 'party_in_power' is 1 for years >= 2014 regardless of ``party``
    — presumably it means "BJP in power"; confirm the intended meaning for INC.

    Parameters:
        filepath (str): The file path to the input dataset file.
        party (str): The political party for which the data is being processed ('BJP' or 'INC').

    Returns:
        tuple: A tuple containing features, target, and aggregated historical data.

    Raises:
        ValueError: If the file holds no agriculture-sector rows for ``party``.
    """
    data = pd.read_csv(filepath)

    # Restrict to this party's agriculture-sector donations. Without the party
    # filter both parties would be trained and plotted on the same combined series.
    party_agri_data = data[(data['agri_sector'] == 1) & (data['party'] == party)]

    # Aggregate donation amounts by year
    aggregated_data = party_agri_data.groupby('year')['amount'].sum().reset_index()
    if aggregated_data.empty:
        raise ValueError(f"No agriculture-sector donations found for party {party!r} in {filepath}")

    # Add features. .copy() gives an independent frame, so the assignments below
    # do not write into a slice of aggregated_data.
    features = aggregated_data[['year']].copy()
    features['policy_year'] = (features['year'] >= 2022).astype(int)
    features['party_in_power'] = (features['year'] >= 2014).astype(int)
    features['party'] = 1 if party == 'BJP' else 0  # 1 for BJP, 0 for INC

    # Apply polynomial transformation to the 'year' feature (this also fits the
    # shared transformer that the prediction step reuses)
    year_poly = poly_transformer.fit_transform(features[['year']])
    feature_names = ['year'] + [f'year^{i}' for i in range(2, poly_transformer.degree + 1)]
    features_poly = pd.DataFrame(year_poly, columns=feature_names)
    features = pd.concat([features, features_poly.drop('year', axis=1)], axis=1)

    # Add interaction term for party_in_power and policy_year to model the impact of policies by the ruling party
    features['power_policy_interaction'] = features['party_in_power'] * features['policy_year']

    target = aggregated_data['amount']
    return features, target, aggregated_data

def build_model(input_shape):
    """
    Assemble and compile the feed-forward regression network.

    Layout: input -> two blocks of (Dense relu, BatchNormalization, Dropout 0.2)
    with 128 and 64 units -> Dense 32 relu -> Dense 1 output.
    Compiled with the Adam optimizer, mean-squared-error loss, and MAE/MSE metrics.

    Parameters:
        input_shape (int): Number of input features per sample.

    Returns:
        Sequential: A compiled Keras Sequential model.
    """
    network = Sequential()
    network.add(Input(shape=(input_shape,)))

    # Two regularised hidden blocks of decreasing width
    for units in (128, 64):
        network.add(Dense(units, activation='relu'))
        network.add(BatchNormalization())
        network.add(Dropout(0.2))

    network.add(Dense(32, activation='relu'))
    network.add(Dense(1))  # Single regression output

    network.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae', 'mse'])
    return network

def train_and_evaluate_for_party(filepath, party):
    """
    Run 5-fold cross-validated training for one party and report per-fold scores.

    The target is standardised with a StandardScaler. For each fold a fresh
    model is built and trained for up to 1000 epochs, with early stopping on the
    held-out fold's loss (patience 100, best weights restored). The model
    returned is the one trained in the last fold.

    Parameters:
        filepath (str): The file path to the input dataset file.
        party (str): The political party for which the model is being trained ('BJP' or 'INC').

    Returns:
        tuple: (model from the final fold, fitted target scaler, mean loss over folds, historical data).
    """
    features, target, historical_data = load_and_preprocess_data(filepath, party)

    scaler = StandardScaler()
    target_column = target.values.reshape(-1, 1)
    target_normalized = scaler.fit_transform(target_column).flatten()

    splitter = KFold(n_splits=5, shuffle=True, random_state=42)
    fold_losses = []

    for fold_no, (train_idx, test_idx) in enumerate(splitter.split(features, target_normalized), start=1):
        X_train = features.iloc[train_idx]
        X_test = features.iloc[test_idx]
        y_train = target_normalized[train_idx]
        y_test = target_normalized[test_idx]

        model = build_model(X_train.shape[1])
        early_stop = EarlyStopping(monitor='val_loss', patience=100, restore_best_weights=True)
        model.fit(
            X_train,
            y_train,
            epochs=1000,
            batch_size=320,
            validation_data=(X_test, y_test),
            verbose=0,
            callbacks=[early_stop],
        )

        # evaluate() returns [loss, mae, mse], matching the compile() metrics
        scores = model.evaluate(X_test, y_test, verbose=0)
        print(f'Party {party} - Scores for fold {fold_no}: MAE of {scores[1]}, MSE of {scores[2]}')
        fold_losses.append(scores[0])

    return model, scaler, np.mean(fold_losses), historical_data

def predict_future_donations_by_party(model, scaler, start_year, policy_year, party_in_power, party, n_years=5):
    """
    Predict future donations for a given political party.

    One prediction is made for each year in
    ``range(start_year, start_year + n_years)``. The feature vector is assembled
    in the column order produced by load_and_preprocess_data():
    year, policy_year, party_in_power, party flag, higher-order year terms,
    party_in_power * policy_year.

    Relies on the module-level ``poly_transformer`` having been fitted already
    (load_and_preprocess_data() does this).

    Parameters:
        model (Sequential): The trained Keras model for donation prediction.
        scaler (StandardScaler): The fitted scaler object for target normalization.
        start_year (int): The starting year for predictions.
        policy_year (int): Binary indicator for policy implementation year.
        party_in_power (int): Binary indicator for party in power.
        party (str): The political party for which predictions are being made ('BJP' or 'INC').
        n_years (int): How many consecutive years to predict, starting at start_year.

    Returns:
        dict: Maps each predicted year to its donation amount on the original (un-normalized) scale.
    """
    party_flag = 1 if party == 'BJP' else 0
    interaction = party_in_power * policy_year  # Interaction term

    predictions = {}
    for year in range(start_year, start_year + n_years):
        year_poly = poly_transformer.transform([[year]])
        input_features = np.concatenate((
            [year, policy_year, party_in_power, party_flag],
            year_poly[0, 1:],  # skip column 0, which is the year itself
            [interaction],
        ))
        input_features = input_features.reshape(1, -1)
        # verbose=0: keep the per-call progress bar out of the notebook output
        predicted_normalized_amount = model.predict(input_features, verbose=0)
        predicted_amount = scaler.inverse_transform(predicted_normalized_amount.reshape(-1, 1))
        predictions[year] = predicted_amount[0, 0]
    return predictions

def visualize_donations(aggregated_data, predictions, party):
    """
    Plot a party's historical yearly donations together with its predicted ones.

    Historical values are drawn as a solid line and predictions, when there are
    any, as a dashed line. The figure is saved as
    'year-wise_donation_trend_<party>.png' and then shown.

    Parameters:
        aggregated_data (DataFrame): Historical data with 'year' and 'amount' columns.
        predictions (dict): Predicted donation amount keyed by year.
        party (str): The political party for visualization ('BJP' or 'INC').

    Returns:
        None
    """
    plt.figure(figsize=(10, 6))

    # Historical series
    plt.plot(
        aggregated_data['year'],
        aggregated_data['amount'],
        label=f'Actual Donations ({party})',
        linestyle='-',
    )

    # Predicted series, skipped when there is nothing to draw
    prediction_years = np.array(list(predictions.keys()))
    prediction_amounts = np.array(list(predictions.values()))
    nothing_to_draw = len(prediction_years) == 0 or len(prediction_amounts) == 0
    if not nothing_to_draw:
        plt.plot(prediction_years, prediction_amounts, label=f'Predicted Donations ({party})', linestyle='--')

    plt.title(f'Year-wise Donation Trends for {party}')
    plt.xlabel('Year')
    plt.ylabel('Aggregated Donation Amount')
    plt.legend()
    plt.grid(True)
    plt.savefig(f'year-wise_donation_trend_{party}.png')
    plt.show()

# Train, predict, and plot for each national party in turn
parties = ['BJP', 'INC']
predictions_by_party = {}
party_in_power_status = 1  # Since BJP is in power
dataset_path = '/Users/rakkshetsinghaal/Desktop/Yale University/GLBL 6060/GLBL6060/Final Project - Code and Dataset/final_donations_dataset.csv'

for party in parties:
    # Cross-validated training; the model from the last fold is used for prediction
    model, scaler, average_loss, historical_data = train_and_evaluate_for_party(dataset_path, party)
    print(f'Party {party} - Average Loss: {average_loss}')

    # Forecast from 2022 onward with the policy indicator switched on
    predictions = predict_future_donations_by_party(
        model,
        scaler,
        2022,
        1,
        party_in_power_status,
        party,
    )
    predictions_by_party[party] = predictions

    visualize_donations(historical_data, predictions, party)