In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import re
import os
import matplotlib.pyplot as plt
import nltk
from collections import OrderedDict

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer() 
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer


[nltk_data] Downloading package punkt to /home/riya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/riya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/riya/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  "class": algorithms.Blowfish,


In [2]:
relative_path_sdg = "../../sdg_data.csv"
relative_path_attr = "../Dictionary/Attributes_2019-20.csv"
relative_string_map = "./Data_Indicator/String_map.csv"
relative_semantic = "./Data_Indicator/Semantic_map.csv"
relative_sem_threshold = "./Data_Indicator/Semantic_threshold_map.csv"
relative_rank = "./Data_Indicator/Ranking.csv"
relative_rank_unroll = "./Data_Indicator/Ranking_unroll.csv"
relative_tab_filter = "./Data_Indicator/Ranking_tab_filter.csv"
relative_chap_filter = "./Data_Indicator/Ranking_chap_filter.csv"
relative_final_map = "./Mapping_Indicator_2019-20.csv"
relative_count = "./Data_Indicator/Comparison_count.csv"


# Get the absolute path by joining the current directory with the relative path
absolute_path_sdg = os.path.normpath(os.path.join(os.getcwd(), relative_path_sdg))
absolute_path_attr = os.path.normpath(os.path.join(os.getcwd(), relative_path_attr))
absolute_path_string = os.path.normpath(os.path.join(os.getcwd(), relative_string_map))
absolute_path_semantic = os.path.normpath(os.path.join(os.getcwd(), relative_semantic))
absolute_path_sem_threshold =os.path.normpath( os.path.join(os.getcwd(), relative_sem_threshold))
absolute_path_rank =os.path.normpath( os.path.join(os.getcwd(), relative_rank))
absolute_path_rank_unroll =os.path.normpath( os.path.join(os.getcwd(), relative_rank_unroll))
absolute_path_tab_filter =os.path.normpath( os.path.join(os.getcwd(), relative_tab_filter))
absolute_path_chap_filter =os.path.normpath( os.path.join(os.getcwd(), relative_chap_filter))
absolute_path_final_map =os.path.normpath( os.path.join(os.getcwd(), relative_final_map))
absolute_path_count =os.path.normpath( os.path.join(os.getcwd(), relative_count))

In [3]:
def process_result(result_sdg, sdg_df, chapter_df):
    rows = []
    
    for _, row in result_sdg.iterrows():
        attr_id = row['Rank']
        goal_no = row['Goal No.']
        target_id = row['Target_id']
        indicator_id = row['Indicator_id']
        
        for attr in attr_id:
            rows.append({
                'Attr_id': attr,
                'top_n_goal_id': goal_no,
                'top_n_target_id': target_id,
                'top_n_indicator_id': indicator_id
            })

    all_goals = pd.DataFrame(rows)
    
    goal_name = dict(zip(sdg_df['Goal No.'], sdg_df['Goal']))
    target_name = dict(zip(sdg_df['Target_id'], sdg_df['Targets']))
    indicator_name = dict(zip(sdg_df['Indicator_id'], sdg_df['Tentative Indicators']))
    
    all_goals['Goal'] = all_goals['top_n_goal_id'].map(goal_name.get)
    all_goals['Targets'] = all_goals['top_n_target_id'].map(target_name.get)
    all_goals['Tentative Indicators'] = all_goals['top_n_indicator_id'].map(indicator_name.get)
    chapter_id = dict(zip(chapter_df['Attr_id'], chapter_df['Chapter_id']))
    chapter_name = dict(zip(chapter_df['Chapter_id'], chapter_df['Chapter_name']))
    table_id = dict(zip(chapter_df['Attr_id'], chapter_df['Table_id']))
    table_name = dict(zip(chapter_df['Table_id'], chapter_df['Table_name']))
    description = dict(zip(chapter_df['Attr_id'], chapter_df['Description']))
    all_goals['Chapter_id'] = all_goals['Attr_id'].map(chapter_id.get)
    all_goals['Chapter_name'] = all_goals['Chapter_id'].map(chapter_name.get)
    all_goals['Table_id'] = all_goals['Attr_id'].map(table_id.get)
    all_goals['Table_name'] = all_goals['Table_id'].map(table_name.get)
    all_goals['Description'] = all_goals['Attr_id'].map(description.get) 
    
                
    all_goals = all_goals.loc[:,['Attr_id','Chapter_id', 'Chapter_name', 'Table_id', 'Table_name', 'Description', 'Goal', 'Targets', 'Tentative Indicators', 'top_n_goal_id', 'top_n_target_id', 'top_n_indicator_id']]

    return all_goals


In [4]:
# Function to combine and remove duplicates from lists while maintaining order
def combine_lists_BC(row):
    list_B = row['top_n_attr_B']
    list_C = row['top_n_attr_C']
    
    # Combine lists and remove duplicates while maintaining order
    items = list(OrderedDict.fromkeys(list_B + list_C))
    return items

In [5]:
# Function to find intersection while maintaining order
def find_intersection_ABC(row):
    intersection = [x for x in row['top_n_attr_A'] if x in row['B_plus_C']]
    return intersection

In [6]:
def find_difference_A(row):
    top_n_attr = row['top_n_attr_A']
    intersection = row['A_and_BC']
    
    # Remove elements in BC and intersection from top_n_attr while preserving order
    difference = [x for x in top_n_attr if x not in intersection]
    
    return difference

In [7]:
# Function to find intersection while maintaining order
def find_intersection_BC(row):
    intersection = [x for x in row['top_n_attr_B'] if x in row['top_n_attr_C']]
    return intersection

In [8]:
def find_difference_B(row):
    top_n_attr = row['top_n_attr_B']
    intersection = set(row['A'] + row['B_and_C'] + row['A_and_BC'])
    
    # Remove elements in BC and intersection from top_n_attr while preserving order
    difference = [x for x in top_n_attr if x not in intersection]
    
    return difference

In [9]:
def find_difference_C(row):
    top_n_attr = row['top_n_attr_C']
    intersection = set(row['A'] + row['B_and_C'] + row['A_and_BC'] + row['B_minus_all'])
    
    # Remove elements in BC and intersection from top_n_attr while preserving order
    difference = [x for x in top_n_attr if x not in intersection]
    
    return difference

In [10]:
def final_rank(row):
    #combined_top_n_attr = sorted(set(list_B + list_C), key=lambda x: (list_B + list_C).index(x))
    items = row['A_and_BC'] + row['A'] + row['B_and_C'] + row['B_minus_all'] + row['C_minus_all']
    item_set = list(OrderedDict.fromkeys(items))    
    return item_set

In [11]:
sdg = pd.read_csv(absolute_path_sdg, sep=",", dtype={'Goal No.': str, 'Target_id': str, 'Indicator_id':str})

In [12]:
sdg

Unnamed: 0,Goal No.,Goal,Nodal Department,Targets,Other Related Major Departments,Tentative Indicators,Target_id,Indicator_id
0,1,End poverty in all its forms everywhere,Rural Development,"By 2030, eradicate extreme poverty for all peo...","Urban Development, Agriculture, Horticulture, ...",Proportion of the population below the interna...,1.1,1.1.1
1,1,End poverty in all its forms everywhere,Rural Development,"By 2030, reduce at least by half the proportio...","Urban Development, Agriculture, Horticulture, ...",Proportion of the population living below the ...,1.2,1.2.1
2,1,End poverty in all its forms everywhere,Rural Development,"By 2030, reduce at least by half the proportio...","Urban Development, Agriculture, Horticulture, ...","Proportion of men, women and children of all a...",1.2,1.2.2
3,1,End poverty in all its forms everywhere,Rural Development,Implement nationally appropriate social protec...,"Urban Development, Agriculture, Horticulture, ...",Percentage of the population covered by social...,1.3,1.3.1
4,1,End poverty in all its forms everywhere,Rural Development,"By 2030, ensure that all men and women, in par...","Urban Development, Agriculture, Horticulture, ...",Proportion of the population living in househo...,1.4,1.4.1
...,...,...,...,...,...,...,...,...
174,16,Strengthen the means of implementation and rev...,Finance,"Promote the development, transfer, disseminati...",Environment and Scientific Technology.,Total amount of approved funding to promote th...,16.4,16.4.1
175,16,Strengthen the means of implementation and rev...,Finance,Fully operationalize the technology bank and s...,"Environment and Scientific Technology, Inform...",Proportion of individuals using the Internet.,16.5,16.5.1
176,17,"Data, monitoring and accountability",Finance,"By 2020, enhance capacity-building support to ...","Planning, Finance, Economic and Statistics.",Proportion of sustainable development indicato...,17.1,17.1.1
177,17,"Data, monitoring and accountability",Finance,"By 2030, build on existing initiatives to deve...","Planning, Finance, Economic and Statistics.",Dollar value of all resources made available t...,17.2,17.2.1


In [13]:
sdg.drop(['Nodal Department','Other Related Major Departments'],axis=1,inplace=True)

In [14]:
# Remove duplicate rows
sdg = sdg.drop_duplicates()

In [15]:
sdg.reset_index(inplace = True, drop = True)

In [16]:
data = pd.read_csv(absolute_path_attr, sep=";", dtype={'Attr_id': str, 'Chapter_id': str, 'Table_id':str})

In [17]:
data_new = data.copy()
data_new

Unnamed: 0,Attr_id,Chapter_id,Chapter_name,Table_id,Table_name,Description
0,3,1,General Information,1.1,Taluks Hoblies Grama Panchayath Village Accoun...,Taluks
1,4,1,General Information,1.1,Taluks Hoblies Grama Panchayath Village Accoun...,Hoblies
2,5,1,General Information,1.1,Taluks Hoblies Grama Panchayath Village Accoun...,Grama Panchayaths
3,6,1,General Information,1.1,Taluks Hoblies Grama Panchayath Village Accoun...,Circles
4,7,1,General Information,1.2,Inhabited Un-inhabited and Total Villages per ...,"Un-Inhabited Inhabited,Villages"
...,...,...,...,...,...,...
1548,1558,17,Other Information,17.3,Registration of Births and Deaths,"Total,Births,Late Registered events 2019"
1549,1559,17,Other Information,17.3,Registration of Births and Deaths,"Male,Deaths,Late Registered events 2019"
1550,1560,17,Other Information,17.3,Registration of Births and Deaths,"Female,Deaths,Late Registered events 2019"
1551,1561,17,Other Information,17.3,Registration of Births and Deaths,"Transgender,Deaths,Late Registered events 2019"


In [18]:
sdg_string = pd.read_csv(absolute_path_string, sep=";", dtype={'Goal No.': str, 'Target_id': str, 'Indicator_id':str}, converters={'top_n_attr': pd.eval, 'top_n_table_id': pd.eval, 'top_n_chapter_id': pd.eval,'top_n_similarities': pd.eval})
sdg_string

Unnamed: 0,Goal No.,Goal,Targets,Tentative Indicators,Target_id,Indicator_id,new_indicators,new_indicators_str,top_n_count,top_n_attr,top_n_table_id,top_n_table_name,top_n_chapter_id,top_n_chapter_name,top_n_similarities,top_n_description
0,1,End poverty in all its forms everywhere,"By 2030, eradicate extreme poverty for all peo...",Proportion of the population below the interna...,1.1,1.1.1,"['population', 'international', 'poverty', 'li...",proportion of the population below the interna...,598,"[24, 25, 27, 28, 30, 31, 33, 34, 36, 37, 39, 4...","[1.6, 1.6, 1.6, 1.6, 1.6, 1.6, 1.6, 1.6, 1.6, ...",['Ration Shops and Ration Card (Below poverty ...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","['General Information', 'General Information',...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","['Urban,Ration shops, Ration Shops and Ration ..."
1,1,End poverty in all its forms everywhere,"By 2030, reduce at least by half the proportio...",Proportion of the population living below the ...,1.2,1.2.1,"['population', 'living', 'national', 'poverty'...",proportion of the population living below the ...,521,"[1526, 1529, 1534, 24, 25, 26, 27, 28, 29, 30,...","[17.2, 17.2, 17.2, 1.6, 1.6, 1.6, 1.6, 1.6, 1....",['Number of Pensioners under Social Security S...,"[17, 17, 17, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","['Other Information', 'Other Information', 'Ot...","[4, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...",['Bank Post Office Account (Below poverty line...
2,1,End poverty in all its forms everywhere,"By 2030, reduce at least by half the proportio...","Proportion of men, women and children of all a...",1.2,1.2.2,"['men', 'woman', 'child', 'age', 'living', 'po...",proportion of men women and children of all ag...,350,"[1526, 1064, 1065, 1066, 1067, 1068, 1069, 107...","[17.2, 10.8, 10.8, 10.8, 10.8, 10.8, 10.8, 10....",['Number of Pensioners under Social Security S...,"[17, 10, 10, 10, 10, 10, 10, 10, 13, 13, 13, 1...","['Other Information', 'Education', 'Education'...","[3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...",['Bank Post Office Account (Below poverty line...
3,1,End poverty in all its forms everywhere,Implement nationally appropriate social protec...,Percentage of the population covered by social...,1.3,1.3.1,"['population', 'covered', 'social', 'protectio...",percentage of the population covered by social...,430,"[117, 118, 119, 120, 121, 122, 123, 1363, 1525...","[2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 13.5, 17.2...","['Child Sex Ratio 0-6 Years', 'Child Sex Ratio...","[2, 2, 2, 2, 2, 2, 2, 13, 17, 17, 17, 2, 2, 2,...","['Area and Population', 'Area and Population',...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, ...","['Total,Child Sex Ratio 0-6 Years 2001, Child ..."
4,1,End poverty in all its forms everywhere,"By 2030, ensure that all men and women, in par...",Proportion of the population living in househo...,1.4,1.4.1,"['population', 'living', 'household', 'access'...",proportion of the population living in househo...,357,"[70, 71, 72, 73, 74, 75, 76, 77, 278, 279, 280...","[2.1, 2.1, 2.1, 2.1, 2.1, 2.1, 2.1, 2.1, 2.19,...",['Area Population Density and Households 2011 ...,"[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","['Area and Population', 'Area and Population',...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","['Geograpical Area Sq.Kms, Area Population Den..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174,16,Strengthen the means of implementation and rev...,"Promote the development, transfer, disseminati...",Total amount of approved funding to promote th...,16.4,16.4.1,"['amount', 'approved', 'funding', 'promote', '...",total amount of approved funding to promote th...,79,"[580, 581, 582, 583, 584, 585, 586, 587, 588, ...","[4.18, 4.18, 4.18, 4.18, 4.18, 4.18, 4.18, 4.1...","['Sericulture 2019-20', 'Sericulture 2019-20',...","[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 13, 13, 13, ...","['Agriculture, Horticulture & Sericulture', 'A...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","['Sc,Farmers and reelers benefited under Seric..."
175,16,Strengthen the means of implementation and rev...,Fully operationalize the technology bank and s...,Proportion of individuals using the Internet.,16.5,16.5.1,"['individual', 'using', 'internet']",proportion of individuals using the internet,3,"[1022, 1383, 1384]","[9.4, 14.2, 14.2]",['Post Offices Telephone Exchanges and Telepho...,"[9, 14, 14]","['Transport and Communication', 'Rural Develop...","[1, 1, 1]","['Internet Connections,Communications in Numbe..."
176,17,"Data, monitoring and accountability","By 2020, enhance capacity-building support to ...",Proportion of sustainable development indicato...,17.1,17.1.1,"['sustainable', 'development', 'indicator', 'p...",proportion of sustainable development indicato...,99,"[1342, 1353, 42, 46, 50, 54, 58, 62, 466, 550,...","[13.1, 13.3, 1.7, 1.7, 1.8, 1.8, 1.9, 1.9, 4.6...",['Stree Shakthi self help groups members till ...,"[13, 13, 1, 1, 1, 1, 1, 1, 4, 4, 4, 4, 4, 4, 4...","['Women & Child Development', 'Women & Child D...","[2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","['% of Self help groups to state total, Stree ..."
177,17,"Data, monitoring and accountability","By 2030, build on existing initiatives to deve...",Dollar value of all resources made available t...,17.2,17.2.1,"['dollar', 'value', 'resource', 'made', 'avail...",dollar value of all resources made available t...,14,"[23, 337, 338, 339, 340, 540, 541, 542, 543, 5...","[1.5, 4.1, 4.1, 4.1, 4.1, 4.15, 4.15, 4.15, 4....","['No. of Fire Stations on 31-3-2020', 'Land Ut...","[1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5]","['General Information', 'Agriculture, Horticul...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",['Value of the property protected during 2019-...


In [19]:
sdg_sim = pd.read_csv(absolute_path_semantic, sep=";", dtype={'Goal No.': str, 'Target_id': str, 'Indicator_id':str}, converters={'top_n_attr': pd.eval, 'top_n_table_id': pd.eval, 'top_n_chapter_id': pd.eval,'top_n_similarities': pd.eval})
sdg_sim

Unnamed: 0,Goal No.,Goal,Targets,Tentative Indicators,Target_id,Indicator_id,new_indicators,new_indicators_str,keywords,vec,top_n_index,min_values,top_n_count,top_n_attr,top_n_chapter_id,top_n_table_id,top_n_description,top_n_chapter_name,top_n_table_name
0,1,End poverty in all its forms everywhere,"By 2030, eradicate extreme poverty for all peo...",Proportion of the population below the interna...,1.1,1.1.1,"['population', 'international', 'poverty', 'li...",proportion of the population below the interna...,"[['poverty', 'line'], ['population'], ['age', ...",[[-0.075216 0.47459 0.263355 ... -0.1...,"[67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 22, 2...","[0.8120515928906179, 1.0000000000000009, 0.794...",57,"[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 25, 2...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, ...","[2.1, 2.1, 2.1, 2.1, 2.1, 2.1, 2.1, 2.1, 2.1, ...","['Geograpical Area Sq.Kms, Area Population Den...","['Area and Population', 'Area and Population',...",['Area Population Density and Households 2011 ...
1,1,End poverty in all its forms everywhere,"By 2030, reduce at least by half the proportio...",Proportion of the population living below the ...,1.2,1.2.1,"['population', 'living', 'national', 'poverty'...",proportion of the population living below the ...,"[['living', 'national'], ['poverty', 'line'], ...",[[-0.219163 0.36556406 0.0332955 ... 0.1...,"[67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 21, 2...","[0.7903468200598074, 0.8120515928906179, 0.794...",50,"[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 24, 2...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, ...","[2.1, 2.1, 2.1, 2.1, 2.1, 2.1, 2.1, 2.1, 2.1, ...","['Geograpical Area Sq.Kms, Area Population Den...","['Area and Population', 'Area and Population',...",['Area Population Density and Households 2011 ...
2,1,End poverty in all its forms everywhere,"By 2030, reduce at least by half the proportio...","Proportion of men, women and children of all a...",1.2,1.2.2,"['men', 'woman', 'child', 'age', 'living', 'po...",proportion of men women and children of all ag...,"[['according', 'national'], ['children', 'ages...",[[-0.173088 0.43849001 0.0401875 ... 0.1...,"[996, 1366, 1367, 1379, 1380, 1381, 1382, 1383...","[0.8887401699967143, 0.6779166506008225, 0.885...",59,"[1006, 1376, 1377, 1389, 1390, 1391, 1392, 139...","[9, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1, 1, ...","[9.2, 14.1, 14.1, 14.3, 14.3, 14.3, 14.3, 14.3...","['National Highway, P.w.d Road Length on 31-03...","['Transport and Communication', 'Rural Develop...",['P.w.d Road Length on 31-03-2020 In Kms. Cumu...
3,1,End poverty in all its forms everywhere,Implement nationally appropriate social protec...,Percentage of the population covered by social...,1.3,1.3.1,"['population', 'covered', 'social', 'protectio...",percentage of the population covered by social...,"[['age', 'persons'], ['children'], ['populatio...",[[-0.562305 0.03955675 0.20651501 ... 0.0...,"[1214, 1215, 1216, 1217, 1218, 1219, 1220, 122...","[0.8303263734472492, 0.7450329169830372, 0.835...",58,"[1224, 1225, 1226, 1227, 1228, 1229, 1230, 123...","[12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 2, 2,...","[12.1, 12.1, 12.1, 12.1, 12.1, 12.1, 12.1, 12....","['Nos,Boys Hostels,Government Pre-Matric, Sche...","['Social Welfare', 'Social Welfare', 'Social W...",['Scheduled Castes Students Hostels on 31-3-20...
4,1,End poverty in all its forms everywhere,"By 2030, ensure that all men and women, in par...",Proportion of the population living in househo...,1.4,1.4.1,"['population', 'living', 'household', 'access'...",proportion of the population living in househo...,"[['basic', 'services'], ['population'], ['livi...",[[-0.65325001 -0.41144501 -0.0148 ... -0.4...,"[67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 1001,...","[0.6732416697190915, 1.0000000000000009, 0.524...",56,"[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 1011,...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 9, 9, 9, 9, 9, ...","[2.1, 2.1, 2.1, 2.1, 2.1, 2.1, 2.1, 2.1, 2.1, ...","['Geograpical Area Sq.Kms, Area Population Den...","['Area and Population', 'Area and Population',...",['Area Population Density and Households 2011 ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174,16,Strengthen the means of implementation and rev...,"Promote the development, transfer, disseminati...",Total amount of approved funding to promote th...,16.4,16.4.1,"['amount', 'approved', 'funding', 'promote', '...",total amount of approved funding to promote th...,"[['development'], ['sound', 'technologies'], [...",[[-0.10137 -0.27173999 -0.31147 ... -0.0...,"[570, 571, 572, 573, 574, 575, 576, 577, 1331,...","[1.0000000000000009, 0.47943460610968414, 0.56...",39,"[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[4, 4, 4, 4, 4, 4, 4, 4, 13, 13, 4, 4, 4, 4, 1...","[4.18, 4.18, 4.18, 4.18, 4.18, 4.18, 4.18, 4.1...","['Sc,Farmers and reelers benefited under Seric...","['Agriculture, Horticulture & Sericulture', 'A...","['Sericulture 2019-20', 'Sericulture 2019-20',..."
175,16,Strengthen the means of implementation and rev...,Fully operationalize the technology bank and s...,Proportion of individuals using the Internet.,16.5,16.5.1,"['individual', 'using', 'internet']",proportion of individuals using the internet,"[['individual'], ['using'], ['internet']]",[[-5.91870010e-01 4.99570012e-01 -1.01269998e...,"[1373, 1374, 1012, 1421, 1422, 1423, 1424, 142...","[0.5072592691296408, 0.5684845828500003, 0.567...",30,"[1383, 1384, 1022, 1431, 1432, 1433, 1434, 143...","[14, 14, 9, 15, 15, 15, 15, 15, 15, 15, 15, 9,...","[14.2, 14.2, 9.4, 15.4, 15.4, 15.4, 15.4, 15.4...","['2019-20 Constructed during 2019-20,Individua...","['Rural Development and Panchayat Raj', 'Rural...","['Swachh Bharat Mission Rural', 'Swachh Bharat..."
176,17,"Data, monitoring and accountability","By 2020, enhance capacity-building support to ...",Proportion of sustainable development indicato...,17.1,17.1.1,"['sustainable', 'development', 'indicator', 'p...",proportion of sustainable development indicato...,"[['official', 'statistics'], ['level'], ['targ...",[[-0.104375 0.33179498 0.35639401 ... -0.0...,"[570, 571, 572, 573, 574, 575, 576, 577, 1331,...","[0.5132201530035716, 0.5584521024536246, 0.456...",53,"[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[4, 4, 4, 4, 4, 4, 4, 4, 13, 13, 5, 13, 13, 13...","[4.18, 4.18, 4.18, 4.18, 4.18, 4.18, 4.18, 4.1...","['Sc,Farmers and reelers benefited under Seric...","['Agriculture, Horticulture & Sericulture', 'A...","['Sericulture 2019-20', 'Sericulture 2019-20',..."
177,17,"Data, monitoring and accountability","By 2030, build on existing initiatives to deve...",Dollar value of all resources made available t...,17.2,17.2.1,"['dollar', 'value', 'resource', 'made', 'avail...",dollar value of all resources made available t...,"[['dollar', 'value'], ['statistical', 'capacit...",[[-0.32824999 0.2354125 -0.146439 ... -0.2...,"[20, 568, 327, 328, 329, 330, 530, 531, 532, 5...","[0.5832998005673143, 0.46257815494572907, 0.56...",51,"[23, 578, 337, 338, 339, 340, 540, 541, 542, 5...","[1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 1, ...","[1.5, 4.18, 4.1, 4.1, 4.1, 4.1, 4.15, 4.15, 4....",['Value of the property protected during 2019-...,"['General Information', 'Agriculture, Horticul...","['No. of Fire Stations on 31-3-2020', 'Sericul..."


In [20]:
sdg_threshold = pd.read_csv(absolute_path_sem_threshold, dtype={'Goal No.': str, 'Target_id': str, 'Indicator_id':str}, sep=";", converters={'top_n_attr': pd.eval, 'top_n_table_id': pd.eval, 'top_n_chapter_id': pd.eval,'top_n_similarities': pd.eval})
sdg_threshold

Unnamed: 0,Goal No.,Goal,Targets,Tentative Indicators,Target_id,Indicator_id,new_indicators,new_indicators_str,keywords,vec,top_n_index,top_n_count,top_n_attr,top_n_chapter_id,top_n_table_id,top_n_description,top_n_chapter_name,top_n_table_name
0,1,End poverty in all its forms everywhere,"By 2030, eradicate extreme poverty for all peo...",Proportion of the population below the interna...,1.1,1.1.1,"['population', 'international', 'poverty', 'li...",proportion of the population below the interna...,"[['poverty', 'line'], ['population'], ['age', ...",[[-0.075216 0.47459 0.263355 ... -0.1...,"[67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 7...",608,"[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[2.1, 2.1, 2.1, 2.1, 2.1, 2.1, 2.1, 2.1, 2.1, ...","['Geograpical Area Sq.Kms, Area Population Den...","['Area and Population', 'Area and Population',...",['Area Population Density and Households 2011 ...
1,1,End poverty in all its forms everywhere,"By 2030, reduce at least by half the proportio...",Proportion of the population living below the ...,1.2,1.2.1,"['population', 'living', 'national', 'poverty'...",proportion of the population living below the ...,"[['living', 'national'], ['poverty', 'line'], ...",[[-0.219163 0.36556406 0.0332955 ... 0.1...,"[67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 7...",520,"[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[2.1, 2.1, 2.1, 2.1, 2.1, 2.1, 2.1, 2.1, 2.1, ...","['Geograpical Area Sq.Kms, Area Population Den...","['Area and Population', 'Area and Population',...",['Area Population Density and Households 2011 ...
2,1,End poverty in all its forms everywhere,"By 2030, reduce at least by half the proportio...","Proportion of men, women and children of all a...",1.2,1.2.2,"['men', 'woman', 'child', 'age', 'living', 'po...",proportion of men women and children of all ag...,"[['according', 'national'], ['children', 'ages...",[[-0.173088 0.43849001 0.0401875 ... 0.1...,"[996, 1366, 1367, 1379, 1380, 1381, 1382, 1383...",641,"[1006, 1376, 1377, 1389, 1390, 1391, 1392, 139...","[9, 14, 14, 14, 14, 14, 14, 14, 14, 14, 17, 17...","[9.2, 14.1, 14.1, 14.3, 14.3, 14.3, 14.3, 14.3...","['National Highway, P.w.d Road Length on 31-03...","['Transport and Communication', 'Rural Develop...",['P.w.d Road Length on 31-03-2020 In Kms. Cumu...
3,1,End poverty in all its forms everywhere,Implement nationally appropriate social protec...,Percentage of the population covered by social...,1.3,1.3.1,"['population', 'covered', 'social', 'protectio...",percentage of the population covered by social...,"[['age', 'persons'], ['children'], ['populatio...",[[-0.562305 0.03955675 0.20651501 ... 0.0...,"[1214, 1215, 1216, 1217, 1218, 1219, 1220, 122...",705,"[1224, 1225, 1226, 1227, 1228, 1229, 1230, 123...","[12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 1...","[12.1, 12.1, 12.1, 12.1, 12.1, 12.1, 12.1, 12....","['Nos,Boys Hostels,Government Pre-Matric, Sche...","['Social Welfare', 'Social Welfare', 'Social W...",['Scheduled Castes Students Hostels on 31-3-20...
4,1,End poverty in all its forms everywhere,"By 2030, ensure that all men and women, in par...",Proportion of the population living in househo...,1.4,1.4.1,"['population', 'living', 'household', 'access'...",proportion of the population living in househo...,"[['basic', 'services'], ['population'], ['livi...",[[-0.65325001 -0.41144501 -0.0148 ... -0.4...,"[67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 7...",716,"[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[2.1, 2.1, 2.1, 2.1, 2.1, 2.1, 2.1, 2.1, 2.1, ...","['Geograpical Area Sq.Kms, Area Population Den...","['Area and Population', 'Area and Population',...",['Area Population Density and Households 2011 ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174,16,Strengthen the means of implementation and rev...,"Promote the development, transfer, disseminati...",Total amount of approved funding to promote th...,16.4,16.4.1,"['amount', 'approved', 'funding', 'promote', '...",total amount of approved funding to promote th...,"[['development'], ['sound', 'technologies'], [...",[[-0.10137 -0.27173999 -0.31147 ... -0.0...,"[570, 571, 572, 573, 574, 575, 576, 577, 1331,...",348,"[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[4, 4, 4, 4, 4, 4, 4, 4, 13, 13, 13, 13, 13, 1...","[4.18, 4.18, 4.18, 4.18, 4.18, 4.18, 4.18, 4.1...","['Sc,Farmers and reelers benefited under Seric...","['Agriculture, Horticulture & Sericulture', 'A...","['Sericulture 2019-20', 'Sericulture 2019-20',..."
175,16,Strengthen the means of implementation and rev...,Fully operationalize the technology bank and s...,Proportion of individuals using the Internet.,16.5,16.5.1,"['individual', 'using', 'internet']",proportion of individuals using the internet,"[['individual'], ['using'], ['internet']]",[[-5.91870010e-01 4.99570012e-01 -1.01269998e...,"[1373, 1374, 1012, 1421, 1422, 1423, 1424, 142...",58,"[1383, 1384, 1022, 1431, 1432, 1433, 1434, 143...","[14, 14, 9, 15, 15, 15, 15, 15, 15, 15, 15, 9,...","[14.2, 14.2, 9.4, 15.4, 15.4, 15.4, 15.4, 15.4...","['2019-20 Constructed during 2019-20,Individua...","['Rural Development and Panchayat Raj', 'Rural...","['Swachh Bharat Mission Rural', 'Swachh Bharat..."
176,17,"Data, monitoring and accountability","By 2020, enhance capacity-building support to ...",Proportion of sustainable development indicato...,17.1,17.1.1,"['sustainable', 'development', 'indicator', 'p...",proportion of sustainable development indicato...,"[['official', 'statistics'], ['level'], ['targ...",[[-0.104375 0.33179498 0.35639401 ... -0.0...,"[570, 571, 572, 573, 574, 575, 576, 577, 1331,...",336,"[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[4, 4, 4, 4, 4, 4, 4, 4, 13, 13, 13, 13, 13, 1...","[4.18, 4.18, 4.18, 4.18, 4.18, 4.18, 4.18, 4.1...","['Sc,Farmers and reelers benefited under Seric...","['Agriculture, Horticulture & Sericulture', 'A...","['Sericulture 2019-20', 'Sericulture 2019-20',..."
177,17,"Data, monitoring and accountability","By 2030, build on existing initiatives to deve...",Dollar value of all resources made available t...,17.2,17.2.1,"['dollar', 'value', 'resource', 'made', 'avail...",dollar value of all resources made available t...,"[['dollar', 'value'], ['statistical', 'capacit...",[[-0.32824999 0.2354125 -0.146439 ... -0.2...,"[20, 568, 327, 328, 329, 330, 530, 531, 532, 5...",287,"[23, 578, 337, 338, 339, 340, 540, 541, 542, 5...","[1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 1, ...","[1.5, 4.18, 4.1, 4.1, 4.1, 4.1, 4.15, 4.15, 4....",['Value of the property protected during 2019-...,"['General Information', 'Agriculture, Horticul...","['No. of Fire Stations on 31-3-2020', 'Sericul..."


In [21]:
result_sdg_new =sdg_string.copy()
result_sdg_sim_new =sdg_sim.copy()
result_sdg_threshold_new = sdg_threshold.copy()

In [22]:
result_sdg_new = result_sdg_new[['Goal No.', 'Target_id', 'Indicator_id', 'top_n_attr']]

In [23]:
result_sdg_new.head()

Unnamed: 0,Goal No.,Target_id,Indicator_id,top_n_attr
0,1,1.1,1.1.1,"[24, 25, 27, 28, 30, 31, 33, 34, 36, 37, 39, 4..."
1,1,1.2,1.2.1,"[1526, 1529, 1534, 24, 25, 26, 27, 28, 29, 30,..."
2,1,1.2,1.2.2,"[1526, 1064, 1065, 1066, 1067, 1068, 1069, 107..."
3,1,1.3,1.3.1,"[117, 118, 119, 120, 121, 122, 123, 1363, 1525..."
4,1,1.4,1.4.1,"[70, 71, 72, 73, 74, 75, 76, 77, 278, 279, 280..."


In [24]:
result_sdg_sim_new = result_sdg_sim_new[['Goal No.', 'Target_id', 'Indicator_id', 'top_n_attr']]

In [25]:
result_sdg_sim_new.head()

Unnamed: 0,Goal No.,Target_id,Indicator_id,top_n_attr
0,1,1.1,1.1.1,"[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 25, 2..."
1,1,1.2,1.2.1,"[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 24, 2..."
2,1,1.2,1.2.2,"[1006, 1376, 1377, 1389, 1390, 1391, 1392, 139..."
3,1,1.3,1.3.1,"[1224, 1225, 1226, 1227, 1228, 1229, 1230, 123..."
4,1,1.4,1.4.1,"[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 1011,..."


In [26]:
result_sdg_threshold_new = result_sdg_threshold_new[['Goal No.', 'Target_id', 'Indicator_id', 'top_n_attr']]

In [27]:
result_sdg_threshold_new.head()

Unnamed: 0,Goal No.,Target_id,Indicator_id,top_n_attr
0,1,1.1,1.1.1,"[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8..."
1,1,1.2,1.2.1,"[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8..."
2,1,1.2,1.2.2,"[1006, 1376, 1377, 1389, 1390, 1391, 1392, 139..."
3,1,1.3,1.3.1,"[1224, 1225, 1226, 1227, 1228, 1229, 1230, 123..."
4,1,1.4,1.4.1,"[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8..."


In [28]:
result_sdg_new.rename(columns = {'top_n_attr':'top_n_attr_A'}, inplace=True)

In [29]:
result_sdg_new['top_n_attr_B'] = result_sdg_threshold_new.loc[:,'top_n_attr']
result_sdg_new['top_n_attr_C'] = result_sdg_sim_new.loc[:,'top_n_attr']

In [30]:
result_sdg_new

Unnamed: 0,Goal No.,Target_id,Indicator_id,top_n_attr_A,top_n_attr_B,top_n_attr_C
0,1,1.1,1.1.1,"[24, 25, 27, 28, 30, 31, 33, 34, 36, 37, 39, 4...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 25, 2..."
1,1,1.2,1.2.1,"[1526, 1529, 1534, 24, 25, 26, 27, 28, 29, 30,...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 24, 2..."
2,1,1.2,1.2.2,"[1526, 1064, 1065, 1066, 1067, 1068, 1069, 107...","[1006, 1376, 1377, 1389, 1390, 1391, 1392, 139...","[1006, 1376, 1377, 1389, 1390, 1391, 1392, 139..."
3,1,1.3,1.3.1,"[117, 118, 119, 120, 121, 122, 123, 1363, 1525...","[1224, 1225, 1226, 1227, 1228, 1229, 1230, 123...","[1224, 1225, 1226, 1227, 1228, 1229, 1230, 123..."
4,1,1.4,1.4.1,"[70, 71, 72, 73, 74, 75, 76, 77, 278, 279, 280...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 1011,..."
...,...,...,...,...,...,...
174,16,16.4,16.4.1,"[580, 581, 582, 583, 584, 585, 586, 587, 588, ...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,..."
175,16,16.5,16.5.1,"[1022, 1383, 1384]","[1383, 1384, 1022, 1431, 1432, 1433, 1434, 143...","[1383, 1384, 1022, 1431, 1432, 1433, 1434, 143..."
176,17,17.1,17.1.1,"[1342, 1353, 42, 46, 50, 54, 58, 62, 466, 550,...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,..."
177,17,17.2,17.2.1,"[23, 337, 338, 339, 340, 540, 541, 542, 543, 5...","[23, 578, 337, 338, 339, 340, 540, 541, 542, 5...","[23, 578, 337, 338, 339, 340, 540, 541, 542, 5..."


In [31]:
# Apply the function to the merged dataframe
result_sdg_new['B_plus_C'] = result_sdg_new.apply(combine_lists_BC, axis=1)
result_sdg_new

Unnamed: 0,Goal No.,Target_id,Indicator_id,top_n_attr_A,top_n_attr_B,top_n_attr_C,B_plus_C
0,1,1.1,1.1.1,"[24, 25, 27, 28, 30, 31, 33, 34, 36, 37, 39, 4...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 25, 2...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8..."
1,1,1.2,1.2.1,"[1526, 1529, 1534, 24, 25, 26, 27, 28, 29, 30,...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 24, 2...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8..."
2,1,1.2,1.2.2,"[1526, 1064, 1065, 1066, 1067, 1068, 1069, 107...","[1006, 1376, 1377, 1389, 1390, 1391, 1392, 139...","[1006, 1376, 1377, 1389, 1390, 1391, 1392, 139...","[1006, 1376, 1377, 1389, 1390, 1391, 1392, 139..."
3,1,1.3,1.3.1,"[117, 118, 119, 120, 121, 122, 123, 1363, 1525...","[1224, 1225, 1226, 1227, 1228, 1229, 1230, 123...","[1224, 1225, 1226, 1227, 1228, 1229, 1230, 123...","[1224, 1225, 1226, 1227, 1228, 1229, 1230, 123..."
4,1,1.4,1.4.1,"[70, 71, 72, 73, 74, 75, 76, 77, 278, 279, 280...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 1011,...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8..."
...,...,...,...,...,...,...,...
174,16,16.4,16.4.1,"[580, 581, 582, 583, 584, 585, 586, 587, 588, ...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,..."
175,16,16.5,16.5.1,"[1022, 1383, 1384]","[1383, 1384, 1022, 1431, 1432, 1433, 1434, 143...","[1383, 1384, 1022, 1431, 1432, 1433, 1434, 143...","[1383, 1384, 1022, 1431, 1432, 1433, 1434, 143..."
176,17,17.1,17.1.1,"[1342, 1353, 42, 46, 50, 54, 58, 62, 466, 550,...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,..."
177,17,17.2,17.2.1,"[23, 337, 338, 339, 340, 540, 541, 542, 543, 5...","[23, 578, 337, 338, 339, 340, 540, 541, 542, 5...","[23, 578, 337, 338, 339, 340, 540, 541, 542, 5...","[23, 578, 337, 338, 339, 340, 540, 541, 542, 5..."


In [32]:
# Apply the function to the merged dataframe
result_sdg_new['A_and_BC'] = result_sdg_new.apply(find_intersection_ABC, axis=1)
result_sdg_new

Unnamed: 0,Goal No.,Target_id,Indicator_id,top_n_attr_A,top_n_attr_B,top_n_attr_C,B_plus_C,A_and_BC
0,1,1.1,1.1.1,"[24, 25, 27, 28, 30, 31, 33, 34, 36, 37, 39, 4...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 25, 2...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8...","[24, 25, 27, 28, 30, 31, 33, 34, 36, 37, 39, 4..."
1,1,1.2,1.2.1,"[1526, 1529, 1534, 24, 25, 26, 27, 28, 29, 30,...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 24, 2...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8...","[1526, 1529, 1534, 24, 25, 26, 27, 28, 29, 30,..."
2,1,1.2,1.2.2,"[1526, 1064, 1065, 1066, 1067, 1068, 1069, 107...","[1006, 1376, 1377, 1389, 1390, 1391, 1392, 139...","[1006, 1376, 1377, 1389, 1390, 1391, 1392, 139...","[1006, 1376, 1377, 1389, 1390, 1391, 1392, 139...","[1526, 1064, 1065, 1066, 1067, 1068, 1069, 107..."
3,1,1.3,1.3.1,"[117, 118, 119, 120, 121, 122, 123, 1363, 1525...","[1224, 1225, 1226, 1227, 1228, 1229, 1230, 123...","[1224, 1225, 1226, 1227, 1228, 1229, 1230, 123...","[1224, 1225, 1226, 1227, 1228, 1229, 1230, 123...","[117, 118, 119, 120, 121, 122, 123, 1363, 1525..."
4,1,1.4,1.4.1,"[70, 71, 72, 73, 74, 75, 76, 77, 278, 279, 280...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 1011,...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8...","[70, 71, 72, 73, 74, 75, 76, 77, 278, 279, 280..."
...,...,...,...,...,...,...,...,...
174,16,16.4,16.4.1,"[580, 581, 582, 583, 584, 585, 586, 587, 588, ...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[580, 581, 582, 583, 584, 585, 586, 587, 588, ..."
175,16,16.5,16.5.1,"[1022, 1383, 1384]","[1383, 1384, 1022, 1431, 1432, 1433, 1434, 143...","[1383, 1384, 1022, 1431, 1432, 1433, 1434, 143...","[1383, 1384, 1022, 1431, 1432, 1433, 1434, 143...","[1022, 1383, 1384]"
176,17,17.1,17.1.1,"[1342, 1353, 42, 46, 50, 54, 58, 62, 466, 550,...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[1342, 1353, 42, 46, 50, 54, 58, 62, 580, 581,..."
177,17,17.2,17.2.1,"[23, 337, 338, 339, 340, 540, 541, 542, 543, 5...","[23, 578, 337, 338, 339, 340, 540, 541, 542, 5...","[23, 578, 337, 338, 339, 340, 540, 541, 542, 5...","[23, 578, 337, 338, 339, 340, 540, 541, 542, 5...","[23, 337, 338, 339, 340, 540, 541, 542, 543, 5..."


In [33]:
# Apply the function to the merged dataframe
result_sdg_new['A'] = result_sdg_new.apply(find_difference_A, axis=1)
result_sdg_new

Unnamed: 0,Goal No.,Target_id,Indicator_id,top_n_attr_A,top_n_attr_B,top_n_attr_C,B_plus_C,A_and_BC,A
0,1,1.1,1.1.1,"[24, 25, 27, 28, 30, 31, 33, 34, 36, 37, 39, 4...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 25, 2...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8...","[24, 25, 27, 28, 30, 31, 33, 34, 36, 37, 39, 4...",[1010]
1,1,1.2,1.2.1,"[1526, 1529, 1534, 24, 25, 26, 27, 28, 29, 30,...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 24, 2...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8...","[1526, 1529, 1534, 24, 25, 26, 27, 28, 29, 30,...",[1010]
2,1,1.2,1.2.2,"[1526, 1064, 1065, 1066, 1067, 1068, 1069, 107...","[1006, 1376, 1377, 1389, 1390, 1391, 1392, 139...","[1006, 1376, 1377, 1389, 1390, 1391, 1392, 139...","[1006, 1376, 1377, 1389, 1390, 1391, 1392, 139...","[1526, 1064, 1065, 1066, 1067, 1068, 1069, 107...","[1181, 1182, 1183, 1184, 1185, 1186, 1187]"
3,1,1.3,1.3.1,"[117, 118, 119, 120, 121, 122, 123, 1363, 1525...","[1224, 1225, 1226, 1227, 1228, 1229, 1230, 123...","[1224, 1225, 1226, 1227, 1228, 1229, 1230, 123...","[1224, 1225, 1226, 1227, 1228, 1229, 1230, 123...","[117, 118, 119, 120, 121, 122, 123, 1363, 1525...","[1168, 1169, 1379]"
4,1,1.4,1.4.1,"[70, 71, 72, 73, 74, 75, 76, 77, 278, 279, 280...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 1011,...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8...","[70, 71, 72, 73, 74, 75, 76, 77, 278, 279, 280...","[1383, 1384, 1395]"
...,...,...,...,...,...,...,...,...,...
174,16,16.4,16.4.1,"[580, 581, 582, 583, 584, 585, 586, 587, 588, ...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[580, 581, 582, 583, 584, 585, 586, 587, 588, ...",[]
175,16,16.5,16.5.1,"[1022, 1383, 1384]","[1383, 1384, 1022, 1431, 1432, 1433, 1434, 143...","[1383, 1384, 1022, 1431, 1432, 1433, 1434, 143...","[1383, 1384, 1022, 1431, 1432, 1433, 1434, 143...","[1022, 1383, 1384]",[]
176,17,17.1,17.1.1,"[1342, 1353, 42, 46, 50, 54, 58, 62, 466, 550,...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[1342, 1353, 42, 46, 50, 54, 58, 62, 580, 581,...","[466, 550, 567, 578, 809, 901, 905, 912, 1005,..."
177,17,17.2,17.2.1,"[23, 337, 338, 339, 340, 540, 541, 542, 543, 5...","[23, 578, 337, 338, 339, 340, 540, 541, 542, 5...","[23, 578, 337, 338, 339, 340, 540, 541, 542, 5...","[23, 578, 337, 338, 339, 340, 540, 541, 542, 5...","[23, 337, 338, 339, 340, 540, 541, 542, 543, 5...",[]


In [34]:
# Apply the function to the merged dataframe
result_sdg_new['B_and_C'] = result_sdg_new.apply(find_intersection_BC, axis=1)
result_sdg_new

Unnamed: 0,Goal No.,Target_id,Indicator_id,top_n_attr_A,top_n_attr_B,top_n_attr_C,B_plus_C,A_and_BC,A,B_and_C
0,1,1.1,1.1.1,"[24, 25, 27, 28, 30, 31, 33, 34, 36, 37, 39, 4...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 25, 2...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8...","[24, 25, 27, 28, 30, 31, 33, 34, 36, 37, 39, 4...",[1010],"[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 25, 2..."
1,1,1.2,1.2.1,"[1526, 1529, 1534, 24, 25, 26, 27, 28, 29, 30,...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 24, 2...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8...","[1526, 1529, 1534, 24, 25, 26, 27, 28, 29, 30,...",[1010],"[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 98, 9..."
2,1,1.2,1.2.2,"[1526, 1064, 1065, 1066, 1067, 1068, 1069, 107...","[1006, 1376, 1377, 1389, 1390, 1391, 1392, 139...","[1006, 1376, 1377, 1389, 1390, 1391, 1392, 139...","[1006, 1376, 1377, 1389, 1390, 1391, 1392, 139...","[1526, 1064, 1065, 1066, 1067, 1068, 1069, 107...","[1181, 1182, 1183, 1184, 1185, 1186, 1187]","[1006, 1376, 1377, 1389, 1390, 1391, 1392, 139..."
3,1,1.3,1.3.1,"[117, 118, 119, 120, 121, 122, 123, 1363, 1525...","[1224, 1225, 1226, 1227, 1228, 1229, 1230, 123...","[1224, 1225, 1226, 1227, 1228, 1229, 1230, 123...","[1224, 1225, 1226, 1227, 1228, 1229, 1230, 123...","[117, 118, 119, 120, 121, 122, 123, 1363, 1525...","[1168, 1169, 1379]","[1224, 1225, 1226, 1227, 1228, 1229, 1230, 123..."
4,1,1.4,1.4.1,"[70, 71, 72, 73, 74, 75, 76, 77, 278, 279, 280...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 1011,...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8...","[70, 71, 72, 73, 74, 75, 76, 77, 278, 279, 280...","[1383, 1384, 1395]","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 83, 9..."
...,...,...,...,...,...,...,...,...,...,...
174,16,16.4,16.4.1,"[580, 581, 582, 583, 584, 585, 586, 587, 588, ...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[580, 581, 582, 583, 584, 585, 586, 587, 588, ...",[],"[580, 581, 582, 583, 584, 585, 586, 587, 1341,..."
175,16,16.5,16.5.1,"[1022, 1383, 1384]","[1383, 1384, 1022, 1431, 1432, 1433, 1434, 143...","[1383, 1384, 1022, 1431, 1432, 1433, 1434, 143...","[1383, 1384, 1022, 1431, 1432, 1433, 1434, 143...","[1022, 1383, 1384]",[],"[1383, 1384, 1022, 1431, 1432, 1433, 1434, 143..."
176,17,17.1,17.1.1,"[1342, 1353, 42, 46, 50, 54, 58, 62, 466, 550,...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[1342, 1353, 42, 46, 50, 54, 58, 62, 580, 581,...","[466, 550, 567, 578, 809, 901, 905, 912, 1005,...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,..."
177,17,17.2,17.2.1,"[23, 337, 338, 339, 340, 540, 541, 542, 543, 5...","[23, 578, 337, 338, 339, 340, 540, 541, 542, 5...","[23, 578, 337, 338, 339, 340, 540, 541, 542, 5...","[23, 578, 337, 338, 339, 340, 540, 541, 542, 5...","[23, 337, 338, 339, 340, 540, 541, 542, 543, 5...",[],"[23, 578, 337, 338, 339, 340, 540, 541, 542, 5..."


In [35]:
# Apply the function to the merged dataframe
result_sdg_new['B_minus_all'] = result_sdg_new.apply(find_difference_B, axis=1)
result_sdg_new

Unnamed: 0,Goal No.,Target_id,Indicator_id,top_n_attr_A,top_n_attr_B,top_n_attr_C,B_plus_C,A_and_BC,A,B_and_C,B_minus_all
0,1,1.1,1.1.1,"[24, 25, 27, 28, 30, 31, 33, 34, 36, 37, 39, 4...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 25, 2...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8...","[24, 25, 27, 28, 30, 31, 33, 34, 36, 37, 39, 4...",[1010],"[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 25, 2...","[1186, 1187, 1157, 1158, 1163, 1164]"
1,1,1.2,1.2.1,"[1526, 1529, 1534, 24, 25, 26, 27, 28, 29, 30,...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 24, 2...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8...","[1526, 1529, 1534, 24, 25, 26, 27, 28, 29, 30,...",[1010],"[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 98, 9...",[]
2,1,1.2,1.2.2,"[1526, 1064, 1065, 1066, 1067, 1068, 1069, 107...","[1006, 1376, 1377, 1389, 1390, 1391, 1392, 139...","[1006, 1376, 1377, 1389, 1390, 1391, 1392, 139...","[1006, 1376, 1377, 1389, 1390, 1391, 1392, 139...","[1526, 1064, 1065, 1066, 1067, 1068, 1069, 107...","[1181, 1182, 1183, 1184, 1185, 1186, 1187]","[1006, 1376, 1377, 1389, 1390, 1391, 1392, 139...","[195, 199, 203, 207, 211, 215, 219, 222, 226, ..."
3,1,1.3,1.3.1,"[117, 118, 119, 120, 121, 122, 123, 1363, 1525...","[1224, 1225, 1226, 1227, 1228, 1229, 1230, 123...","[1224, 1225, 1226, 1227, 1228, 1229, 1230, 123...","[1224, 1225, 1226, 1227, 1228, 1229, 1230, 123...","[117, 118, 119, 120, 121, 122, 123, 1363, 1525...","[1168, 1169, 1379]","[1224, 1225, 1226, 1227, 1228, 1229, 1230, 123...","[594, 598, 602, 606, 610, 614, 618, 622, 627, ..."
4,1,1.4,1.4.1,"[70, 71, 72, 73, 74, 75, 76, 77, 278, 279, 280...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 1011,...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8...","[70, 71, 72, 73, 74, 75, 76, 77, 278, 279, 280...","[1383, 1384, 1395]","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 83, 9...","[15, 18, 19, 20, 22, 23, 24, 25, 26, 32, 35, 3..."
...,...,...,...,...,...,...,...,...,...,...,...
174,16,16.4,16.4.1,"[580, 581, 582, 583, 584, 585, 586, 587, 588, ...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[580, 581, 582, 583, 584, 585, 586, 587, 588, ...",[],"[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[327, 328, 329, 330, 331, 332, 333, 1203, 1204..."
175,16,16.5,16.5.1,"[1022, 1383, 1384]","[1383, 1384, 1022, 1431, 1432, 1433, 1434, 143...","[1383, 1384, 1022, 1431, 1432, 1433, 1434, 143...","[1383, 1384, 1022, 1431, 1432, 1433, 1434, 143...","[1022, 1383, 1384]",[],"[1383, 1384, 1022, 1431, 1432, 1433, 1434, 143...","[42, 43, 44, 46, 41, 45, 1017, 360, 361, 362, ..."
176,17,17.1,17.1.1,"[1342, 1353, 42, 46, 50, 54, 58, 62, 466, 550,...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[1342, 1353, 42, 46, 50, 54, 58, 62, 580, 581,...","[466, 550, 567, 578, 809, 901, 905, 912, 1005,...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[1061, 1062, 1063, 1081, 1082, 1083, 1084, 108..."
177,17,17.2,17.2.1,"[23, 337, 338, 339, 340, 540, 541, 542, 543, 5...","[23, 578, 337, 338, 339, 340, 540, 541, 542, 5...","[23, 578, 337, 338, 339, 340, 540, 541, 542, 5...","[23, 578, 337, 338, 339, 340, 540, 541, 542, 5...","[23, 337, 338, 339, 340, 540, 541, 542, 543, 5...",[],"[23, 578, 337, 338, 339, 340, 540, 541, 542, 5...","[570, 579, 833, 590, 1016, 1017, 3, 4, 5, 6, 7..."


In [36]:
# Apply the function to the merged dataframe
result_sdg_new['C_minus_all'] = result_sdg_new.apply(find_difference_C, axis=1)
result_sdg_new

Unnamed: 0,Goal No.,Target_id,Indicator_id,top_n_attr_A,top_n_attr_B,top_n_attr_C,B_plus_C,A_and_BC,A,B_and_C,B_minus_all,C_minus_all
0,1,1.1,1.1.1,"[24, 25, 27, 28, 30, 31, 33, 34, 36, 37, 39, 4...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 25, 2...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8...","[24, 25, 27, 28, 30, 31, 33, 34, 36, 37, 39, 4...",[1010],"[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 25, 2...","[1186, 1187, 1157, 1158, 1163, 1164]",[]
1,1,1.2,1.2.1,"[1526, 1529, 1534, 24, 25, 26, 27, 28, 29, 30,...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 24, 2...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8...","[1526, 1529, 1534, 24, 25, 26, 27, 28, 29, 30,...",[1010],"[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 98, 9...",[],[]
2,1,1.2,1.2.2,"[1526, 1064, 1065, 1066, 1067, 1068, 1069, 107...","[1006, 1376, 1377, 1389, 1390, 1391, 1392, 139...","[1006, 1376, 1377, 1389, 1390, 1391, 1392, 139...","[1006, 1376, 1377, 1389, 1390, 1391, 1392, 139...","[1526, 1064, 1065, 1066, 1067, 1068, 1069, 107...","[1181, 1182, 1183, 1184, 1185, 1186, 1187]","[1006, 1376, 1377, 1389, 1390, 1391, 1392, 139...","[195, 199, 203, 207, 211, 215, 219, 222, 226, ...",[]
3,1,1.3,1.3.1,"[117, 118, 119, 120, 121, 122, 123, 1363, 1525...","[1224, 1225, 1226, 1227, 1228, 1229, 1230, 123...","[1224, 1225, 1226, 1227, 1228, 1229, 1230, 123...","[1224, 1225, 1226, 1227, 1228, 1229, 1230, 123...","[117, 118, 119, 120, 121, 122, 123, 1363, 1525...","[1168, 1169, 1379]","[1224, 1225, 1226, 1227, 1228, 1229, 1230, 123...","[594, 598, 602, 606, 610, 614, 618, 622, 627, ...",[]
4,1,1.4,1.4.1,"[70, 71, 72, 73, 74, 75, 76, 77, 278, 279, 280...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 1011,...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8...","[70, 71, 72, 73, 74, 75, 76, 77, 278, 279, 280...","[1383, 1384, 1395]","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 83, 9...","[15, 18, 19, 20, 22, 23, 24, 25, 26, 32, 35, 3...",[]
...,...,...,...,...,...,...,...,...,...,...,...,...
174,16,16.4,16.4.1,"[580, 581, 582, 583, 584, 585, 586, 587, 588, ...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[580, 581, 582, 583, 584, 585, 586, 587, 588, ...",[],"[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[327, 328, 329, 330, 331, 332, 333, 1203, 1204...",[]
175,16,16.5,16.5.1,"[1022, 1383, 1384]","[1383, 1384, 1022, 1431, 1432, 1433, 1434, 143...","[1383, 1384, 1022, 1431, 1432, 1433, 1434, 143...","[1383, 1384, 1022, 1431, 1432, 1433, 1434, 143...","[1022, 1383, 1384]",[],"[1383, 1384, 1022, 1431, 1432, 1433, 1434, 143...","[42, 43, 44, 46, 41, 45, 1017, 360, 361, 362, ...",[]
176,17,17.1,17.1.1,"[1342, 1353, 42, 46, 50, 54, 58, 62, 466, 550,...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[1342, 1353, 42, 46, 50, 54, 58, 62, 580, 581,...","[466, 550, 567, 578, 809, 901, 905, 912, 1005,...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[1061, 1062, 1063, 1081, 1082, 1083, 1084, 108...",[]
177,17,17.2,17.2.1,"[23, 337, 338, 339, 340, 540, 541, 542, 543, 5...","[23, 578, 337, 338, 339, 340, 540, 541, 542, 5...","[23, 578, 337, 338, 339, 340, 540, 541, 542, 5...","[23, 578, 337, 338, 339, 340, 540, 541, 542, 5...","[23, 337, 338, 339, 340, 540, 541, 542, 543, 5...",[],"[23, 578, 337, 338, 339, 340, 540, 541, 542, 5...","[570, 579, 833, 590, 1016, 1017, 3, 4, 5, 6, 7...",[]


In [37]:
result_sdg_new.columns

Index(['Goal No.', 'Target_id', 'Indicator_id', 'top_n_attr_A', 'top_n_attr_B',
       'top_n_attr_C', 'B_plus_C', 'A_and_BC', 'A', 'B_and_C', 'B_minus_all',
       'C_minus_all'],
      dtype='object')

In [38]:
# Apply the function to the merged dataframe
result_sdg_new['Rank'] = result_sdg_new.apply(final_rank, axis=1)
result_sdg_new

Unnamed: 0,Goal No.,Target_id,Indicator_id,top_n_attr_A,top_n_attr_B,top_n_attr_C,B_plus_C,A_and_BC,A,B_and_C,B_minus_all,C_minus_all,Rank
0,1,1.1,1.1.1,"[24, 25, 27, 28, 30, 31, 33, 34, 36, 37, 39, 4...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 25, 2...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8...","[24, 25, 27, 28, 30, 31, 33, 34, 36, 37, 39, 4...",[1010],"[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 25, 2...","[1186, 1187, 1157, 1158, 1163, 1164]",[],"[24, 25, 27, 28, 30, 31, 33, 34, 36, 37, 39, 4..."
1,1,1.2,1.2.1,"[1526, 1529, 1534, 24, 25, 26, 27, 28, 29, 30,...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 24, 2...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8...","[1526, 1529, 1534, 24, 25, 26, 27, 28, 29, 30,...",[1010],"[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 98, 9...",[],[],"[1526, 1529, 1534, 24, 25, 26, 27, 28, 29, 30,..."
2,1,1.2,1.2.2,"[1526, 1064, 1065, 1066, 1067, 1068, 1069, 107...","[1006, 1376, 1377, 1389, 1390, 1391, 1392, 139...","[1006, 1376, 1377, 1389, 1390, 1391, 1392, 139...","[1006, 1376, 1377, 1389, 1390, 1391, 1392, 139...","[1526, 1064, 1065, 1066, 1067, 1068, 1069, 107...","[1181, 1182, 1183, 1184, 1185, 1186, 1187]","[1006, 1376, 1377, 1389, 1390, 1391, 1392, 139...","[195, 199, 203, 207, 211, 215, 219, 222, 226, ...",[],"[1526, 1064, 1065, 1066, 1067, 1068, 1069, 107..."
3,1,1.3,1.3.1,"[117, 118, 119, 120, 121, 122, 123, 1363, 1525...","[1224, 1225, 1226, 1227, 1228, 1229, 1230, 123...","[1224, 1225, 1226, 1227, 1228, 1229, 1230, 123...","[1224, 1225, 1226, 1227, 1228, 1229, 1230, 123...","[117, 118, 119, 120, 121, 122, 123, 1363, 1525...","[1168, 1169, 1379]","[1224, 1225, 1226, 1227, 1228, 1229, 1230, 123...","[594, 598, 602, 606, 610, 614, 618, 622, 627, ...",[],"[117, 118, 119, 120, 121, 122, 123, 1363, 1525..."
4,1,1.4,1.4.1,"[70, 71, 72, 73, 74, 75, 76, 77, 278, 279, 280...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 1011,...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8...","[70, 71, 72, 73, 74, 75, 76, 77, 278, 279, 280...","[1383, 1384, 1395]","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 83, 9...","[15, 18, 19, 20, 22, 23, 24, 25, 26, 32, 35, 3...",[],"[70, 71, 72, 73, 74, 75, 76, 77, 278, 279, 280..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
174,16,16.4,16.4.1,"[580, 581, 582, 583, 584, 585, 586, 587, 588, ...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[580, 581, 582, 583, 584, 585, 586, 587, 588, ...",[],"[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[327, 328, 329, 330, 331, 332, 333, 1203, 1204...",[],"[580, 581, 582, 583, 584, 585, 586, 587, 588, ..."
175,16,16.5,16.5.1,"[1022, 1383, 1384]","[1383, 1384, 1022, 1431, 1432, 1433, 1434, 143...","[1383, 1384, 1022, 1431, 1432, 1433, 1434, 143...","[1383, 1384, 1022, 1431, 1432, 1433, 1434, 143...","[1022, 1383, 1384]",[],"[1383, 1384, 1022, 1431, 1432, 1433, 1434, 143...","[42, 43, 44, 46, 41, 45, 1017, 360, 361, 362, ...",[],"[1022, 1383, 1384, 1431, 1432, 1433, 1434, 143..."
176,17,17.1,17.1.1,"[1342, 1353, 42, 46, 50, 54, 58, 62, 466, 550,...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[1342, 1353, 42, 46, 50, 54, 58, 62, 580, 581,...","[466, 550, 567, 578, 809, 901, 905, 912, 1005,...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[1061, 1062, 1063, 1081, 1082, 1083, 1084, 108...",[],"[1342, 1353, 42, 46, 50, 54, 58, 62, 580, 581,..."
177,17,17.2,17.2.1,"[23, 337, 338, 339, 340, 540, 541, 542, 543, 5...","[23, 578, 337, 338, 339, 340, 540, 541, 542, 5...","[23, 578, 337, 338, 339, 340, 540, 541, 542, 5...","[23, 578, 337, 338, 339, 340, 540, 541, 542, 5...","[23, 337, 338, 339, 340, 540, 541, 542, 543, 5...",[],"[23, 578, 337, 338, 339, 340, 540, 541, 542, 5...","[570, 579, 833, 590, 1016, 1017, 3, 4, 5, 6, 7...",[],"[23, 337, 338, 339, 340, 540, 541, 542, 543, 5..."


In [39]:
final_df = result_sdg_new.copy()

In [40]:
final_df['Attr_count'] = final_df['Rank'].apply(lambda x: len(x))

In [41]:
final_df

Unnamed: 0,Goal No.,Target_id,Indicator_id,top_n_attr_A,top_n_attr_B,top_n_attr_C,B_plus_C,A_and_BC,A,B_and_C,B_minus_all,C_minus_all,Rank,Attr_count
0,1,1.1,1.1.1,"[24, 25, 27, 28, 30, 31, 33, 34, 36, 37, 39, 4...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 25, 2...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8...","[24, 25, 27, 28, 30, 31, 33, 34, 36, 37, 39, 4...",[1010],"[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 25, 2...","[1186, 1187, 1157, 1158, 1163, 1164]",[],"[24, 25, 27, 28, 30, 31, 33, 34, 36, 37, 39, 4...",609
1,1,1.2,1.2.1,"[1526, 1529, 1534, 24, 25, 26, 27, 28, 29, 30,...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 24, 2...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8...","[1526, 1529, 1534, 24, 25, 26, 27, 28, 29, 30,...",[1010],"[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 98, 9...",[],[],"[1526, 1529, 1534, 24, 25, 26, 27, 28, 29, 30,...",521
2,1,1.2,1.2.2,"[1526, 1064, 1065, 1066, 1067, 1068, 1069, 107...","[1006, 1376, 1377, 1389, 1390, 1391, 1392, 139...","[1006, 1376, 1377, 1389, 1390, 1391, 1392, 139...","[1006, 1376, 1377, 1389, 1390, 1391, 1392, 139...","[1526, 1064, 1065, 1066, 1067, 1068, 1069, 107...","[1181, 1182, 1183, 1184, 1185, 1186, 1187]","[1006, 1376, 1377, 1389, 1390, 1391, 1392, 139...","[195, 199, 203, 207, 211, 215, 219, 222, 226, ...",[],"[1526, 1064, 1065, 1066, 1067, 1068, 1069, 107...",648
3,1,1.3,1.3.1,"[117, 118, 119, 120, 121, 122, 123, 1363, 1525...","[1224, 1225, 1226, 1227, 1228, 1229, 1230, 123...","[1224, 1225, 1226, 1227, 1228, 1229, 1230, 123...","[1224, 1225, 1226, 1227, 1228, 1229, 1230, 123...","[117, 118, 119, 120, 121, 122, 123, 1363, 1525...","[1168, 1169, 1379]","[1224, 1225, 1226, 1227, 1228, 1229, 1230, 123...","[594, 598, 602, 606, 610, 614, 618, 622, 627, ...",[],"[117, 118, 119, 120, 121, 122, 123, 1363, 1525...",708
4,1,1.4,1.4.1,"[70, 71, 72, 73, 74, 75, 76, 77, 278, 279, 280...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 1011,...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8...","[70, 71, 72, 73, 74, 75, 76, 77, 278, 279, 280...","[1383, 1384, 1395]","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 83, 9...","[15, 18, 19, 20, 22, 23, 24, 25, 26, 32, 35, 3...",[],"[70, 71, 72, 73, 74, 75, 76, 77, 278, 279, 280...",719
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174,16,16.4,16.4.1,"[580, 581, 582, 583, 584, 585, 586, 587, 588, ...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[580, 581, 582, 583, 584, 585, 586, 587, 588, ...",[],"[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[327, 328, 329, 330, 331, 332, 333, 1203, 1204...",[],"[580, 581, 582, 583, 584, 585, 586, 587, 588, ...",348
175,16,16.5,16.5.1,"[1022, 1383, 1384]","[1383, 1384, 1022, 1431, 1432, 1433, 1434, 143...","[1383, 1384, 1022, 1431, 1432, 1433, 1434, 143...","[1383, 1384, 1022, 1431, 1432, 1433, 1434, 143...","[1022, 1383, 1384]",[],"[1383, 1384, 1022, 1431, 1432, 1433, 1434, 143...","[42, 43, 44, 46, 41, 45, 1017, 360, 361, 362, ...",[],"[1022, 1383, 1384, 1431, 1432, 1433, 1434, 143...",58
176,17,17.1,17.1.1,"[1342, 1353, 42, 46, 50, 54, 58, 62, 466, 550,...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[1342, 1353, 42, 46, 50, 54, 58, 62, 580, 581,...","[466, 550, 567, 578, 809, 901, 905, 912, 1005,...","[580, 581, 582, 583, 584, 585, 586, 587, 1341,...","[1061, 1062, 1063, 1081, 1082, 1083, 1084, 108...",[],"[1342, 1353, 42, 46, 50, 54, 58, 62, 580, 581,...",349
177,17,17.2,17.2.1,"[23, 337, 338, 339, 340, 540, 541, 542, 543, 5...","[23, 578, 337, 338, 339, 340, 540, 541, 542, 5...","[23, 578, 337, 338, 339, 340, 540, 541, 542, 5...","[23, 578, 337, 338, 339, 340, 540, 541, 542, 5...","[23, 337, 338, 339, 340, 540, 541, 542, 543, 5...",[],"[23, 578, 337, 338, 339, 340, 540, 541, 542, 5...","[570, 579, 833, 590, 1016, 1017, 3, 4, 5, 6, 7...",[],"[23, 337, 338, 339, 340, 540, 541, 542, 543, 5...",287


In [42]:
final_df.to_csv(absolute_path_rank, sep=';', index=False)

In [43]:
all_goals = process_result(final_df, sdg, data)

In [44]:
all_goals.to_csv(absolute_path_rank_unroll, sep=';', index=False)

In [45]:
all_goals

Unnamed: 0,Attr_id,Chapter_id,Chapter_name,Table_id,Table_name,Description,Goal,Targets,Tentative Indicators,top_n_goal_id,top_n_target_id,top_n_indicator_id
0,24,1,General Information,1.6,Priority Non Priority and Total Ration Card Ho...,"Urban,Ration shops",End poverty in all its forms everywhere,"By 2030, eradicate extreme poverty for all peo...",Proportion of the population below the interna...,1,1.1,1.1.1
1,25,1,General Information,1.6,Priority Non Priority and Total Ration Card Ho...,"Rural,Ration shops",End poverty in all its forms everywhere,"By 2030, eradicate extreme poverty for all peo...",Proportion of the population below the interna...,1,1.1,1.1.1
2,27,1,General Information,1.6,Priority Non Priority and Total Ration Card Ho...,"Urban,Anthyodaya,Ration card holders",End poverty in all its forms everywhere,"By 2030, eradicate extreme poverty for all peo...",Proportion of the population below the interna...,1,1.1,1.1.1
3,28,1,General Information,1.6,Priority Non Priority and Total Ration Card Ho...,"Rural,Anthyodaya,Ration card holders",End poverty in all its forms everywhere,"By 2030, eradicate extreme poverty for all peo...",Proportion of the population below the interna...,1,1.1,1.1.1
4,30,1,General Information,1.6,Priority Non Priority and Total Ration Card Ho...,"Urban,Priority",End poverty in all its forms everywhere,"By 2030, eradicate extreme poverty for all peo...",Proportion of the population below the interna...,1,1.1,1.1.1
...,...,...,...,...,...,...,...,...,...,...,...,...
62494,1382,14,Rural Development and Panchayat Raj,14.1,Skill Development and Self Employment at the e...,"Women Groups,Ksdc Karnataka Skill Development ...","Data, monitoring and accountability","By 2030, build on existing initiatives to deve...",Inclusive Wealth Index.,17,17.2,17.2.2
62495,1379,14,Rural Development and Panchayat Raj,14.1,Skill Development and Self Employment at the e...,"Persons Skilled,Self Employment Programme,Empl...","Data, monitoring and accountability","By 2030, build on existing initiatives to deve...",Inclusive Wealth Index.,17,17.2,17.2.2
62496,1380,14,Rural Development and Panchayat Raj,14.1,Skill Development and Self Employment at the e...,"Groups Skilled,Self Employment Programme,Emplo...","Data, monitoring and accountability","By 2030, build on existing initiatives to deve...",Inclusive Wealth Index.,17,17.2,17.2.2
62497,1381,14,Rural Development and Panchayat Raj,14.1,Skill Development and Self Employment at the e...,Chief Minister Kaushalya Karnataka Yojane Cmkk...,"Data, monitoring and accountability","By 2030, build on existing initiatives to deve...",Inclusive Wealth Index.,17,17.2,17.2.2


In [46]:
# Group by 'col1' and count the distinct values in 'col2'
table_map = data_new.groupby('Chapter_id')['Table_id'].nunique().to_dict()
table_map

{'1': 10,
 '10': 15,
 '11': 6,
 '12': 6,
 '13': 6,
 '14': 5,
 '15': 9,
 '16': 3,
 '17': 3,
 '2': 21,
 '3': 2,
 '4': 19,
 '5': 4,
 '6': 4,
 '7': 2,
 '8': 4,
 '9': 4}

In [47]:
attr_map = data_new.groupby('Table_id')['Attr_id'].nunique().to_dict()
attr_map

{'1.1': 4,
 '1.10': 7,
 '1.2': 3,
 '1.3': 6,
 '1.4': 4,
 '1.5': 4,
 '1.6': 15,
 '1.7': 8,
 '1.8': 8,
 '1.9': 8,
 '10.1': 9,
 '10.10': 10,
 '10.11': 11,
 '10.12': 9,
 '10.13': 24,
 '10.14': 8,
 '10.15': 8,
 '10.2': 10,
 '10.3': 4,
 '10.4': 4,
 '10.5': 4,
 '10.6': 5,
 '10.7': 5,
 '10.8': 7,
 '10.9': 10,
 '11.1': 14,
 '11.2': 16,
 '11.3': 7,
 '11.4': 5,
 '11.5': 23,
 '11.6': 8,
 '12.1': 24,
 '12.2': 18,
 '12.3': 30,
 '12.4': 6,
 '12.5': 18,
 '12.6': 21,
 '13.1': 6,
 '13.2': 4,
 '13.3': 5,
 '13.4': 5,
 '13.5': 8,
 '13.6': 7,
 '14.1': 7,
 '14.2': 6,
 '14.3': 7,
 '14.4': 7,
 '14.5': 6,
 '15.1': 7,
 '15.2': 6,
 '15.3': 9,
 '15.4': 8,
 '15.5': 5,
 '15.6': 9,
 '15.7': 10,
 '15.8': 10,
 '15.9': 5,
 '16.1': 7,
 '16.2': 16,
 '16.3': 20,
 '17.1': 4,
 '17.2': 19,
 '17.3': 19,
 '2.1': 18,
 '2.11': 20,
 '2.12': 11,
 '2.13': 34,
 '2.14': 10,
 '2.15': 10,
 '2.16': 10,
 '2.17': 10,
 '2.18': 10,
 '2.19': 10,
 '2.2': 10,
 '2.20': 10,
 '2.21': 10,
 '2.22': 10,
 '2.3': 7,
 '2.4': 12,
 '2.5': 7,
 '2.6': 10,
 

In [48]:
all_goals

Unnamed: 0,Attr_id,Chapter_id,Chapter_name,Table_id,Table_name,Description,Goal,Targets,Tentative Indicators,top_n_goal_id,top_n_target_id,top_n_indicator_id
0,24,1,General Information,1.6,Priority Non Priority and Total Ration Card Ho...,"Urban,Ration shops",End poverty in all its forms everywhere,"By 2030, eradicate extreme poverty for all peo...",Proportion of the population below the interna...,1,1.1,1.1.1
1,25,1,General Information,1.6,Priority Non Priority and Total Ration Card Ho...,"Rural,Ration shops",End poverty in all its forms everywhere,"By 2030, eradicate extreme poverty for all peo...",Proportion of the population below the interna...,1,1.1,1.1.1
2,27,1,General Information,1.6,Priority Non Priority and Total Ration Card Ho...,"Urban,Anthyodaya,Ration card holders",End poverty in all its forms everywhere,"By 2030, eradicate extreme poverty for all peo...",Proportion of the population below the interna...,1,1.1,1.1.1
3,28,1,General Information,1.6,Priority Non Priority and Total Ration Card Ho...,"Rural,Anthyodaya,Ration card holders",End poverty in all its forms everywhere,"By 2030, eradicate extreme poverty for all peo...",Proportion of the population below the interna...,1,1.1,1.1.1
4,30,1,General Information,1.6,Priority Non Priority and Total Ration Card Ho...,"Urban,Priority",End poverty in all its forms everywhere,"By 2030, eradicate extreme poverty for all peo...",Proportion of the population below the interna...,1,1.1,1.1.1
...,...,...,...,...,...,...,...,...,...,...,...,...
62494,1382,14,Rural Development and Panchayat Raj,14.1,Skill Development and Self Employment at the e...,"Women Groups,Ksdc Karnataka Skill Development ...","Data, monitoring and accountability","By 2030, build on existing initiatives to deve...",Inclusive Wealth Index.,17,17.2,17.2.2
62495,1379,14,Rural Development and Panchayat Raj,14.1,Skill Development and Self Employment at the e...,"Persons Skilled,Self Employment Programme,Empl...","Data, monitoring and accountability","By 2030, build on existing initiatives to deve...",Inclusive Wealth Index.,17,17.2,17.2.2
62496,1380,14,Rural Development and Panchayat Raj,14.1,Skill Development and Self Employment at the e...,"Groups Skilled,Self Employment Programme,Emplo...","Data, monitoring and accountability","By 2030, build on existing initiatives to deve...",Inclusive Wealth Index.,17,17.2,17.2.2
62497,1381,14,Rural Development and Panchayat Raj,14.1,Skill Development and Self Employment at the e...,Chief Minister Kaushalya Karnataka Yojane Cmkk...,"Data, monitoring and accountability","By 2030, build on existing initiatives to deve...",Inclusive Wealth Index.,17,17.2,17.2.2


In [49]:
filter_df = all_goals.copy()

In [50]:
# Initialize an empty dictionary to store the counts
tab_result = {}
table_id_rm = {}
# Iterate over unique values in col1
for val in filter_df['top_n_indicator_id'].unique():
    # Filter the dataframe by the current value of col1
    filtered_df = filter_df[filter_df['top_n_indicator_id'] == val]
    
    # Count the occurrences of each value in col2 and store in a dictionary
    table_counts = filtered_df['Table_id'].value_counts().to_dict()
    
    table_id_rm[val] = {key:value for key, value in table_counts.items() if value < (attr_map[key]/2)}
    
    table_dict_new = {key:value for key, value in table_counts.items() if key not in table_id_rm[val]}
    
    if(len(table_dict_new) <= 4):
        # Sort the dictionary based on values in descending order
        sorted_dict = dict(sorted(table_counts.items(), key=lambda x: x[1], reverse=True))
        # Keep only the top 2 elements
        top_6_dict = dict(list(sorted_dict.items())[:6])
        tab_result[val] = top_6_dict
        
    else:
        # Add the col2 counts dictionary to the result dictionary
        tab_result[val] = table_dict_new

print(tab_result)

{'1.1.1': {'4.19': 197, '2.13': 34, '2.11': 20, '2.1': 18, '1.6': 15, '2.4': 12, '2.12': 11, '2.14': 10, '2.22': 10, '2.18': 10, '2.17': 10, '2.16': 10, '2.15': 10, '2.21': 10, '2.8': 10, '2.20': 10, '2.6': 10, '2.2': 10, '2.19': 10, '1.7': 8, '7.1': 8, '1.8': 8, '1.9': 8, '9.3': 7, '14.4': 7, '11.3': 7, '2.3': 7, '14.3': 7, '14.1': 7, '10.8': 7, '2.5': 7, '1.3': 6, '13.1': 6, '15.6': 6, '14.5': 6, '2.7': 6, '14.2': 6, '10.2': 6, '10.1': 6, '2.9': 6, '15.7': 5, '15.8': 5, '15.9': 5, '13.2': 3}, '1.2.1': {'4.19': 197, '2.13': 34, '2.11': 20, '2.1': 18, '17.2': 15, '1.6': 15, '2.4': 12, '2.12': 11, '2.19': 10, '2.18': 10, '2.20': 10, '2.17': 10, '2.16': 10, '2.21': 10, '2.15': 10, '2.14': 10, '2.22': 10, '2.8': 10, '2.6': 10, '2.2': 10, '1.9': 8, '1.8': 8, '1.7': 8, '10.8': 7, '2.5': 7, '2.3': 7, '14.3': 7, '2.9': 6, '2.7': 6, '13.1': 6, '14.1': 4, '13.2': 3}, '1.2.2': {'4.19': 197, '2.13': 34, '4.2': 20, '2.11': 20, '2.1': 18, '1.6': 15, '17.2': 15, '2.4': 12, '2.12': 11, '2.2': 10, '2.

In [51]:
len(tab_result)

179

In [52]:
table_id_rm

{'1.1.1': {'17.2': 9, '4.1': 5, '11.1': 4, '17.3': 2, '6.2': 1, '9.2': 1},
 '1.2.1': {'9.2': 2},
 '1.2.2': {'6.2': 13,
  '11.5': 7,
  '6.1': 7,
  '11.1': 4,
  '17.3': 4,
  '10.2': 3,
  '10.1': 3,
  '14.1': 3,
  '10.6': 1,
  '10.7': 1,
  '10.11': 1,
  '3.1': 1,
  '1.4': 1,
  '9.2': 1,
  '15.1': 1},
 '1.3.1': {'6.2': 13,
  '11.5': 7,
  '6.1': 7,
  '17.3': 4,
  '10.1': 3,
  '10.2': 3,
  '14.1': 2,
  '11.2': 2,
  '10.6': 1,
  '10.7': 1,
  '10.11': 1,
  '14.3': 1,
  '1.4': 1},
 '1.4.1': {'6.2': 4,
  '4.1': 4,
  '7.2': 3,
  '14.3': 2,
  '14.2': 2,
  '9.1': 1,
  '7.1': 1,
  '9.4': 1,
  '3.1': 1},
 '1.5.1': {'4.1': 8,
  '9.1': 7,
  '4.18': 5,
  '6.2': 4,
  '6.1': 3,
  '11.6': 3,
  '16.3': 2,
  '15.4': 2,
  '16.2': 2,
  '15.7': 2,
  '15.8': 2,
  '8.4': 2,
  '5.1': 1,
  '15.3': 1,
  '15.5': 1,
  '6.3': 1,
  '15.9': 1},
 '1.6.1': {'4.18': 8,
  '6.2': 2,
  '11.2': 2,
  '10.12': 1,
  '4.6': 1,
  '4.16': 1,
  '4.17': 1,
  '5.1': 1,
  '15.2': 1,
  '9.1': 1,
  '9.2': 1,
  '6.3': 1},
 '2.1.1': {'17.2':

In [53]:
sum_ind=0
# Iterate over the outer dictionary
for key, inner_dict in table_id_rm.items():
    # Iterate over the inner dictionary
    for k, value in inner_dict.items():
        if k not in tab_result[key]:
            # Add the value to the result
            sum_ind += value

print(sum_ind)

8070


In [54]:
# List to store indices to remove
indices_to_remove = []
# Iterate over the rows of the DataFrame
for index, row in filter_df.iterrows():
    if row['top_n_indicator_id'] in table_id_rm:
        inner_dict = table_id_rm[row['top_n_indicator_id']]
        if row['Table_id'] in inner_dict:
            if row['Table_id'] not in tab_result[row['top_n_indicator_id']]:
                # Add index to remove list
                indices_to_remove.append(index)
            
indices_to_remove

[43,
 389,
 390,
 391,
 392,
 393,
 394,
 511,
 512,
 513,
 514,
 515,
 516,
 593,
 594,
 595,
 596,
 597,
 605,
 606,
 607,
 608,
 1103,
 1129,
 1452,
 1453,
 1454,
 1455,
 1480,
 1493,
 1545,
 1546,
 1547,
 1548,
 1549,
 1550,
 1551,
 1552,
 1553,
 1554,
 1555,
 1556,
 1557,
 1558,
 1559,
 1560,
 1561,
 1562,
 1563,
 1564,
 1565,
 1566,
 1567,
 1568,
 1569,
 1570,
 1571,
 1572,
 1573,
 1574,
 1575,
 1576,
 1577,
 1578,
 1579,
 1580,
 1581,
 1582,
 1583,
 1584,
 1767,
 1768,
 1769,
 1770,
 1771,
 2190,
 2205,
 2206,
 2207,
 2208,
 2267,
 2268,
 2269,
 2270,
 2271,
 2272,
 2273,
 2274,
 2275,
 2276,
 2277,
 2278,
 2279,
 2280,
 2281,
 2282,
 2283,
 2284,
 2285,
 2286,
 2287,
 2288,
 2289,
 2290,
 2291,
 2292,
 2293,
 2294,
 2295,
 2296,
 2297,
 2298,
 2299,
 2300,
 2301,
 2302,
 2303,
 2304,
 2305,
 2306,
 2307,
 2727,
 2728,
 2729,
 2730,
 2731,
 2732,
 2733,
 2734,
 2735,
 2816,
 2840,
 2841,
 2842,
 2848,
 2957,
 2958,
 2959,
 2960,
 2961,
 3222,
 3245,
 3246,
 3247,
 3248,
 3253,
 

In [55]:
len(indices_to_remove)

8070

In [56]:
# Remove rows using indices from indices_to_remove list
filter_df.drop(indices_to_remove, inplace=True)

filter_df

Unnamed: 0,Attr_id,Chapter_id,Chapter_name,Table_id,Table_name,Description,Goal,Targets,Tentative Indicators,top_n_goal_id,top_n_target_id,top_n_indicator_id
0,24,1,General Information,1.6,Priority Non Priority and Total Ration Card Ho...,"Urban,Ration shops",End poverty in all its forms everywhere,"By 2030, eradicate extreme poverty for all peo...",Proportion of the population below the interna...,1,1.1,1.1.1
1,25,1,General Information,1.6,Priority Non Priority and Total Ration Card Ho...,"Rural,Ration shops",End poverty in all its forms everywhere,"By 2030, eradicate extreme poverty for all peo...",Proportion of the population below the interna...,1,1.1,1.1.1
2,27,1,General Information,1.6,Priority Non Priority and Total Ration Card Ho...,"Urban,Anthyodaya,Ration card holders",End poverty in all its forms everywhere,"By 2030, eradicate extreme poverty for all peo...",Proportion of the population below the interna...,1,1.1,1.1.1
3,28,1,General Information,1.6,Priority Non Priority and Total Ration Card Ho...,"Rural,Anthyodaya,Ration card holders",End poverty in all its forms everywhere,"By 2030, eradicate extreme poverty for all peo...",Proportion of the population below the interna...,1,1.1,1.1.1
4,30,1,General Information,1.6,Priority Non Priority and Total Ration Card Ho...,"Urban,Priority",End poverty in all its forms everywhere,"By 2030, eradicate extreme poverty for all peo...",Proportion of the population below the interna...,1,1.1,1.1.1
...,...,...,...,...,...,...,...,...,...,...,...,...
62487,1378,14,Rural Development and Panchayat Raj,14.1,Skill Development and Self Employment at the e...,Employment through skill training placement Es...,"Data, monitoring and accountability","By 2030, build on existing initiatives to deve...",Inclusive Wealth Index.,17,17.2,17.2.2
62494,1382,14,Rural Development and Panchayat Raj,14.1,Skill Development and Self Employment at the e...,"Women Groups,Ksdc Karnataka Skill Development ...","Data, monitoring and accountability","By 2030, build on existing initiatives to deve...",Inclusive Wealth Index.,17,17.2,17.2.2
62495,1379,14,Rural Development and Panchayat Raj,14.1,Skill Development and Self Employment at the e...,"Persons Skilled,Self Employment Programme,Empl...","Data, monitoring and accountability","By 2030, build on existing initiatives to deve...",Inclusive Wealth Index.,17,17.2,17.2.2
62496,1380,14,Rural Development and Panchayat Raj,14.1,Skill Development and Self Employment at the e...,"Groups Skilled,Self Employment Programme,Emplo...","Data, monitoring and accountability","By 2030, build on existing initiatives to deve...",Inclusive Wealth Index.,17,17.2,17.2.2


In [57]:
filter_df.to_csv(absolute_path_tab_filter, sep=';', index=False)

In [58]:
tab_count_att = filter_df['top_n_indicator_id'].value_counts().to_dict()

In [59]:
filter_df_copy = filter_df.copy()

In [60]:
def filter_dataframe(df, n, chap_list):
    result_df = df.copy() # Initialize an empty DataFrame to store the filtered rows
    
    # Loop over each unique 'Target_id'
    for target_id, group in df.groupby('top_n_target_id'):
        num_rows = len(group)  # Count the number of rows for this 'Target_id'
        
        # Check if the number of rows is greater than the threshold 'n'
        if num_rows > n:
            idx_list = []
            drop_list = []
            for chap in chap_list:
                # Find the indices corresponding to chapters "2" and "3" for this 'Target_id'
                idx_chapter = group[group['Chapter_id'] == chap].index
                idx_list.append(idx_chapter)
                
            for item in idx_list:
                if len(item)>1:
                    if len(item)>350:
                        total_rows_to_drop = item[len(item)//3:]
                    else:
                        total_rows_to_drop = item[len(item)//2:]
                    drop_list.append(total_rows_to_drop)
            
            for i in drop_list:
                if len(i)>0:
                    result_df = result_df.drop(i)
    result_df.reset_index(inplace = True, drop = True)
    return result_df

In [61]:
chap_ll = ['2', '4', '10', '12']
# Filter the DataFrame using the function
final_map = filter_dataframe(filter_df_copy, 350, chap_ll)

In [62]:
final_map.to_csv(absolute_path_final_map, sep=';', index=False)

In [63]:
chapter_df = filter_df.copy()

In [64]:
# Initialize an empty dictionary to store the counts
chap_result = {}
chap_id_rm = {}

# Iterate over unique values in col1
for val in chapter_df['top_n_indicator_id'].unique():
    # Filter the dataframe by the current value of col1
    filtered_df = chapter_df[chapter_df['top_n_indicator_id'] == val]
    chap_list = []
    for tab in filtered_df['Table_id'].unique():
        chap_filter = filtered_df[filtered_df['Table_id'] == tab]
        
        # Count the occurrences of each value in col2 and store in a dictionary
        chap_counts = chap_filter['Chapter_id'].unique().tolist()
        
        chap_list.extend(chap_counts)
    chap_dict = pd.Series(chap_list).value_counts().to_dict()
    chap_id_rm[val] = {key:value for key, value in chap_dict.items() if value < (table_map[key]/2)}
    chap_dict_new = {key:value for key, value in chap_dict.items() if key not in chap_id_rm[val]}
    if(len(chap_dict_new) <= 1):
        # Sort the dictionary based on values in descending order
        sorted_dict = dict(sorted(chap_dict.items(), key=lambda x: x[1], reverse=True))
        # Keep only the top 2 elements
        top_4_dict = dict(list(sorted_dict.items())[:4])
        chap_result[val] = top_4_dict
 
    else:
        # Add the col2 counts dictionary to the result dictionary
        chap_result[val] = chap_dict_new

print(chap_result)

{'1.1.1': {'2': 21, '1': 5, '14': 5, '7': 1}, '1.2.1': {'2': 21, '1': 4, '14': 2, '13': 2}, '1.2.2': {'2': 21, '13': 6, '1': 6}, '1.3.1': {'2': 21, '13': 6, '12': 6}, '1.4.1': {'2': 21, '1': 10, '11': 6, '17': 3}, '1.5.1': {'2': 21, '11': 3}, '1.6.1': {'13': 6, '14': 5, '16': 3}, '2.1.1': {'2': 21, '1': 4, '11': 1, '13': 1}, '2.1.2': {'2': 21, '3': 2}, '2.2.1': {'11': 6, '13': 6, '14': 5}, '2.2.2': {'13': 6, '14': 5}, '2.3.1': {'4': 19, '8': 4}, '2.3.2': {'4': 6, '14': 2}, '2.4.1': {'2': 21, '13': 6, '14': 5, '8': 4}, '2.4.2': {'2': 21, '8': 4}, '2.4.3': {'2': 4, '8': 4, '15': 3, '4': 1}, '2.5.1': {'4': 12, '1': 5, '9': 2}, '2.5.2': {'4': 11, '1': 10, '5': 4, '17': 3, '3': 1, '7': 1}, '2.6.1': {'4': 19, '12': 3}, '2.7.1': {'8': 4, '6': 4, '16': 2}, '2.8.1': {'11': 6, '12': 6, '8': 4, '17': 2}, '2.8.2': {'1': 1, '5': 1, '16': 1, '9': 1}, '3.1.1': {'13': 6, '11': 3}, '3.1.2': {'10': 14, '11': 6, '17': 2}, '3.2.1': {'1': 4, '11': 2, '17': 1, '4': 1}, '3.2.2': {'11': 4, '1': 4, '17': 1, '4

In [65]:
chap_id_rm

{'1.1.1': {'15': 4, '10': 3, '13': 2, '4': 1, '9': 1, '11': 1},
 '1.2.1': {'1': 4, '14': 2, '13': 2, '17': 1, '4': 1, '10': 1},
 '1.2.2': {'4': 2, '11': 2, '15': 2, '17': 1, '10': 1, '14': 1, '6': 1},
 '1.3.1': {'1': 4, '17': 1, '10': 1, '11': 1, '4': 1},
 '1.4.1': {'15': 4, '10': 2, '9': 1, '6': 1, '4': 1, '5': 1},
 '1.5.1': {'4': 6, '17': 1, '5': 1, '1': 1, '9': 1},
 '1.6.1': {'1': 4, '17': 1, '4': 1, '8': 1},
 '2.1.1': {'1': 4, '11': 1, '13': 1, '4': 1, '16': 1, '17': 1},
 '2.1.2': {'1': 4, '6': 1, '4': 1, '10': 1, '9': 1},
 '2.2.1': {'2': 3, '10': 2, '12': 1, '4': 1, '1': 1, '16': 1, '17': 1},
 '2.2.2': {'2': 2,
  '10': 2,
  '11': 1,
  '12': 1,
  '4': 1,
  '1': 1,
  '16': 1,
  '17': 1},
 '2.3.1': {'2': 3, '5': 1, '6': 1, '16': 1, '12': 1},
 '2.3.2': {'4': 6, '14': 2},
 '2.4.1': {'4': 6},
 '2.4.2': {'15': 4, '10': 4, '14': 2, '1': 2, '4': 1, '16': 1, '5': 1},
 '2.4.3': {'2': 4, '15': 3, '4': 1},
 '2.5.1': {'2': 2, '10': 1, '11': 1, '5': 1, '13': 1, '8': 1, '14': 1},
 '2.5.2': {'10':

In [66]:
missing = []

# Iterate through the data dictionary
for key, value in chap_result.items():
    # Check if the sub-dictionary is empty
    if not value:
        # Add the key to the missing list
        missing.append(key)
missing

[]

In [67]:
# List to store indices to remove
chap_indices_to_remove = []
# Iterate over the rows of the DataFrame
for index, row in chapter_df.iterrows():
    if row['top_n_indicator_id'] in table_id_rm:
        inner_dict = chap_id_rm[row['top_n_indicator_id']]
        if row['Chapter_id'] in inner_dict:
            if row['top_n_indicator_id'] not in chap_result or row['Chapter_id'] not in chap_result[row['top_n_indicator_id']]:
                # Add index to remove list
                chap_indices_to_remove.append(index)

chap_indices_to_remove

[174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,
 185,
 186,
 187,
 188,
 189,
 190,
 191,
 192,
 193,
 194,
 195,
 196,
 197,
 198,
 199,
 200,
 201,
 202,
 203,
 204,
 205,
 206,
 207,
 208,
 209,
 210,
 211,
 212,
 213,
 214,
 215,
 216,
 217,
 218,
 219,
 220,
 221,
 222,
 223,
 224,
 225,
 226,
 227,
 228,
 229,
 230,
 231,
 232,
 233,
 234,
 235,
 236,
 237,
 238,
 239,
 240,
 241,
 242,
 243,
 244,
 245,
 246,
 247,
 248,
 249,
 250,
 251,
 252,
 253,
 254,
 255,
 256,
 257,
 258,
 259,
 260,
 261,
 262,
 263,
 264,
 265,
 266,
 267,
 268,
 269,
 270,
 271,
 272,
 273,
 274,
 275,
 276,
 277,
 278,
 279,
 280,
 281,
 282,
 283,
 284,
 285,
 286,
 287,
 288,
 289,
 290,
 291,
 292,
 293,
 294,
 295,
 296,
 297,
 298,
 299,
 300,
 301,
 302,
 303,
 304,
 305,
 306,
 307,
 308,
 309,
 310,
 311,
 312,
 313,
 314,
 315,
 316,
 317,
 318,
 319,
 320,
 321,
 322,
 323,
 324,
 325,
 326,
 327,
 328,
 329,
 330,
 331,
 332,
 333,
 334,
 335,
 336,
 337,
 338,
 339,
 340

In [68]:
len(chap_indices_to_remove)

16109

In [69]:
# Remove rows using indices from indices_to_remove list
chapter_df.drop(chap_indices_to_remove, inplace=True)

chapter_df

Unnamed: 0,Attr_id,Chapter_id,Chapter_name,Table_id,Table_name,Description,Goal,Targets,Tentative Indicators,top_n_goal_id,top_n_target_id,top_n_indicator_id
0,24,1,General Information,1.6,Priority Non Priority and Total Ration Card Ho...,"Urban,Ration shops",End poverty in all its forms everywhere,"By 2030, eradicate extreme poverty for all peo...",Proportion of the population below the interna...,1,1.1,1.1.1
1,25,1,General Information,1.6,Priority Non Priority and Total Ration Card Ho...,"Rural,Ration shops",End poverty in all its forms everywhere,"By 2030, eradicate extreme poverty for all peo...",Proportion of the population below the interna...,1,1.1,1.1.1
2,27,1,General Information,1.6,Priority Non Priority and Total Ration Card Ho...,"Urban,Anthyodaya,Ration card holders",End poverty in all its forms everywhere,"By 2030, eradicate extreme poverty for all peo...",Proportion of the population below the interna...,1,1.1,1.1.1
3,28,1,General Information,1.6,Priority Non Priority and Total Ration Card Ho...,"Rural,Anthyodaya,Ration card holders",End poverty in all its forms everywhere,"By 2030, eradicate extreme poverty for all peo...",Proportion of the population below the interna...,1,1.1,1.1.1
4,30,1,General Information,1.6,Priority Non Priority and Total Ration Card Ho...,"Urban,Priority",End poverty in all its forms everywhere,"By 2030, eradicate extreme poverty for all peo...",Proportion of the population below the interna...,1,1.1,1.1.1
...,...,...,...,...,...,...,...,...,...,...,...,...
62487,1378,14,Rural Development and Panchayat Raj,14.1,Skill Development and Self Employment at the e...,Employment through skill training placement Es...,"Data, monitoring and accountability","By 2030, build on existing initiatives to deve...",Inclusive Wealth Index.,17,17.2,17.2.2
62494,1382,14,Rural Development and Panchayat Raj,14.1,Skill Development and Self Employment at the e...,"Women Groups,Ksdc Karnataka Skill Development ...","Data, monitoring and accountability","By 2030, build on existing initiatives to deve...",Inclusive Wealth Index.,17,17.2,17.2.2
62495,1379,14,Rural Development and Panchayat Raj,14.1,Skill Development and Self Employment at the e...,"Persons Skilled,Self Employment Programme,Empl...","Data, monitoring and accountability","By 2030, build on existing initiatives to deve...",Inclusive Wealth Index.,17,17.2,17.2.2
62496,1380,14,Rural Development and Panchayat Raj,14.1,Skill Development and Self Employment at the e...,"Groups Skilled,Self Employment Programme,Emplo...","Data, monitoring and accountability","By 2030, build on existing initiatives to deve...",Inclusive Wealth Index.,17,17.2,17.2.2


In [70]:
chapter_df['top_n_indicator_id'].nunique()

179

In [71]:
chap_count_att = chapter_df['top_n_indicator_id'].value_counts().to_dict()

In [72]:
chap_count_att

{'6.7.1': 840,
 '14.4.1': 734,
 '11.6.1': 719,
 '14.6.1': 611,
 '14.11.1': 584,
 '3.10.1': 560,
 '14.1.1': 559,
 '14.3.1': 559,
 '8.3.1': 527,
 '2.6.1': 492,
 '2.3.1': 484,
 '1.4.1': 423,
 '11.2.1': 412,
 '11.6.2': 395,
 '1.3.1': 393,
 '5.6.2': 381,
 '15.1.2': 374,
 '4.5.1': 367,
 '12.5.1': 361,
 '3.11.1': 355,
 '4.8.1': 354,
 '15.2.2': 354,
 '15.1.1': 350,
 '2.4.1': 337,
 '15.6.2': 336,
 '4.7.1': 335,
 '15.7.1': 331,
 '1.1.1': 327,
 '2.5.2': 327,
 '3.9.1': 324,
 '1.2.2': 321,
 '11.8.1': 317,
 '3.3.3': 314,
 '3.2.2': 314,
 '9.8.1': 314,
 '3.7.2': 314,
 '6.2.1': 314,
 '6.1.1': 314,
 '15.2.3': 313,
 '11.1.1': 310,
 '2.1.1': 310,
 '11.3.1': 309,
 '11.7.1': 308,
 '10.3.1': 304,
 '5.7.1': 304,
 '1.2.1': 300,
 '11.5.1': 294,
 '9.1.1': 294,
 '1.5.1': 294,
 '13.1.1': 294,
 '3.5.1': 292,
 '14.10.1': 292,
 '3.10.2': 288,
 '2.4.3': 288,
 '15.3.2': 286,
 '4.6.1': 285,
 '3.3.4': 285,
 '4.3.1': 285,
 '12.6.1': 282,
 '15.5.1': 282,
 '3.3.1': 280,
 '17.1.1': 279,
 '8.5.2': 276,
 '3.2.1': 276,
 '15.9.2

In [73]:
tab_count_att

{'6.7.1': 882,
 '11.6.1': 771,
 '14.4.1': 748,
 '1.4.1': 700,
 '9.8.1': 696,
 '3.7.2': 681,
 '14.6.1': 670,
 '1.3.1': 662,
 '4.5.1': 658,
 '11.7.1': 653,
 '2.4.1': 636,
 '11.5.1': 630,
 '1.5.1': 630,
 '13.1.1': 630,
 '3.10.1': 628,
 '2.3.1': 619,
 '2.6.1': 617,
 '14.11.1': 614,
 '2.4.2': 599,
 '12.1.1': 598,
 '1.2.2': 597,
 '1.1.1': 587,
 '15.1.3': 573,
 '15.3.2': 565,
 '11.3.1': 559,
 '14.3.1': 559,
 '14.1.1': 559,
 '2.1.2': 558,
 '8.3.1': 546,
 '2.8.1': 533,
 '2.1.1': 531,
 '5.7.1': 527,
 '6.6.1': 520,
 '1.2.1': 519,
 '11.6.2': 508,
 '2.7.1': 482,
 '11.2.1': 452,
 '15.2.3': 450,
 '5.6.2': 429,
 '14.9.1': 427,
 '3.3.3': 420,
 '3.8.1': 414,
 '4.8.1': 413,
 '15.1.1': 411,
 '4.3.1': 409,
 '15.1.2': 408,
 '2.5.2': 408,
 '15.2.2': 402,
 '10.3.1': 402,
 '15.7.1': 396,
 '12.5.1': 391,
 '4.7.1': 391,
 '9.3.2': 389,
 '3.11.1': 388,
 '6.2.1': 388,
 '8.5.2': 370,
 '15.6.2': 367,
 '1.6.1': 367,
 '3.9.1': 363,
 '6.1.1': 361,
 '11.8.1': 358,
 '16.2.1': 357,
 '11.1.1': 354,
 '12.6.1': 346,
 '3.1.2':

In [74]:
rank_count_att = all_goals['top_n_indicator_id'].value_counts().to_dict()

In [75]:
rank_filter_count_att = final_map['top_n_indicator_id'].value_counts().to_dict()

In [76]:
count_att_list = [(key, value) for key, value in chap_count_att.items()]

count_att_df = pd.DataFrame(count_att_list, columns=['Indicator_id', 'Chap_count'])
count_att_df

Unnamed: 0,Indicator_id,Chap_count
0,6.7.1,840
1,14.4.1,734
2,11.6.1,719
3,14.6.1,611
4,14.11.1,584
...,...,...
174,3.4.2,35
175,2.8.2,34
176,17.2.2,30
177,14.5.1,25


In [77]:
count_att_df['Tab_count'] = count_att_df['Indicator_id'].map(tab_count_att)

In [78]:
count_att_df['Filter_count'] = count_att_df['Indicator_id'].map(rank_filter_count_att)

In [79]:
count_att_df['Rank_count'] = count_att_df['Indicator_id'].map(rank_count_att)

In [80]:
count_att_df

Unnamed: 0,Indicator_id,Chap_count,Tab_count,Filter_count,Rank_count
0,6.7.1,840,882,475,928
1,14.4.1,734,748,373,756
2,11.6.1,719,771,449,823
3,14.6.1,611,670,355,697
4,14.11.1,584,614,282,640
...,...,...,...,...,...
174,3.4.2,35,105,105,123
175,2.8.2,34,61,59,89
176,17.2.2,30,30,30,40
177,14.5.1,25,31,31,56


In [81]:
chapter_df.to_csv(absolute_path_chap_filter, sep=';', index=False)

In [82]:
count_att_df.to_csv(absolute_path_count, sep=';', index=False)