In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import re
import os
import matplotlib.pyplot as plt
import nltk
from collections import OrderedDict

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer() 
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer


[nltk_data] Downloading package punkt to /home/riya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/riya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/riya/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  "class": algorithms.Blowfish,


In [2]:
def _absolute_from_cwd(relative_path):
    """Join *relative_path* onto the current working directory and normalise it."""
    return os.path.normpath(os.path.join(os.getcwd(), relative_path))


# Locations of the CSV files used by this notebook, relative to the
# directory the notebook is executed from.
relative_path_sdg = "../../sdg_data.csv"
relative_path_attr = "../Dictionary/Attributes_2015-16.csv"
relative_string_map = "./Data_Goal/String_map.csv"
relative_semantic = "./Data_Goal/Semantic_map.csv"
relative_sem_threshold = "./Data_Goal/Semantic_threshold_map.csv"
relative_rank = "./Data_Goal/Ranking.csv"
relative_rank_unroll = "./Data_Goal/Ranking_unroll.csv"
relative_tab_filter = "./Data_Goal/Ranking_tab_filter.csv"
relative_chap_filter = "./Data_Goal/Ranking_chap_filter.csv"
relative_final_map = "./Mapping_Goal_2015-16.csv"
relative_count = "./Data_Goal/Comparison_count.csv"

# The same locations as normalised absolute paths.
absolute_path_sdg = _absolute_from_cwd(relative_path_sdg)
absolute_path_attr = _absolute_from_cwd(relative_path_attr)
absolute_path_string = _absolute_from_cwd(relative_string_map)
absolute_path_semantic = _absolute_from_cwd(relative_semantic)
absolute_path_sem_threshold = _absolute_from_cwd(relative_sem_threshold)
absolute_path_rank = _absolute_from_cwd(relative_rank)
absolute_path_rank_unroll = _absolute_from_cwd(relative_rank_unroll)
absolute_path_tab_filter = _absolute_from_cwd(relative_tab_filter)
absolute_path_chap_filter = _absolute_from_cwd(relative_chap_filter)
absolute_path_final_map = _absolute_from_cwd(relative_final_map)
absolute_path_count = _absolute_from_cwd(relative_count)

In [3]:
def process_result(result_sdg, sdg_df, chapter_df):
    """Unroll per-goal attribute rankings into one row per (attribute, goal) pair.

    Parameters
    ----------
    result_sdg : pandas.DataFrame
        Must provide 'Goal No.' and 'Rank'. Each 'Rank' value is iterated and
        every element becomes one output row for that goal.
    sdg_df : pandas.DataFrame
        Must provide 'Goal No.' and 'Goal'; used to look up the goal text.
    chapter_df : pandas.DataFrame
        Must provide 'Attr_id', 'Chapter_id', 'Chapter_name', 'Table_id',
        'Table_name' and 'Description'; used to look up attribute metadata.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: 'Attr_id', 'Chapter_id', 'Chapter_name',
        'Table_id', 'Table_name', 'Description', 'Goal', 'top_n_goal_id'.
        Ids with no match in the lookup frames get a missing value. When
        there is nothing to unroll, an empty frame with these columns is
        returned.
    """
    # One record per (attribute, goal) pair found in the 'Rank' lists.
    pairs = [
        {'Attr_id': attr, 'top_n_goal_id': row['Goal No.']}
        for _, row in result_sdg.iterrows()
        for attr in row['Rank']
    ]
    # Naming the columns explicitly keeps them present when `pairs` is empty;
    # without this the lookups below fail with KeyError on an empty input.
    all_goals = pd.DataFrame(pairs, columns=['Attr_id', 'top_n_goal_id'])

    def lookup(frame, key_col, value_col):
        """Return a key -> value getter built from two columns of *frame*."""
        return dict(zip(frame[key_col], frame[value_col])).get

    all_goals['Goal'] = all_goals['top_n_goal_id'].map(lookup(sdg_df, 'Goal No.', 'Goal'))

    # Chapter/table ids are resolved from the attribute id; the names are then
    # resolved from those ids, so the id columns must be filled in first.
    all_goals['Chapter_id'] = all_goals['Attr_id'].map(lookup(chapter_df, 'Attr_id', 'Chapter_id'))
    all_goals['Chapter_name'] = all_goals['Chapter_id'].map(lookup(chapter_df, 'Chapter_id', 'Chapter_name'))
    all_goals['Table_id'] = all_goals['Attr_id'].map(lookup(chapter_df, 'Attr_id', 'Table_id'))
    all_goals['Table_name'] = all_goals['Table_id'].map(lookup(chapter_df, 'Table_id', 'Table_name'))
    all_goals['Description'] = all_goals['Attr_id'].map(lookup(chapter_df, 'Attr_id', 'Description'))

    column_order = ['Attr_id', 'Chapter_id', 'Chapter_name', 'Table_id',
                    'Table_name', 'Description', 'Goal', 'top_n_goal_id']
    return all_goals.loc[:, column_order]


In [4]:
# Merge the B and C attribute lists, keeping only the first occurrence of each item
def combine_lists_BC(row):
    """Return row['top_n_attr_B'] followed by row['top_n_attr_C'], without repeats.

    Items keep the position of their first appearance in the concatenation.
    """
    merged = row['top_n_attr_B'] + row['top_n_attr_C']

    already_seen = set()
    unique_items = []
    for item in merged:
        if item not in already_seen:
            already_seen.add(item)
            unique_items.append(item)
    return unique_items

In [5]:
# Function to find intersection while maintaining order
def find_intersection_ABC(row):
    """Return the items of row['top_n_attr_A'] that also occur in row['B_plus_C'].

    The order (and any repeats) of 'top_n_attr_A' is preserved.
    """
    # Hash-based membership, as find_difference_B / find_difference_C already
    # use, instead of rescanning the B_plus_C list for every element of A.
    in_b_or_c = set(row['B_plus_C'])
    return [x for x in row['top_n_attr_A'] if x in in_b_or_c]

In [6]:
def find_difference_A(row):
    """Return the items of row['top_n_attr_A'] that are not in row['A_and_BC'].

    The order (and any repeats) of 'top_n_attr_A' is preserved.
    """
    top_n_attr = row['top_n_attr_A']
    # Only the A_and_BC intersection is excluded here. A set gives constant-time
    # membership, as find_difference_B / find_difference_C already use.
    excluded = set(row['A_and_BC'])

    return [x for x in top_n_attr if x not in excluded]

In [7]:
# Function to find intersection while maintaining order
def find_intersection_BC(row):
    """Return the items of row['top_n_attr_B'] that also occur in row['top_n_attr_C'].

    The order (and any repeats) of 'top_n_attr_B' is preserved.
    """
    # Hash-based membership, as find_difference_B / find_difference_C already
    # use, instead of rescanning the C list for every element of B.
    in_c = set(row['top_n_attr_C'])
    return [x for x in row['top_n_attr_B'] if x in in_c]

In [8]:
def find_difference_B(row):
    """Return the items of row['top_n_attr_B'] found in none of the earlier groups.

    The earlier groups are row['A'], row['B_and_C'] and row['A_and_BC'].
    The order of 'top_n_attr_B' is preserved.
    """
    candidates = row['top_n_attr_B']
    already_placed = set(row['A'] + row['B_and_C'] + row['A_and_BC'])

    remaining = []
    for attr in candidates:
        if attr not in already_placed:
            remaining.append(attr)
    return remaining

In [9]:
def find_difference_C(row):
    """Return the items of row['top_n_attr_C'] found in none of the earlier groups.

    The earlier groups are row['A'], row['B_and_C'], row['A_and_BC'] and
    row['B_minus_all']. The order of 'top_n_attr_C' is preserved.
    """
    candidates = row['top_n_attr_C']
    already_placed = set(row['A'] + row['B_and_C'] + row['A_and_BC'] + row['B_minus_all'])

    return list(filter(lambda attr: attr not in already_placed, candidates))

In [10]:
def final_rank(row):
    """Concatenate the five groups of *row* in priority order and drop repeats.

    The groups are taken in the order 'A_and_BC', 'A', 'B_and_C',
    'B_minus_all', 'C_minus_all'; each item keeps the position of its first
    appearance.
    """
    ordered = (
        row['A_and_BC']
        + row['A']
        + row['B_and_C']
        + row['B_minus_all']
        + row['C_minus_all']
    )
    # dict keys keep insertion order, so this removes later repeats only.
    return list(dict.fromkeys(ordered))

In [11]:
# Load the SDG table (comma-separated); the id columns are read as strings.
sdg = pd.read_csv(
    absolute_path_sdg,
    sep=",",
    dtype={
        'Goal No.': str,
        'Target_id': str,
        'Indicator_id': str,
    },
)

In [12]:
# Display the SDG table as loaded.
sdg

Unnamed: 0,Goal No.,Goal,Nodal Department,Targets,Other Related Major Departments,Tentative Indicators,Target_id,Indicator_id
0,1,End poverty in all its forms everywhere,Rural Development,"By 2030, eradicate extreme poverty for all peo...","Urban Development, Agriculture, Horticulture, ...",Proportion of the population below the interna...,1.1,1.1.1
1,1,End poverty in all its forms everywhere,Rural Development,"By 2030, reduce at least by half the proportio...","Urban Development, Agriculture, Horticulture, ...",Proportion of the population living below the ...,1.2,1.2.1
2,1,End poverty in all its forms everywhere,Rural Development,"By 2030, reduce at least by half the proportio...","Urban Development, Agriculture, Horticulture, ...","Proportion of men, women and children of all a...",1.2,1.2.2
3,1,End poverty in all its forms everywhere,Rural Development,Implement nationally appropriate social protec...,"Urban Development, Agriculture, Horticulture, ...",Percentage of the population covered by social...,1.3,1.3.1
4,1,End poverty in all its forms everywhere,Rural Development,"By 2030, ensure that all men and women, in par...","Urban Development, Agriculture, Horticulture, ...",Proportion of the population living in househo...,1.4,1.4.1
...,...,...,...,...,...,...,...,...
174,16,Strengthen the means of implementation and rev...,Finance,"Promote the development, transfer, disseminati...",Environment and Scientific Technology.,Total amount of approved funding to promote th...,16.4,16.4.1
175,16,Strengthen the means of implementation and rev...,Finance,Fully operationalize the technology bank and s...,"Environment and Scientific Technology, Inform...",Proportion of individuals using the Internet.,16.5,16.5.1
176,17,"Data, monitoring and accountability",Finance,"By 2020, enhance capacity-building support to ...","Planning, Finance, Economic and Statistics.",Proportion of sustainable development indicato...,17.1,17.1.1
177,17,"Data, monitoring and accountability",Finance,"By 2030, build on existing initiatives to deve...","Planning, Finance, Economic and Statistics.",Dollar value of all resources made available t...,17.2,17.2.1


In [13]:
# Drop the department, target and indicator columns. The result is rebound
# to `sdg` instead of mutating the frame in place.
sdg = sdg.drop(
    columns=[
        'Nodal Department',
        'Other Related Major Departments',
        'Tentative Indicators',
        'Indicator_id',
        'Targets',
        'Target_id',
    ]
)

In [14]:
# Remove duplicate rows (rows are compared on all columns; the first occurrence is kept)
sdg = sdg.drop_duplicates()

In [15]:
# Renumber the rows from 0 after de-duplication; drop=True discards the old
# index instead of keeping it as a column. Rebinding avoids in-place mutation.
sdg = sdg.reset_index(drop=True)

In [16]:
# Load the attribute dictionary (semicolon-separated); the id columns are read as strings.
data = pd.read_csv(
    absolute_path_attr,
    sep=";",
    dtype={
        'Attr_id': str,
        'Chapter_id': str,
        'Table_id': str,
    },
)

In [17]:
# Take an independent copy of the attribute table and display it.
data_new = data.copy()
data_new

Unnamed: 0,Attr_id,Chapter_id,Chapter_name,Table_id,Table_name,Description
0,3.0,1,General Information,1.1,Nada Offices Village Accountant Circles Hoblie...,Nada Offices
1,4.0,1,General Information,1.1,Nada Offices Village Accountant Circles Hoblie...,Circles
2,5.0,1,General Information,1.1,Nada Offices Village Accountant Circles Hoblie...,Hoblies
3,6.0,1,General Information,1.1,Nada Offices Village Accountant Circles Hoblie...,Grama Panchayaths
4,7.0,1,General Information,1.1,Nada Offices Village Accountant Circles Hoblie...,No.of Taluks
...,...,...,...,...,...,...
1162,1179.0,18,Additional information,18.3,Agricultural Land Holdings And Area per 2010-1...,"Total,Number,Total Agrl. Land Holder Total"
1163,1180.0,18,Additional information,18.3,Agricultural Land Holdings And Area per 2010-1...,"Male,Area,Total Agrl. Land Holder Total"
1164,1181.0,18,Additional information,18.3,Agricultural Land Holdings And Area per 2010-1...,"Female,Area,Total Agrl. Land Holder Total"
1165,1182.0,18,Additional information,18.3,Agricultural Land Holdings And Area per 2010-1...,"Institutions,Area,Total Agrl. Land Holder Total"


In [18]:
# Load the file at absolute_path_string (String_map.csv) and display it.
# The listed converter columns hold list literals as text; each cell is
# parsed with pd.eval.
# NOTE(review): pd.eval evaluates the cell text as an expression; consider
# ast.literal_eval if this CSV could come from an untrusted source.
# NOTE(review): 'Target_id' / 'Indicator_id' are not among the columns shown
# for this file, so those dtype entries presumably have no effect - confirm.
sdg_string = pd.read_csv(
    absolute_path_string,
    sep=";",
    dtype={'Goal No.': str, 'Target_id': str, 'Indicator_id': str},
    converters={
        'top_n_attr': pd.eval,
        'top_n_table_id': pd.eval,
        'top_n_chapter_id': pd.eval,
        'top_n_similarities': pd.eval,
    },
)
sdg_string

Unnamed: 0,Goal No.,Goal,new_goal,new_goal_str,top_n_count,top_n_attr,top_n_table_id,top_n_table_name,top_n_chapter_id,top_n_chapter_name,top_n_similarities,top_n_description
0,1,End poverty in all its forms everywhere,"['end', 'poverty', 'form', 'everywhere']",end poverty in all its forms everywhere,18,"[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35....","[1.6, 1.6, 1.6, 1.6, 1.6, 1.6, 1.7, 1.7, 1.7, ...",['Ration Shops And below poverty line Ration C...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","['General Information', 'General Information',...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","['Urban,Ration shops, Ration Shops And below p..."
1,2,"End hunger, achieve food security and improved...","['end', 'hunger', 'achieve', 'food', 'security...",end hunger achieve food security and improved ...,210,"[207.0, 240.0, 294.0, 156.0, 157.0, 158.0, 159...","[4.3, 4.6, 4.12, 4.1, 4.1, 4.1, 4.1, 4.1, 4.1,...","['Area under principal crops Year 2014-15', 'P...","[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ...","['Agriculture, Horticulture and Sericulture', ...","[2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","['Total Food Grains,pulses Hect,Area under pri..."
2,3,Ensure healthy lives and promote well-being fo...,"['ensure', 'healthy', 'life', 'promote', 'well...",ensure healthy lives and promote well being fo...,26,"[120.0, 121.0, 122.0, 123.0, 124.0, 125.0, 126...","[2.11, 2.11, 2.11, 2.11, 2.11, 2.11, 2.11, 2.1...",['Districtwise Population By Age Groups And Se...,"[2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, ...","['AREA AND POPULATION', 'AREA AND POPULATION',...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","['Male,0-14, Districtwise Population By Age Gr..."
3,4,Ensure inclusive and equitable quality educati...,"['ensure', 'inclusive', 'equitable', 'quality'...",ensure inclusive and equitable quality educati...,114,"[534.0, 535.0, 536.0, 537.0, 538.0, 539.0, 540...","[10.1, 10.1, 10.1, 10.1, 10.1, 10.1, 10.1, 10....","['Literates per 2011 Census', 'Literates per 2...","[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1...","['Education', 'Education', 'Education', 'Educa...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","['Male,Rural,Literates, Literates per 2011 Cen..."
4,5,Achieve gender equality and empower all women ...,"['achieve', 'gender', 'equality', 'empower', '...",achieve gender equality and empower all women ...,82,"[863.0, 555.0, 559.0, 563.0, 576.0, 579.0, 583...","[13.6, 10.3, 10.4, 10.5, 10.8, 10.8, 10.9, 10....",['No.of Beneficiaries under different schemes ...,"[13, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1...","['Women Child Development', 'Education', 'Educ...","[2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",['No.of benefitted for payment of Girl childre...
5,6,Ensure availability and sustainable management...,"['ensure', 'availability', 'sustainable', 'man...",ensure availability and sustainable management...,25,"[25.0, 524.0, 588.0, 598.0, 883.0, 884.0, 885....","[1.5, 9.3, 10.9, 10.10, 14.4, 14.4, 14.4, 14.4...","['No.of Fire Stations on 31-03-2016 In Nos', '...","[1, 9, 10, 10, 14, 14, 14, 14, 14, 14, 14, 14,...","['General Information', 'Transport and Communi...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","['Working Fire Brigade Water Tankers, No.of Fi..."
6,7,"Ensure access to affordable, reliable, sustain...","['ensure', 'access', 'affordable', 'reliable',...",ensure access to affordable reliable sustainab...,28,"[523.0, 524.0, 525.0, 526.0, 527.0, 528.0, 905...","[9.3, 9.3, 9.3, 9.3, 9.3, 9.3, 15.2, 16.1, 16....",['Rural Roads Length and Number of Habitations...,"[9, 9, 9, 9, 9, 9, 15, 16, 16, 16, 16, 16, 16,...","['Transport and Communication', 'Transport and...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","['Pakka,Panchayat Roads,Rural Roads Length on ..."
7,8,"Promote sustained, inclusive and sustainable e...","['promote', 'sustained', 'inclusive', 'sustain...",promote sustained inclusive and sustainable ec...,9,"[389.0, 416.0, 418.0, 868.0, 869.0, 879.0, 880...","[5.4, 6.3, 6.3, 14.1, 14.1, 14.3, 14.3, 14.3, ...","['Fisheries Year 2015-16', 'Small Scale Indust...","[5, 6, 6, 14, 14, 14, 14, 14, 14]","['Animal Husbandry', 'Industries', 'Industries...","[1, 1, 1, 1, 1, 1, 1, 1, 1]","['Full Time,Families involved in fisheries, Fi..."
8,9,"Build resilient infrastructure, promote inclus...","['build', 'resilient', 'infrastructure', 'prom...",build resilient infrastructure promote inclusi...,20,"[581.0, 582.0, 583.0, 584.0, 585.0, 586.0, 587...","[10.9, 10.9, 10.9, 10.9, 10.9, 10.9, 10.9, 10....",['Infrastructure Facilities in Primary schools...,"[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1...","['Education', 'Education', 'Education', 'Educa...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","['No.of Primary Schools,Infrastructure Facilit..."
9,10,Reduce inequality within the State,"['reduce', 'inequality', 'state']",reduce inequality within the state,1,[520.0],[9.2],['P.w.d. Road Length on 31-03-2016'],[9],['Transport and Communication'],[1],"['State Highway,P.w.d. Road Length in Km, P.w...."


In [19]:
# Load the file at absolute_path_semantic (Semantic_map.csv) and display it.
# The listed converter columns hold list literals as text; each cell is
# parsed with pd.eval.
# NOTE(review): pd.eval evaluates the cell text as an expression; consider
# ast.literal_eval if this CSV could come from an untrusted source.
# NOTE(review): 'top_n_similarities', 'Target_id' and 'Indicator_id' are not
# among the columns shown for this file, so those entries presumably have no
# effect - confirm.
sdg_sim = pd.read_csv(
    absolute_path_semantic,
    sep=";",
    dtype={'Goal No.': str, 'Target_id': str, 'Indicator_id': str},
    converters={
        'top_n_attr': pd.eval,
        'top_n_table_id': pd.eval,
        'top_n_chapter_id': pd.eval,
        'top_n_similarities': pd.eval,
    },
)
sdg_sim

Unnamed: 0,Goal No.,Goal,new_goal,new_goal_str,keywords,vec,top_n_index,min_values,top_n_count,top_n_attr,top_n_chapter_id,top_n_table_id,top_n_description,top_n_chapter_name,top_n_table_name
0,1,End poverty in all its forms everywhere,"['end', 'poverty', 'form', 'everywhere']",end poverty in all its forms everywhere,"[['end', 'poverty'], ['end'], ['forms', 'every...",[[ 0.004565 0.380225 0.28172 ... -0.0...,"[25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 381, ...","[0.8735124280076773, 0.5514384505494149, 0.512...",37,"[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35....","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 3, 3, 5, ...","[1.6, 1.6, 1.6, 1.6, 1.6, 1.6, 1.7, 1.7, 1.7, ...","['Urban,Ration shops, Ration Shops And below p...","['General Information', 'General Information',...",['Ration Shops And below poverty line Ration C...
1,2,"End hunger, achieve food security and improved...","['end', 'hunger', 'achieve', 'food', 'security...",end hunger achieve food security and improved ...,"[['sustainable', 'agriculture'], ['security', ...",[[ 0.079023 0.0409187 -0.25506 ... -0.3...,"[199, 232, 286, 418, 419, 148, 150, 151, 152, ...","[0.8411997113893511, 0.5815403183778527, 0.566...",58,"[207.0, 240.0, 294.0, 426.0, 427.0, 156.0, 158...","[4, 4, 4, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ...","[4.3, 4.6, 4.12, 6.4, 6.4, 4.1, 4.1, 4.1, 4.1,...","['Total Food Grains,pulses Hect,Area under pri...","['Agriculture, Horticulture and Sericulture', ...","['Area under principal crops Year 2014-15', 'P..."
2,3,Ensure healthy lives and promote well-being fo...,"['ensure', 'healthy', 'life', 'promote', 'well...",ensure healthy lives and promote well being fo...,"[['well'], ['well', 'ages'], ['ensure'], ['pro...",[[-0.13508999 0.35907 0.1453 ... -0.4...,"[169, 170, 171, 172, 173, 174, 882, 883, 903, ...","[0.7045711183911056, 0.6798916546022344, 0.527...",46,"[177.0, 178.0, 179.0, 180.0, 181.0, 182.0, 889...","[4, 4, 4, 4, 4, 4, 14, 14, 15, 9, 10, 11, 11, ...","[4.2, 4.2, 4.2, 4.2, 4.2, 4.2, 14.5, 14.5, 15....","['Nos,Wells,2014-15 Net area irrigated Hectare...","['Agriculture, Horticulture and Sericulture', ...",['Gross and Net area Irrigated under Different...
3,4,Ensure inclusive and equitable quality educati...,"['ensure', 'inclusive', 'equitable', 'quality'...",ensure inclusive and equitable quality educati...,"[['education'], ['ensure', 'inclusive'], ['pro...",[[-0.15599 -0.48863 -0.095793 ... -0.7...,"[526, 527, 529, 530, 534, 535, 536, 538, 539, ...","[1.0000000000000002, 0.41478819432196234, 0.50...",54,"[534.0, 535.0, 537.0, 538.0, 542.0, 543.0, 544...","[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 1...","[10.1, 10.1, 10.1, 10.1, 10.1, 10.2, 10.2, 10....","['Male,Rural,Literates, Literates per 2011 Cen...","['Education', 'Education', 'Education', 'Educa...","['Literates per 2011 Census', 'Literates per 2..."
4,5,Achieve gender equality and empower all women ...,"['achieve', 'gender', 'equality', 'empower', '...",achieve gender equality and empower all women ...,"[['women'], ['equality', 'empower'], ['equalit...",[[-0.43895 0.47641999 0.18844 ... 0.2...,"[594, 869, 547, 551, 555, 568, 571, 575, 585, ...","[0.6183490921829581, 0.411797604971018, 0.4003...",55,"[602.0, 876.0, 555.0, 559.0, 563.0, 576.0, 579...","[10, 14, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1...","[10.11, 14.3, 10.3, 10.4, 10.5, 10.8, 10.8, 10...","['Men,No. of Lecturers, No.of P U Colleges Stu...","['Education', 'Rural Development & Panchayat R...",['No.of P U Colleges Students and Lecturers Ye...
5,6,Ensure availability and sustainable management...,"['ensure', 'availability', 'sustainable', 'man...",ensure availability and sustainable management...,"[['ensure', 'availability'], ['ensure'], ['wat...",[[ 0.21560501 -0.03204 0.1458025 ... -0.0...,"[22, 516, 580, 590, 876, 877, 878, 879, 880, 8...","[0.5454543688980285, 0.5270143992244825, 1.000...",43,"[25.0, 524.0, 588.0, 598.0, 883.0, 884.0, 885....","[1, 9, 10, 10, 14, 14, 14, 14, 14, 14, 9, 9, 9...","[1.5, 9.3, 10.9, 10.10, 14.4, 14.4, 14.4, 14.4...","['Working Fire Brigade Water Tankers, No.of Fi...","['General Information', 'Transport and Communi...","['No.of Fire Stations on 31-03-2016 In Nos', '..."
6,7,"Ensure access to affordable, reliable, sustain...","['ensure', 'access', 'affordable', 'reliable',...",ensure access to affordable reliable sustainab...,"[['modern', 'energy'], ['ensure'], ['affordabl...",[[ 0.45836502 -0.1715995 -0.23113549 ... -0.4...,"[515, 516, 517, 518, 519, 520, 898, 930, 931, ...","[0.8285528169164933, 0.5270143992244825, 0.471...",54,"[523.0, 524.0, 525.0, 526.0, 527.0, 528.0, 905...","[9, 9, 9, 9, 9, 9, 15, 16, 16, 16, 16, 16, 16,...","[9.3, 9.3, 9.3, 9.3, 9.3, 9.3, 15.2, 16.1, 16....","['Pakka,Panchayat Roads,Rural Roads Length on ...","['Transport and Communication', 'Transport and...",['Rural Roads Length and Number of Habitations...
7,8,"Promote sustained, inclusive and sustainable e...","['promote', 'sustained', 'inclusive', 'sustain...",promote sustained inclusive and sustainable ec...,"[['full', 'productive'], ['promote', 'sustaine...",[[-2.96179995e-01 2.15652004e-01 -2.02702441e...,"[408, 410, 861, 862, 872, 873, 874, 875, 381, ...","[0.5318692418751239, 0.5479780674858141, 0.703...",56,"[416.0, 418.0, 868.0, 869.0, 879.0, 880.0, 881...","[6, 6, 14, 14, 14, 14, 14, 14, 5, 9, 9, 9, 9, ...","[6.3, 6.3, 14.1, 14.1, 14.3, 14.3, 14.3, 14.3,...","['Employment,Current year 2015-16, Small Scale...","['Industries', 'Industries', 'Rural Developmen...",['Small Scale Industrial Units Setup Year 2015...
8,9,"Build resilient infrastructure, promote inclus...","['build', 'resilient', 'infrastructure', 'prom...",build resilient infrastructure promote inclusi...,"[['promote'], ['inclusive', 'sustainable'], ['...",[[ 0.045182 0.069974 -0.61168998 ... 0.1...,"[573, 574, 575, 576, 577, 578, 579, 580, 581, ...","[0.5658785348581976, 0.4676713963912552, 0.526...",25,"[581.0, 582.0, 583.0, 584.0, 585.0, 586.0, 587...","[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 13, 1...","[10.9, 10.9, 10.9, 10.9, 10.9, 10.9, 10.9, 10....","['No.of Primary Schools,Infrastructure Facilit...","['Education', 'Education', 'Education', 'Educa...",['Infrastructure Facilities in Primary schools...
9,10,Reduce inequality within the State,"['reduce', 'inequality', 'state']",reduce inequality within the state,"[['reduce'], ['inequality'], ['state']]",[[ 2.19090000e-01 3.54759991e-01 9.81779955e...,"[512, 832, 833, 834, 835, 836, 53, 54, 49, 50,...","[0.5343018032154383, 0.5327294572224268, 0.533...",30,"[520.0, 839.0, 840.0, 841.0, 842.0, 843.0, 56....","[9, 13, 13, 13, 13, 13, 1, 1, 1, 1, 1, 1, 11, ...","[9.2, 13.1, 13.1, 13.1, 13.1, 13.1, 1.10, 1.10...","['State Highway,P.w.d. Road Length in Km, P.w....","['Transport and Communication', 'Women Child D...","['P.w.d. Road Length on 31-03-2016', 'Stree Sh..."


In [20]:
# Load the file at absolute_path_sem_threshold (Semantic_threshold_map.csv)
# and display it. The listed converter columns hold list literals as text;
# each cell is parsed with pd.eval.
# NOTE(review): pd.eval evaluates the cell text as an expression; consider
# ast.literal_eval if this CSV could come from an untrusted source.
# NOTE(review): 'top_n_similarities', 'Target_id' and 'Indicator_id' are not
# among the columns shown for this file, so those entries presumably have no
# effect - confirm.
sdg_threshold = pd.read_csv(
    absolute_path_sem_threshold,
    sep=";",
    dtype={'Goal No.': str, 'Target_id': str, 'Indicator_id': str},
    converters={
        'top_n_attr': pd.eval,
        'top_n_table_id': pd.eval,
        'top_n_chapter_id': pd.eval,
        'top_n_similarities': pd.eval,
    },
)
sdg_threshold

Unnamed: 0,Goal No.,Goal,new_goal,new_goal_str,keywords,vec,top_n_index,top_n_count,top_n_attr,top_n_chapter_id,top_n_table_id,top_n_description,top_n_chapter_name,top_n_table_name
0,1,End poverty in all its forms everywhere,"['end', 'poverty', 'form', 'everywhere']",end poverty in all its forms everywhere,"[['end', 'poverty'], ['end'], ['forms', 'every...",[[ 0.004565 0.380225 0.28172 ... -0.0...,"[25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 3...",75,"[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35....","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1.6, 1.6, 1.6, 1.6, 1.6, 1.6, 1.7, 1.7, 1.7, ...","['Urban,Ration shops, Ration Shops And below p...","['General Information', 'General Information',...",['Ration Shops And below poverty line Ration C...
1,2,"End hunger, achieve food security and improved...","['end', 'hunger', 'achieve', 'food', 'security...",end hunger achieve food security and improved ...,"[['sustainable', 'agriculture'], ['security', ...",[[ 0.079023 0.0409187 -0.25506 ... -0.3...,"[199, 232, 286, 418, 419, 148, 150, 151, 152, ...",362,"[207.0, 240.0, 294.0, 426.0, 427.0, 156.0, 158...","[4, 4, 4, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ...","[4.3, 4.6, 4.12, 6.4, 6.4, 4.1, 4.1, 4.1, 4.1,...","['Total Food Grains,pulses Hect,Area under pri...","['Agriculture, Horticulture and Sericulture', ...","['Area under principal crops Year 2014-15', 'P..."
2,3,Ensure healthy lives and promote well-being fo...,"['ensure', 'healthy', 'life', 'promote', 'well...",ensure healthy lives and promote well being fo...,"[['well'], ['well', 'ages'], ['ensure'], ['pro...",[[-0.13508999 0.35907 0.1453 ... -0.4...,"[169, 170, 171, 172, 173, 174, 882, 883, 903, ...",186,"[177.0, 178.0, 179.0, 180.0, 181.0, 182.0, 889...","[4, 4, 4, 4, 4, 4, 14, 14, 15, 9, 9, 9, 9, 9, ...","[4.2, 4.2, 4.2, 4.2, 4.2, 4.2, 14.5, 14.5, 15....","['Nos,Wells,2014-15 Net area irrigated Hectare...","['Agriculture, Horticulture and Sericulture', ...",['Gross and Net area Irrigated under Different...
3,4,Ensure inclusive and equitable quality educati...,"['ensure', 'inclusive', 'equitable', 'quality'...",ensure inclusive and equitable quality educati...,"[['education'], ['ensure', 'inclusive'], ['pro...",[[-0.15599 -0.48863 -0.095793 ... -0.7...,"[526, 527, 529, 530, 534, 535, 536, 538, 539, ...",341,"[534.0, 535.0, 537.0, 538.0, 542.0, 543.0, 544...","[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1...","[10.1, 10.1, 10.1, 10.1, 10.1, 10.2, 10.2, 10....","['Male,Rural,Literates, Literates per 2011 Cen...","['Education', 'Education', 'Education', 'Educa...","['Literates per 2011 Census', 'Literates per 2..."
4,5,Achieve gender equality and empower all women ...,"['achieve', 'gender', 'equality', 'empower', '...",achieve gender equality and empower all women ...,"[['women'], ['equality', 'empower'], ['equalit...",[[-0.43895 0.47641999 0.18844 ... 0.2...,"[594, 869, 547, 551, 555, 568, 571, 575, 585, ...",388,"[602.0, 876.0, 555.0, 559.0, 563.0, 576.0, 579...","[10, 14, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1...","[10.11, 14.3, 10.3, 10.4, 10.5, 10.8, 10.8, 10...","['Men,No. of Lecturers, No.of P U Colleges Stu...","['Education', 'Rural Development & Panchayat R...",['No.of P U Colleges Students and Lecturers Ye...
5,6,Ensure availability and sustainable management...,"['ensure', 'availability', 'sustainable', 'man...",ensure availability and sustainable management...,"[['ensure', 'availability'], ['ensure'], ['wat...",[[ 0.21560501 -0.03204 0.1458025 ... -0.0...,"[22, 516, 580, 590, 876, 877, 878, 879, 880, 8...",211,"[25.0, 524.0, 588.0, 598.0, 883.0, 884.0, 885....","[1, 9, 10, 10, 14, 14, 14, 14, 14, 14, 14, 14,...","[1.5, 9.3, 10.9, 10.10, 14.4, 14.4, 14.4, 14.4...","['Working Fire Brigade Water Tankers, No.of Fi...","['General Information', 'Transport and Communi...","['No.of Fire Stations on 31-03-2016 In Nos', '..."
6,7,"Ensure access to affordable, reliable, sustain...","['ensure', 'access', 'affordable', 'reliable',...",ensure access to affordable reliable sustainab...,"[['modern', 'energy'], ['ensure'], ['affordabl...",[[ 0.45836502 -0.1715995 -0.23113549 ... -0.4...,"[515, 516, 517, 518, 519, 520, 898, 930, 931, ...",395,"[523.0, 524.0, 525.0, 526.0, 527.0, 528.0, 905...","[9, 9, 9, 9, 9, 9, 15, 16, 16, 16, 16, 16, 16,...","[9.3, 9.3, 9.3, 9.3, 9.3, 9.3, 15.2, 16.1, 16....","['Pakka,Panchayat Roads,Rural Roads Length on ...","['Transport and Communication', 'Transport and...",['Rural Roads Length and Number of Habitations...
7,8,"Promote sustained, inclusive and sustainable e...","['promote', 'sustained', 'inclusive', 'sustain...",promote sustained inclusive and sustainable ec...,"[['full', 'productive'], ['promote', 'sustaine...",[[-2.96179995e-01 2.15652004e-01 -2.02702441e...,"[408, 410, 861, 862, 872, 873, 874, 875, 381, ...",118,"[416.0, 418.0, 868.0, 869.0, 879.0, 880.0, 881...","[6, 6, 14, 14, 14, 14, 14, 14, 5, 9, 9, 9, 9, ...","[6.3, 6.3, 14.1, 14.1, 14.3, 14.3, 14.3, 14.3,...","['Employment,Current year 2015-16, Small Scale...","['Industries', 'Industries', 'Rural Developmen...",['Small Scale Industrial Units Setup Year 2015...
8,9,"Build resilient infrastructure, promote inclus...","['build', 'resilient', 'infrastructure', 'prom...",build resilient infrastructure promote inclusi...,"[['promote'], ['inclusive', 'sustainable'], ['...",[[ 0.045182 0.069974 -0.61168998 ... 0.1...,"[573, 574, 575, 576, 577, 578, 579, 580, 581, ...",76,"[581.0, 582.0, 583.0, 584.0, 585.0, 586.0, 587...","[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1...","[10.9, 10.9, 10.9, 10.9, 10.9, 10.9, 10.9, 10....","['No.of Primary Schools,Infrastructure Facilit...","['Education', 'Education', 'Education', 'Educa...",['Infrastructure Facilities in Primary schools...
9,10,Reduce inequality within the State,"['reduce', 'inequality', 'state']",reduce inequality within the state,"[['reduce'], ['inequality'], ['state']]",[[ 2.19090000e-01 3.54759991e-01 9.81779955e...,"[512, 832, 833, 834, 835, 836, 53, 54, 49, 50,...",125,"[520.0, 839.0, 840.0, 841.0, 842.0, 843.0, 56....","[9, 13, 13, 13, 13, 13, 1, 1, 1, 1, 1, 1, 11, ...","[9.2, 13.1, 13.1, 13.1, 13.1, 13.1, 1.10, 1.10...","['State Highway,P.w.d. Road Length in Km, P.w....","['Transport and Communication', 'Women Child D...","['P.w.d. Road Length on 31-03-2016', 'Stree Sh..."


In [21]:
# Continue with independent copies of the three loaded frames
# (string map, semantic map, semantic-threshold map - in that order).
result_sdg_new, result_sdg_sim_new, result_sdg_threshold_new = (
    loaded.copy() for loaded in (sdg_string, sdg_sim, sdg_threshold)
)

In [22]:
# Keep only the goal number and its attribute list. The explicit .copy()
# makes this an independent frame, so the in-place rename and the column
# assignments in later cells do not operate on a column subset of another
# frame (which can raise SettingWithCopyWarning).
result_sdg_new = result_sdg_new[['Goal No.', 'top_n_attr']].copy()

In [23]:
# Preview the first rows of the reduced string-map frame.
result_sdg_new.head()

Unnamed: 0,Goal No.,top_n_attr
0,1,"[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35...."
1,2,"[207.0, 240.0, 294.0, 156.0, 157.0, 158.0, 159..."
2,3,"[120.0, 121.0, 122.0, 123.0, 124.0, 125.0, 126..."
3,4,"[534.0, 535.0, 536.0, 537.0, 538.0, 539.0, 540..."
4,5,"[863.0, 555.0, 559.0, 563.0, 576.0, 579.0, 583..."


In [24]:
# Keep only the goal number and its attribute list from the semantic map.
result_sdg_sim_new = result_sdg_sim_new.loc[:, ['Goal No.', 'top_n_attr']]

In [25]:
# Preview the first rows of the reduced semantic-map frame.
result_sdg_sim_new.head()

Unnamed: 0,Goal No.,top_n_attr
0,1,"[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35...."
1,2,"[207.0, 240.0, 294.0, 426.0, 427.0, 156.0, 158..."
2,3,"[177.0, 178.0, 179.0, 180.0, 181.0, 182.0, 889..."
3,4,"[534.0, 535.0, 537.0, 538.0, 542.0, 543.0, 544..."
4,5,"[602.0, 876.0, 555.0, 559.0, 563.0, 576.0, 579..."


In [26]:
# Keep only the goal number and its attribute list from the semantic-threshold map.
result_sdg_threshold_new = result_sdg_threshold_new.loc[:, ['Goal No.', 'top_n_attr']]

In [27]:
# Preview the first rows of the reduced semantic-threshold frame.
result_sdg_threshold_new.head()

Unnamed: 0,Goal No.,top_n_attr
0,1,"[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35...."
1,2,"[207.0, 240.0, 294.0, 426.0, 427.0, 156.0, 158..."
2,3,"[177.0, 178.0, 179.0, 180.0, 181.0, 182.0, 889..."
3,4,"[534.0, 535.0, 537.0, 538.0, 542.0, 543.0, 544..."
4,5,"[602.0, 876.0, 555.0, 559.0, 563.0, 576.0, 579..."


In [28]:
# Label the string-map attribute list as source "A". The result is rebound
# rather than renamed in place, because result_sdg_new is a column subset of
# another frame and in-place changes to it can raise SettingWithCopyWarning.
result_sdg_new = result_sdg_new.rename(columns={'top_n_attr': 'top_n_attr_A'})

In [29]:
# Add the semantic-threshold list as source "B" and the semantic list as
# source "C". assign() builds a new frame instead of setting columns on a
# column subset of another frame.
# NOTE(review): the new columns are aligned on the row index, not on
# 'Goal No.'; this assumes all three frames list the goals in the same row
# order - confirm, or merge on 'Goal No.' instead.
result_sdg_new = result_sdg_new.assign(
    top_n_attr_B=result_sdg_threshold_new.loc[:, 'top_n_attr'],
    top_n_attr_C=result_sdg_sim_new.loc[:, 'top_n_attr'],
)

In [30]:
# Display the combined frame holding the A, B and C attribute lists for each goal.
result_sdg_new

Unnamed: 0,Goal No.,top_n_attr_A,top_n_attr_B,top_n_attr_C
0,1,"[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35....","[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35....","[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35...."
1,2,"[207.0, 240.0, 294.0, 156.0, 157.0, 158.0, 159...","[207.0, 240.0, 294.0, 426.0, 427.0, 156.0, 158...","[207.0, 240.0, 294.0, 426.0, 427.0, 156.0, 158..."
2,3,"[120.0, 121.0, 122.0, 123.0, 124.0, 125.0, 126...","[177.0, 178.0, 179.0, 180.0, 181.0, 182.0, 889...","[177.0, 178.0, 179.0, 180.0, 181.0, 182.0, 889..."
3,4,"[534.0, 535.0, 536.0, 537.0, 538.0, 539.0, 540...","[534.0, 535.0, 537.0, 538.0, 542.0, 543.0, 544...","[534.0, 535.0, 537.0, 538.0, 542.0, 543.0, 544..."
4,5,"[863.0, 555.0, 559.0, 563.0, 576.0, 579.0, 583...","[602.0, 876.0, 555.0, 559.0, 563.0, 576.0, 579...","[602.0, 876.0, 555.0, 559.0, 563.0, 576.0, 579..."
5,6,"[25.0, 524.0, 588.0, 598.0, 883.0, 884.0, 885....","[25.0, 524.0, 588.0, 598.0, 883.0, 884.0, 885....","[25.0, 524.0, 588.0, 598.0, 883.0, 884.0, 885...."
6,7,"[523.0, 524.0, 525.0, 526.0, 527.0, 528.0, 905...","[523.0, 524.0, 525.0, 526.0, 527.0, 528.0, 905...","[523.0, 524.0, 525.0, 526.0, 527.0, 528.0, 905..."
7,8,"[389.0, 416.0, 418.0, 868.0, 869.0, 879.0, 880...","[416.0, 418.0, 868.0, 869.0, 879.0, 880.0, 881...","[416.0, 418.0, 868.0, 869.0, 879.0, 880.0, 881..."
8,9,"[581.0, 582.0, 583.0, 584.0, 585.0, 586.0, 587...","[581.0, 582.0, 583.0, 584.0, 585.0, 586.0, 587...","[581.0, 582.0, 583.0, 584.0, 585.0, 586.0, 587..."
9,10,[520.0],"[520.0, 839.0, 840.0, 841.0, 842.0, 843.0, 56....","[520.0, 839.0, 840.0, 841.0, 842.0, 843.0, 56...."


In [31]:
# Row by row, hand the record to combine_lists_BC (defined outside this cell)
# and keep whatever it returns in 'B_plus_C'.
# NOTE(review): presumably the merged B and C lists -- confirm in the helper.
result_sdg_new['B_plus_C'] = result_sdg_new.apply(combine_lists_BC, axis='columns')
result_sdg_new

Unnamed: 0,Goal No.,top_n_attr_A,top_n_attr_B,top_n_attr_C,B_plus_C
0,1,"[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35....","[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35....","[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35....","[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35...."
1,2,"[207.0, 240.0, 294.0, 156.0, 157.0, 158.0, 159...","[207.0, 240.0, 294.0, 426.0, 427.0, 156.0, 158...","[207.0, 240.0, 294.0, 426.0, 427.0, 156.0, 158...","[207.0, 240.0, 294.0, 426.0, 427.0, 156.0, 158..."
2,3,"[120.0, 121.0, 122.0, 123.0, 124.0, 125.0, 126...","[177.0, 178.0, 179.0, 180.0, 181.0, 182.0, 889...","[177.0, 178.0, 179.0, 180.0, 181.0, 182.0, 889...","[177.0, 178.0, 179.0, 180.0, 181.0, 182.0, 889..."
3,4,"[534.0, 535.0, 536.0, 537.0, 538.0, 539.0, 540...","[534.0, 535.0, 537.0, 538.0, 542.0, 543.0, 544...","[534.0, 535.0, 537.0, 538.0, 542.0, 543.0, 544...","[534.0, 535.0, 537.0, 538.0, 542.0, 543.0, 544..."
4,5,"[863.0, 555.0, 559.0, 563.0, 576.0, 579.0, 583...","[602.0, 876.0, 555.0, 559.0, 563.0, 576.0, 579...","[602.0, 876.0, 555.0, 559.0, 563.0, 576.0, 579...","[602.0, 876.0, 555.0, 559.0, 563.0, 576.0, 579..."
5,6,"[25.0, 524.0, 588.0, 598.0, 883.0, 884.0, 885....","[25.0, 524.0, 588.0, 598.0, 883.0, 884.0, 885....","[25.0, 524.0, 588.0, 598.0, 883.0, 884.0, 885....","[25.0, 524.0, 588.0, 598.0, 883.0, 884.0, 885...."
6,7,"[523.0, 524.0, 525.0, 526.0, 527.0, 528.0, 905...","[523.0, 524.0, 525.0, 526.0, 527.0, 528.0, 905...","[523.0, 524.0, 525.0, 526.0, 527.0, 528.0, 905...","[523.0, 524.0, 525.0, 526.0, 527.0, 528.0, 905..."
7,8,"[389.0, 416.0, 418.0, 868.0, 869.0, 879.0, 880...","[416.0, 418.0, 868.0, 869.0, 879.0, 880.0, 881...","[416.0, 418.0, 868.0, 869.0, 879.0, 880.0, 881...","[416.0, 418.0, 868.0, 869.0, 879.0, 880.0, 881..."
8,9,"[581.0, 582.0, 583.0, 584.0, 585.0, 586.0, 587...","[581.0, 582.0, 583.0, 584.0, 585.0, 586.0, 587...","[581.0, 582.0, 583.0, 584.0, 585.0, 586.0, 587...","[581.0, 582.0, 583.0, 584.0, 585.0, 586.0, 587..."
9,10,[520.0],"[520.0, 839.0, 840.0, 841.0, 842.0, 843.0, 56....","[520.0, 839.0, 840.0, 841.0, 842.0, 843.0, 56....","[520.0, 839.0, 840.0, 841.0, 842.0, 843.0, 56...."


In [32]:
# Row by row, hand the record to find_intersection_ABC (defined outside this
# cell) and keep whatever it returns in 'A_and_BC'.
# NOTE(review): presumably the ids of A that also occur in B_plus_C -- confirm.
result_sdg_new['A_and_BC'] = result_sdg_new.apply(find_intersection_ABC, axis='columns')
result_sdg_new

Unnamed: 0,Goal No.,top_n_attr_A,top_n_attr_B,top_n_attr_C,B_plus_C,A_and_BC
0,1,"[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35....","[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35....","[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35....","[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35....","[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35...."
1,2,"[207.0, 240.0, 294.0, 156.0, 157.0, 158.0, 159...","[207.0, 240.0, 294.0, 426.0, 427.0, 156.0, 158...","[207.0, 240.0, 294.0, 426.0, 427.0, 156.0, 158...","[207.0, 240.0, 294.0, 426.0, 427.0, 156.0, 158...","[207.0, 240.0, 294.0, 156.0, 157.0, 158.0, 159..."
2,3,"[120.0, 121.0, 122.0, 123.0, 124.0, 125.0, 126...","[177.0, 178.0, 179.0, 180.0, 181.0, 182.0, 889...","[177.0, 178.0, 179.0, 180.0, 181.0, 182.0, 889...","[177.0, 178.0, 179.0, 180.0, 181.0, 182.0, 889...","[120.0, 121.0, 122.0, 123.0, 124.0, 125.0, 126..."
3,4,"[534.0, 535.0, 536.0, 537.0, 538.0, 539.0, 540...","[534.0, 535.0, 537.0, 538.0, 542.0, 543.0, 544...","[534.0, 535.0, 537.0, 538.0, 542.0, 543.0, 544...","[534.0, 535.0, 537.0, 538.0, 542.0, 543.0, 544...","[534.0, 535.0, 536.0, 537.0, 538.0, 539.0, 540..."
4,5,"[863.0, 555.0, 559.0, 563.0, 576.0, 579.0, 583...","[602.0, 876.0, 555.0, 559.0, 563.0, 576.0, 579...","[602.0, 876.0, 555.0, 559.0, 563.0, 576.0, 579...","[602.0, 876.0, 555.0, 559.0, 563.0, 576.0, 579...","[863.0, 555.0, 559.0, 563.0, 576.0, 579.0, 583..."
5,6,"[25.0, 524.0, 588.0, 598.0, 883.0, 884.0, 885....","[25.0, 524.0, 588.0, 598.0, 883.0, 884.0, 885....","[25.0, 524.0, 588.0, 598.0, 883.0, 884.0, 885....","[25.0, 524.0, 588.0, 598.0, 883.0, 884.0, 885....","[25.0, 524.0, 588.0, 598.0, 883.0, 884.0, 885...."
6,7,"[523.0, 524.0, 525.0, 526.0, 527.0, 528.0, 905...","[523.0, 524.0, 525.0, 526.0, 527.0, 528.0, 905...","[523.0, 524.0, 525.0, 526.0, 527.0, 528.0, 905...","[523.0, 524.0, 525.0, 526.0, 527.0, 528.0, 905...","[523.0, 524.0, 525.0, 526.0, 527.0, 528.0, 905..."
7,8,"[389.0, 416.0, 418.0, 868.0, 869.0, 879.0, 880...","[416.0, 418.0, 868.0, 869.0, 879.0, 880.0, 881...","[416.0, 418.0, 868.0, 869.0, 879.0, 880.0, 881...","[416.0, 418.0, 868.0, 869.0, 879.0, 880.0, 881...","[389.0, 416.0, 418.0, 868.0, 869.0, 879.0, 880..."
8,9,"[581.0, 582.0, 583.0, 584.0, 585.0, 586.0, 587...","[581.0, 582.0, 583.0, 584.0, 585.0, 586.0, 587...","[581.0, 582.0, 583.0, 584.0, 585.0, 586.0, 587...","[581.0, 582.0, 583.0, 584.0, 585.0, 586.0, 587...","[581.0, 582.0, 583.0, 584.0, 585.0, 586.0, 587..."
9,10,[520.0],"[520.0, 839.0, 840.0, 841.0, 842.0, 843.0, 56....","[520.0, 839.0, 840.0, 841.0, 842.0, 843.0, 56....","[520.0, 839.0, 840.0, 841.0, 842.0, 843.0, 56....",[520.0]


In [33]:
# Row by row, hand the record to find_difference_A (defined outside this cell)
# and keep whatever it returns in 'A'.
# NOTE(review): presumably the ids found only in A; every displayed row is an
# empty list -- confirm that is expected.
result_sdg_new['A'] = result_sdg_new.apply(find_difference_A, axis='columns')
result_sdg_new

Unnamed: 0,Goal No.,top_n_attr_A,top_n_attr_B,top_n_attr_C,B_plus_C,A_and_BC,A
0,1,"[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35....","[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35....","[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35....","[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35....","[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35....",[]
1,2,"[207.0, 240.0, 294.0, 156.0, 157.0, 158.0, 159...","[207.0, 240.0, 294.0, 426.0, 427.0, 156.0, 158...","[207.0, 240.0, 294.0, 426.0, 427.0, 156.0, 158...","[207.0, 240.0, 294.0, 426.0, 427.0, 156.0, 158...","[207.0, 240.0, 294.0, 156.0, 157.0, 158.0, 159...",[]
2,3,"[120.0, 121.0, 122.0, 123.0, 124.0, 125.0, 126...","[177.0, 178.0, 179.0, 180.0, 181.0, 182.0, 889...","[177.0, 178.0, 179.0, 180.0, 181.0, 182.0, 889...","[177.0, 178.0, 179.0, 180.0, 181.0, 182.0, 889...","[120.0, 121.0, 122.0, 123.0, 124.0, 125.0, 126...",[]
3,4,"[534.0, 535.0, 536.0, 537.0, 538.0, 539.0, 540...","[534.0, 535.0, 537.0, 538.0, 542.0, 543.0, 544...","[534.0, 535.0, 537.0, 538.0, 542.0, 543.0, 544...","[534.0, 535.0, 537.0, 538.0, 542.0, 543.0, 544...","[534.0, 535.0, 536.0, 537.0, 538.0, 539.0, 540...",[]
4,5,"[863.0, 555.0, 559.0, 563.0, 576.0, 579.0, 583...","[602.0, 876.0, 555.0, 559.0, 563.0, 576.0, 579...","[602.0, 876.0, 555.0, 559.0, 563.0, 576.0, 579...","[602.0, 876.0, 555.0, 559.0, 563.0, 576.0, 579...","[863.0, 555.0, 559.0, 563.0, 576.0, 579.0, 583...",[]
5,6,"[25.0, 524.0, 588.0, 598.0, 883.0, 884.0, 885....","[25.0, 524.0, 588.0, 598.0, 883.0, 884.0, 885....","[25.0, 524.0, 588.0, 598.0, 883.0, 884.0, 885....","[25.0, 524.0, 588.0, 598.0, 883.0, 884.0, 885....","[25.0, 524.0, 588.0, 598.0, 883.0, 884.0, 885....",[]
6,7,"[523.0, 524.0, 525.0, 526.0, 527.0, 528.0, 905...","[523.0, 524.0, 525.0, 526.0, 527.0, 528.0, 905...","[523.0, 524.0, 525.0, 526.0, 527.0, 528.0, 905...","[523.0, 524.0, 525.0, 526.0, 527.0, 528.0, 905...","[523.0, 524.0, 525.0, 526.0, 527.0, 528.0, 905...",[]
7,8,"[389.0, 416.0, 418.0, 868.0, 869.0, 879.0, 880...","[416.0, 418.0, 868.0, 869.0, 879.0, 880.0, 881...","[416.0, 418.0, 868.0, 869.0, 879.0, 880.0, 881...","[416.0, 418.0, 868.0, 869.0, 879.0, 880.0, 881...","[389.0, 416.0, 418.0, 868.0, 869.0, 879.0, 880...",[]
8,9,"[581.0, 582.0, 583.0, 584.0, 585.0, 586.0, 587...","[581.0, 582.0, 583.0, 584.0, 585.0, 586.0, 587...","[581.0, 582.0, 583.0, 584.0, 585.0, 586.0, 587...","[581.0, 582.0, 583.0, 584.0, 585.0, 586.0, 587...","[581.0, 582.0, 583.0, 584.0, 585.0, 586.0, 587...",[]
9,10,[520.0],"[520.0, 839.0, 840.0, 841.0, 842.0, 843.0, 56....","[520.0, 839.0, 840.0, 841.0, 842.0, 843.0, 56....","[520.0, 839.0, 840.0, 841.0, 842.0, 843.0, 56....",[520.0],[]


In [34]:
# Row by row, hand the record to find_intersection_BC (defined outside this
# cell) and keep whatever it returns in 'B_and_C'.
# NOTE(review): presumably the ids shared by B and C -- confirm in the helper.
result_sdg_new['B_and_C'] = result_sdg_new.apply(find_intersection_BC, axis='columns')
result_sdg_new

Unnamed: 0,Goal No.,top_n_attr_A,top_n_attr_B,top_n_attr_C,B_plus_C,A_and_BC,A,B_and_C
0,1,"[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35....","[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35....","[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35....","[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35....","[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35....",[],"[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35...."
1,2,"[207.0, 240.0, 294.0, 156.0, 157.0, 158.0, 159...","[207.0, 240.0, 294.0, 426.0, 427.0, 156.0, 158...","[207.0, 240.0, 294.0, 426.0, 427.0, 156.0, 158...","[207.0, 240.0, 294.0, 426.0, 427.0, 156.0, 158...","[207.0, 240.0, 294.0, 156.0, 157.0, 158.0, 159...",[],"[207.0, 240.0, 294.0, 426.0, 427.0, 156.0, 158..."
2,3,"[120.0, 121.0, 122.0, 123.0, 124.0, 125.0, 126...","[177.0, 178.0, 179.0, 180.0, 181.0, 182.0, 889...","[177.0, 178.0, 179.0, 180.0, 181.0, 182.0, 889...","[177.0, 178.0, 179.0, 180.0, 181.0, 182.0, 889...","[120.0, 121.0, 122.0, 123.0, 124.0, 125.0, 126...",[],"[177.0, 178.0, 179.0, 180.0, 181.0, 182.0, 889..."
3,4,"[534.0, 535.0, 536.0, 537.0, 538.0, 539.0, 540...","[534.0, 535.0, 537.0, 538.0, 542.0, 543.0, 544...","[534.0, 535.0, 537.0, 538.0, 542.0, 543.0, 544...","[534.0, 535.0, 537.0, 538.0, 542.0, 543.0, 544...","[534.0, 535.0, 536.0, 537.0, 538.0, 539.0, 540...",[],"[534.0, 535.0, 537.0, 538.0, 542.0, 543.0, 544..."
4,5,"[863.0, 555.0, 559.0, 563.0, 576.0, 579.0, 583...","[602.0, 876.0, 555.0, 559.0, 563.0, 576.0, 579...","[602.0, 876.0, 555.0, 559.0, 563.0, 576.0, 579...","[602.0, 876.0, 555.0, 559.0, 563.0, 576.0, 579...","[863.0, 555.0, 559.0, 563.0, 576.0, 579.0, 583...",[],"[602.0, 876.0, 555.0, 559.0, 563.0, 576.0, 579..."
5,6,"[25.0, 524.0, 588.0, 598.0, 883.0, 884.0, 885....","[25.0, 524.0, 588.0, 598.0, 883.0, 884.0, 885....","[25.0, 524.0, 588.0, 598.0, 883.0, 884.0, 885....","[25.0, 524.0, 588.0, 598.0, 883.0, 884.0, 885....","[25.0, 524.0, 588.0, 598.0, 883.0, 884.0, 885....",[],"[25.0, 524.0, 588.0, 598.0, 883.0, 884.0, 885...."
6,7,"[523.0, 524.0, 525.0, 526.0, 527.0, 528.0, 905...","[523.0, 524.0, 525.0, 526.0, 527.0, 528.0, 905...","[523.0, 524.0, 525.0, 526.0, 527.0, 528.0, 905...","[523.0, 524.0, 525.0, 526.0, 527.0, 528.0, 905...","[523.0, 524.0, 525.0, 526.0, 527.0, 528.0, 905...",[],"[523.0, 524.0, 525.0, 526.0, 527.0, 528.0, 905..."
7,8,"[389.0, 416.0, 418.0, 868.0, 869.0, 879.0, 880...","[416.0, 418.0, 868.0, 869.0, 879.0, 880.0, 881...","[416.0, 418.0, 868.0, 869.0, 879.0, 880.0, 881...","[416.0, 418.0, 868.0, 869.0, 879.0, 880.0, 881...","[389.0, 416.0, 418.0, 868.0, 869.0, 879.0, 880...",[],"[416.0, 418.0, 868.0, 869.0, 879.0, 880.0, 881..."
8,9,"[581.0, 582.0, 583.0, 584.0, 585.0, 586.0, 587...","[581.0, 582.0, 583.0, 584.0, 585.0, 586.0, 587...","[581.0, 582.0, 583.0, 584.0, 585.0, 586.0, 587...","[581.0, 582.0, 583.0, 584.0, 585.0, 586.0, 587...","[581.0, 582.0, 583.0, 584.0, 585.0, 586.0, 587...",[],"[581.0, 582.0, 583.0, 584.0, 585.0, 586.0, 587..."
9,10,[520.0],"[520.0, 839.0, 840.0, 841.0, 842.0, 843.0, 56....","[520.0, 839.0, 840.0, 841.0, 842.0, 843.0, 56....","[520.0, 839.0, 840.0, 841.0, 842.0, 843.0, 56....",[520.0],[],"[520.0, 839.0, 840.0, 841.0, 842.0, 843.0, 56...."


In [35]:
# Row by row, hand the record to find_difference_B (defined outside this cell)
# and keep whatever it returns in 'B_minus_all'.
# NOTE(review): which columns are subtracted from B is not visible here --
# confirm in the helper.
result_sdg_new['B_minus_all'] = result_sdg_new.apply(find_difference_B, axis='columns')
result_sdg_new

Unnamed: 0,Goal No.,top_n_attr_A,top_n_attr_B,top_n_attr_C,B_plus_C,A_and_BC,A,B_and_C,B_minus_all
0,1,"[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35....","[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35....","[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35....","[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35....","[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35....",[],"[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35....","[882.0, 142.0, 143.0, 144.0, 183.0, 184.0, 185..."
1,2,"[207.0, 240.0, 294.0, 156.0, 157.0, 158.0, 159...","[207.0, 240.0, 294.0, 426.0, 427.0, 156.0, 158...","[207.0, 240.0, 294.0, 426.0, 427.0, 156.0, 158...","[207.0, 240.0, 294.0, 426.0, 427.0, 156.0, 158...","[207.0, 240.0, 294.0, 156.0, 157.0, 158.0, 159...",[],"[207.0, 240.0, 294.0, 426.0, 427.0, 156.0, 158...","[38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45...."
2,3,"[120.0, 121.0, 122.0, 123.0, 124.0, 125.0, 126...","[177.0, 178.0, 179.0, 180.0, 181.0, 182.0, 889...","[177.0, 178.0, 179.0, 180.0, 181.0, 182.0, 889...","[177.0, 178.0, 179.0, 180.0, 181.0, 182.0, 889...","[120.0, 121.0, 122.0, 123.0, 124.0, 125.0, 126...",[],"[177.0, 178.0, 179.0, 180.0, 181.0, 182.0, 889...","[849.0, 850.0, 851.0, 852.0, 853.0, 854.0, 855..."
3,4,"[534.0, 535.0, 536.0, 537.0, 538.0, 539.0, 540...","[534.0, 535.0, 537.0, 538.0, 542.0, 543.0, 544...","[534.0, 535.0, 537.0, 538.0, 542.0, 543.0, 544...","[534.0, 535.0, 537.0, 538.0, 542.0, 543.0, 544...","[534.0, 535.0, 536.0, 537.0, 538.0, 539.0, 540...",[],"[534.0, 535.0, 537.0, 538.0, 542.0, 543.0, 544...","[843.0, 864.0, 707.0, 708.0, 709.0, 711.0, 712..."
4,5,"[863.0, 555.0, 559.0, 563.0, 576.0, 579.0, 583...","[602.0, 876.0, 555.0, 559.0, 563.0, 576.0, 579...","[602.0, 876.0, 555.0, 559.0, 563.0, 576.0, 579...","[602.0, 876.0, 555.0, 559.0, 563.0, 576.0, 579...","[863.0, 555.0, 559.0, 563.0, 576.0, 579.0, 583...",[],"[602.0, 876.0, 555.0, 559.0, 563.0, 576.0, 579...","[396.0, 399.0, 402.0, 405.0, 408.0, 411.0, 544..."
5,6,"[25.0, 524.0, 588.0, 598.0, 883.0, 884.0, 885....","[25.0, 524.0, 588.0, 598.0, 883.0, 884.0, 885....","[25.0, 524.0, 588.0, 598.0, 883.0, 884.0, 885....","[25.0, 524.0, 588.0, 598.0, 883.0, 884.0, 885....","[25.0, 524.0, 588.0, 598.0, 883.0, 884.0, 885....",[],"[25.0, 524.0, 588.0, 598.0, 883.0, 884.0, 885....","[870.0, 871.0, 872.0, 873.0, 874.0, 875.0, 876..."
6,7,"[523.0, 524.0, 525.0, 526.0, 527.0, 528.0, 905...","[523.0, 524.0, 525.0, 526.0, 527.0, 528.0, 905...","[523.0, 524.0, 525.0, 526.0, 527.0, 528.0, 905...","[523.0, 524.0, 525.0, 526.0, 527.0, 528.0, 905...","[523.0, 524.0, 525.0, 526.0, 527.0, 528.0, 905...",[],"[523.0, 524.0, 525.0, 526.0, 527.0, 528.0, 905...","[17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24...."
7,8,"[389.0, 416.0, 418.0, 868.0, 869.0, 879.0, 880...","[416.0, 418.0, 868.0, 869.0, 879.0, 880.0, 881...","[416.0, 418.0, 868.0, 869.0, 879.0, 880.0, 881...","[416.0, 418.0, 868.0, 869.0, 879.0, 880.0, 881...","[389.0, 416.0, 418.0, 868.0, 869.0, 879.0, 880...",[],"[416.0, 418.0, 868.0, 869.0, 879.0, 880.0, 881...","[446.0, 717.0, 718.0, 870.0, 871.0, 872.0, 873..."
8,9,"[581.0, 582.0, 583.0, 584.0, 585.0, 586.0, 587...","[581.0, 582.0, 583.0, 584.0, 585.0, 586.0, 587...","[581.0, 582.0, 583.0, 584.0, 585.0, 586.0, 587...","[581.0, 582.0, 583.0, 584.0, 585.0, 586.0, 587...","[581.0, 582.0, 583.0, 584.0, 585.0, 586.0, 587...",[],"[581.0, 582.0, 583.0, 584.0, 585.0, 586.0, 587...","[849.0, 852.0, 856.0, 857.0, 858.0, 859.0, 860..."
9,10,[520.0],"[520.0, 839.0, 840.0, 841.0, 842.0, 843.0, 56....","[520.0, 839.0, 840.0, 841.0, 842.0, 843.0, 56....","[520.0, 839.0, 840.0, 841.0, 842.0, 843.0, 56....",[520.0],[],"[520.0, 839.0, 840.0, 841.0, 842.0, 843.0, 56....","[703.0, 613.0, 614.0, 615.0, 620.0, 621.0, 622..."


In [36]:
# Row by row, hand the record to find_difference_C (defined outside this cell)
# and keep whatever it returns in 'C_minus_all'.
# NOTE(review): every displayed row is an empty list -- confirm that is expected.
result_sdg_new['C_minus_all'] = result_sdg_new.apply(find_difference_C, axis='columns')
result_sdg_new

Unnamed: 0,Goal No.,top_n_attr_A,top_n_attr_B,top_n_attr_C,B_plus_C,A_and_BC,A,B_and_C,B_minus_all,C_minus_all
0,1,"[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35....","[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35....","[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35....","[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35....","[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35....",[],"[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35....","[882.0, 142.0, 143.0, 144.0, 183.0, 184.0, 185...",[]
1,2,"[207.0, 240.0, 294.0, 156.0, 157.0, 158.0, 159...","[207.0, 240.0, 294.0, 426.0, 427.0, 156.0, 158...","[207.0, 240.0, 294.0, 426.0, 427.0, 156.0, 158...","[207.0, 240.0, 294.0, 426.0, 427.0, 156.0, 158...","[207.0, 240.0, 294.0, 156.0, 157.0, 158.0, 159...",[],"[207.0, 240.0, 294.0, 426.0, 427.0, 156.0, 158...","[38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45....",[]
2,3,"[120.0, 121.0, 122.0, 123.0, 124.0, 125.0, 126...","[177.0, 178.0, 179.0, 180.0, 181.0, 182.0, 889...","[177.0, 178.0, 179.0, 180.0, 181.0, 182.0, 889...","[177.0, 178.0, 179.0, 180.0, 181.0, 182.0, 889...","[120.0, 121.0, 122.0, 123.0, 124.0, 125.0, 126...",[],"[177.0, 178.0, 179.0, 180.0, 181.0, 182.0, 889...","[849.0, 850.0, 851.0, 852.0, 853.0, 854.0, 855...",[]
3,4,"[534.0, 535.0, 536.0, 537.0, 538.0, 539.0, 540...","[534.0, 535.0, 537.0, 538.0, 542.0, 543.0, 544...","[534.0, 535.0, 537.0, 538.0, 542.0, 543.0, 544...","[534.0, 535.0, 537.0, 538.0, 542.0, 543.0, 544...","[534.0, 535.0, 536.0, 537.0, 538.0, 539.0, 540...",[],"[534.0, 535.0, 537.0, 538.0, 542.0, 543.0, 544...","[843.0, 864.0, 707.0, 708.0, 709.0, 711.0, 712...",[]
4,5,"[863.0, 555.0, 559.0, 563.0, 576.0, 579.0, 583...","[602.0, 876.0, 555.0, 559.0, 563.0, 576.0, 579...","[602.0, 876.0, 555.0, 559.0, 563.0, 576.0, 579...","[602.0, 876.0, 555.0, 559.0, 563.0, 576.0, 579...","[863.0, 555.0, 559.0, 563.0, 576.0, 579.0, 583...",[],"[602.0, 876.0, 555.0, 559.0, 563.0, 576.0, 579...","[396.0, 399.0, 402.0, 405.0, 408.0, 411.0, 544...",[]
5,6,"[25.0, 524.0, 588.0, 598.0, 883.0, 884.0, 885....","[25.0, 524.0, 588.0, 598.0, 883.0, 884.0, 885....","[25.0, 524.0, 588.0, 598.0, 883.0, 884.0, 885....","[25.0, 524.0, 588.0, 598.0, 883.0, 884.0, 885....","[25.0, 524.0, 588.0, 598.0, 883.0, 884.0, 885....",[],"[25.0, 524.0, 588.0, 598.0, 883.0, 884.0, 885....","[870.0, 871.0, 872.0, 873.0, 874.0, 875.0, 876...",[]
6,7,"[523.0, 524.0, 525.0, 526.0, 527.0, 528.0, 905...","[523.0, 524.0, 525.0, 526.0, 527.0, 528.0, 905...","[523.0, 524.0, 525.0, 526.0, 527.0, 528.0, 905...","[523.0, 524.0, 525.0, 526.0, 527.0, 528.0, 905...","[523.0, 524.0, 525.0, 526.0, 527.0, 528.0, 905...",[],"[523.0, 524.0, 525.0, 526.0, 527.0, 528.0, 905...","[17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24....",[]
7,8,"[389.0, 416.0, 418.0, 868.0, 869.0, 879.0, 880...","[416.0, 418.0, 868.0, 869.0, 879.0, 880.0, 881...","[416.0, 418.0, 868.0, 869.0, 879.0, 880.0, 881...","[416.0, 418.0, 868.0, 869.0, 879.0, 880.0, 881...","[389.0, 416.0, 418.0, 868.0, 869.0, 879.0, 880...",[],"[416.0, 418.0, 868.0, 869.0, 879.0, 880.0, 881...","[446.0, 717.0, 718.0, 870.0, 871.0, 872.0, 873...",[]
8,9,"[581.0, 582.0, 583.0, 584.0, 585.0, 586.0, 587...","[581.0, 582.0, 583.0, 584.0, 585.0, 586.0, 587...","[581.0, 582.0, 583.0, 584.0, 585.0, 586.0, 587...","[581.0, 582.0, 583.0, 584.0, 585.0, 586.0, 587...","[581.0, 582.0, 583.0, 584.0, 585.0, 586.0, 587...",[],"[581.0, 582.0, 583.0, 584.0, 585.0, 586.0, 587...","[849.0, 852.0, 856.0, 857.0, 858.0, 859.0, 860...",[]
9,10,[520.0],"[520.0, 839.0, 840.0, 841.0, 842.0, 843.0, 56....","[520.0, 839.0, 840.0, 841.0, 842.0, 843.0, 56....","[520.0, 839.0, 840.0, 841.0, 842.0, 843.0, 56....",[520.0],[],"[520.0, 839.0, 840.0, 841.0, 842.0, 843.0, 56....","[703.0, 613.0, 614.0, 615.0, 620.0, 621.0, 622...",[]


In [37]:
# List the column labels accumulated so far, before the final 'Rank' column is added.
result_sdg_new.columns

Index(['Goal No.', 'top_n_attr_A', 'top_n_attr_B', 'top_n_attr_C', 'B_plus_C',
       'A_and_BC', 'A', 'B_and_C', 'B_minus_all', 'C_minus_all'],
      dtype='object')

In [38]:
# Row by row, hand the record to final_rank (defined outside this cell) and
# keep whatever it returns in 'Rank'.
# NOTE(review): presumably the partial lists above merged into one ordered
# list per goal -- confirm the ordering in the helper.
result_sdg_new['Rank'] = result_sdg_new.apply(final_rank, axis='columns')
result_sdg_new

Unnamed: 0,Goal No.,top_n_attr_A,top_n_attr_B,top_n_attr_C,B_plus_C,A_and_BC,A,B_and_C,B_minus_all,C_minus_all,Rank
0,1,"[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35....","[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35....","[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35....","[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35....","[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35....",[],"[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35....","[882.0, 142.0, 143.0, 144.0, 183.0, 184.0, 185...",[],"[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35...."
1,2,"[207.0, 240.0, 294.0, 156.0, 157.0, 158.0, 159...","[207.0, 240.0, 294.0, 426.0, 427.0, 156.0, 158...","[207.0, 240.0, 294.0, 426.0, 427.0, 156.0, 158...","[207.0, 240.0, 294.0, 426.0, 427.0, 156.0, 158...","[207.0, 240.0, 294.0, 156.0, 157.0, 158.0, 159...",[],"[207.0, 240.0, 294.0, 426.0, 427.0, 156.0, 158...","[38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45....",[],"[207.0, 240.0, 294.0, 156.0, 157.0, 158.0, 159..."
2,3,"[120.0, 121.0, 122.0, 123.0, 124.0, 125.0, 126...","[177.0, 178.0, 179.0, 180.0, 181.0, 182.0, 889...","[177.0, 178.0, 179.0, 180.0, 181.0, 182.0, 889...","[177.0, 178.0, 179.0, 180.0, 181.0, 182.0, 889...","[120.0, 121.0, 122.0, 123.0, 124.0, 125.0, 126...",[],"[177.0, 178.0, 179.0, 180.0, 181.0, 182.0, 889...","[849.0, 850.0, 851.0, 852.0, 853.0, 854.0, 855...",[],"[120.0, 121.0, 122.0, 123.0, 124.0, 125.0, 126..."
3,4,"[534.0, 535.0, 536.0, 537.0, 538.0, 539.0, 540...","[534.0, 535.0, 537.0, 538.0, 542.0, 543.0, 544...","[534.0, 535.0, 537.0, 538.0, 542.0, 543.0, 544...","[534.0, 535.0, 537.0, 538.0, 542.0, 543.0, 544...","[534.0, 535.0, 536.0, 537.0, 538.0, 539.0, 540...",[],"[534.0, 535.0, 537.0, 538.0, 542.0, 543.0, 544...","[843.0, 864.0, 707.0, 708.0, 709.0, 711.0, 712...",[],"[534.0, 535.0, 536.0, 537.0, 538.0, 539.0, 540..."
4,5,"[863.0, 555.0, 559.0, 563.0, 576.0, 579.0, 583...","[602.0, 876.0, 555.0, 559.0, 563.0, 576.0, 579...","[602.0, 876.0, 555.0, 559.0, 563.0, 576.0, 579...","[602.0, 876.0, 555.0, 559.0, 563.0, 576.0, 579...","[863.0, 555.0, 559.0, 563.0, 576.0, 579.0, 583...",[],"[602.0, 876.0, 555.0, 559.0, 563.0, 576.0, 579...","[396.0, 399.0, 402.0, 405.0, 408.0, 411.0, 544...",[],"[863.0, 555.0, 559.0, 563.0, 576.0, 579.0, 583..."
5,6,"[25.0, 524.0, 588.0, 598.0, 883.0, 884.0, 885....","[25.0, 524.0, 588.0, 598.0, 883.0, 884.0, 885....","[25.0, 524.0, 588.0, 598.0, 883.0, 884.0, 885....","[25.0, 524.0, 588.0, 598.0, 883.0, 884.0, 885....","[25.0, 524.0, 588.0, 598.0, 883.0, 884.0, 885....",[],"[25.0, 524.0, 588.0, 598.0, 883.0, 884.0, 885....","[870.0, 871.0, 872.0, 873.0, 874.0, 875.0, 876...",[],"[25.0, 524.0, 588.0, 598.0, 883.0, 884.0, 885...."
6,7,"[523.0, 524.0, 525.0, 526.0, 527.0, 528.0, 905...","[523.0, 524.0, 525.0, 526.0, 527.0, 528.0, 905...","[523.0, 524.0, 525.0, 526.0, 527.0, 528.0, 905...","[523.0, 524.0, 525.0, 526.0, 527.0, 528.0, 905...","[523.0, 524.0, 525.0, 526.0, 527.0, 528.0, 905...",[],"[523.0, 524.0, 525.0, 526.0, 527.0, 528.0, 905...","[17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24....",[],"[523.0, 524.0, 525.0, 526.0, 527.0, 528.0, 905..."
7,8,"[389.0, 416.0, 418.0, 868.0, 869.0, 879.0, 880...","[416.0, 418.0, 868.0, 869.0, 879.0, 880.0, 881...","[416.0, 418.0, 868.0, 869.0, 879.0, 880.0, 881...","[416.0, 418.0, 868.0, 869.0, 879.0, 880.0, 881...","[389.0, 416.0, 418.0, 868.0, 869.0, 879.0, 880...",[],"[416.0, 418.0, 868.0, 869.0, 879.0, 880.0, 881...","[446.0, 717.0, 718.0, 870.0, 871.0, 872.0, 873...",[],"[389.0, 416.0, 418.0, 868.0, 869.0, 879.0, 880..."
8,9,"[581.0, 582.0, 583.0, 584.0, 585.0, 586.0, 587...","[581.0, 582.0, 583.0, 584.0, 585.0, 586.0, 587...","[581.0, 582.0, 583.0, 584.0, 585.0, 586.0, 587...","[581.0, 582.0, 583.0, 584.0, 585.0, 586.0, 587...","[581.0, 582.0, 583.0, 584.0, 585.0, 586.0, 587...",[],"[581.0, 582.0, 583.0, 584.0, 585.0, 586.0, 587...","[849.0, 852.0, 856.0, 857.0, 858.0, 859.0, 860...",[],"[581.0, 582.0, 583.0, 584.0, 585.0, 586.0, 587..."
9,10,[520.0],"[520.0, 839.0, 840.0, 841.0, 842.0, 843.0, 56....","[520.0, 839.0, 840.0, 841.0, 842.0, 843.0, 56....","[520.0, 839.0, 840.0, 841.0, 842.0, 843.0, 56....",[520.0],[],"[520.0, 839.0, 840.0, 841.0, 842.0, 843.0, 56....","[703.0, 613.0, 614.0, 615.0, 620.0, 621.0, 622...",[],"[520.0, 839.0, 840.0, 841.0, 842.0, 843.0, 56...."


In [39]:
# Continue on a separate frame so new columns do not land on result_sdg_new.
# The list objects stored in the cells are still shared between both frames:
# DataFrame.copy does not copy Python objects recursively.
final_df = result_sdg_new.copy(deep=True)

In [40]:
# Length of each goal's 'Rank' list, i.e. how many attribute ids it holds.
final_df['Attr_count'] = final_df['Rank'].apply(len)

In [41]:
# Show the ranking frame including the new 'Attr_count' column.
final_df

Unnamed: 0,Goal No.,top_n_attr_A,top_n_attr_B,top_n_attr_C,B_plus_C,A_and_BC,A,B_and_C,B_minus_all,C_minus_all,Rank,Attr_count
0,1,"[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35....","[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35....","[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35....","[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35....","[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35....",[],"[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35....","[882.0, 142.0, 143.0, 144.0, 183.0, 184.0, 185...",[],"[28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35....",75
1,2,"[207.0, 240.0, 294.0, 156.0, 157.0, 158.0, 159...","[207.0, 240.0, 294.0, 426.0, 427.0, 156.0, 158...","[207.0, 240.0, 294.0, 426.0, 427.0, 156.0, 158...","[207.0, 240.0, 294.0, 426.0, 427.0, 156.0, 158...","[207.0, 240.0, 294.0, 156.0, 157.0, 158.0, 159...",[],"[207.0, 240.0, 294.0, 426.0, 427.0, 156.0, 158...","[38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45....",[],"[207.0, 240.0, 294.0, 156.0, 157.0, 158.0, 159...",362
2,3,"[120.0, 121.0, 122.0, 123.0, 124.0, 125.0, 126...","[177.0, 178.0, 179.0, 180.0, 181.0, 182.0, 889...","[177.0, 178.0, 179.0, 180.0, 181.0, 182.0, 889...","[177.0, 178.0, 179.0, 180.0, 181.0, 182.0, 889...","[120.0, 121.0, 122.0, 123.0, 124.0, 125.0, 126...",[],"[177.0, 178.0, 179.0, 180.0, 181.0, 182.0, 889...","[849.0, 850.0, 851.0, 852.0, 853.0, 854.0, 855...",[],"[120.0, 121.0, 122.0, 123.0, 124.0, 125.0, 126...",186
3,4,"[534.0, 535.0, 536.0, 537.0, 538.0, 539.0, 540...","[534.0, 535.0, 537.0, 538.0, 542.0, 543.0, 544...","[534.0, 535.0, 537.0, 538.0, 542.0, 543.0, 544...","[534.0, 535.0, 537.0, 538.0, 542.0, 543.0, 544...","[534.0, 535.0, 536.0, 537.0, 538.0, 539.0, 540...",[],"[534.0, 535.0, 537.0, 538.0, 542.0, 543.0, 544...","[843.0, 864.0, 707.0, 708.0, 709.0, 711.0, 712...",[],"[534.0, 535.0, 536.0, 537.0, 538.0, 539.0, 540...",341
4,5,"[863.0, 555.0, 559.0, 563.0, 576.0, 579.0, 583...","[602.0, 876.0, 555.0, 559.0, 563.0, 576.0, 579...","[602.0, 876.0, 555.0, 559.0, 563.0, 576.0, 579...","[602.0, 876.0, 555.0, 559.0, 563.0, 576.0, 579...","[863.0, 555.0, 559.0, 563.0, 576.0, 579.0, 583...",[],"[602.0, 876.0, 555.0, 559.0, 563.0, 576.0, 579...","[396.0, 399.0, 402.0, 405.0, 408.0, 411.0, 544...",[],"[863.0, 555.0, 559.0, 563.0, 576.0, 579.0, 583...",388
5,6,"[25.0, 524.0, 588.0, 598.0, 883.0, 884.0, 885....","[25.0, 524.0, 588.0, 598.0, 883.0, 884.0, 885....","[25.0, 524.0, 588.0, 598.0, 883.0, 884.0, 885....","[25.0, 524.0, 588.0, 598.0, 883.0, 884.0, 885....","[25.0, 524.0, 588.0, 598.0, 883.0, 884.0, 885....",[],"[25.0, 524.0, 588.0, 598.0, 883.0, 884.0, 885....","[870.0, 871.0, 872.0, 873.0, 874.0, 875.0, 876...",[],"[25.0, 524.0, 588.0, 598.0, 883.0, 884.0, 885....",211
6,7,"[523.0, 524.0, 525.0, 526.0, 527.0, 528.0, 905...","[523.0, 524.0, 525.0, 526.0, 527.0, 528.0, 905...","[523.0, 524.0, 525.0, 526.0, 527.0, 528.0, 905...","[523.0, 524.0, 525.0, 526.0, 527.0, 528.0, 905...","[523.0, 524.0, 525.0, 526.0, 527.0, 528.0, 905...",[],"[523.0, 524.0, 525.0, 526.0, 527.0, 528.0, 905...","[17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24....",[],"[523.0, 524.0, 525.0, 526.0, 527.0, 528.0, 905...",395
7,8,"[389.0, 416.0, 418.0, 868.0, 869.0, 879.0, 880...","[416.0, 418.0, 868.0, 869.0, 879.0, 880.0, 881...","[416.0, 418.0, 868.0, 869.0, 879.0, 880.0, 881...","[416.0, 418.0, 868.0, 869.0, 879.0, 880.0, 881...","[389.0, 416.0, 418.0, 868.0, 869.0, 879.0, 880...",[],"[416.0, 418.0, 868.0, 869.0, 879.0, 880.0, 881...","[446.0, 717.0, 718.0, 870.0, 871.0, 872.0, 873...",[],"[389.0, 416.0, 418.0, 868.0, 869.0, 879.0, 880...",118
8,9,"[581.0, 582.0, 583.0, 584.0, 585.0, 586.0, 587...","[581.0, 582.0, 583.0, 584.0, 585.0, 586.0, 587...","[581.0, 582.0, 583.0, 584.0, 585.0, 586.0, 587...","[581.0, 582.0, 583.0, 584.0, 585.0, 586.0, 587...","[581.0, 582.0, 583.0, 584.0, 585.0, 586.0, 587...",[],"[581.0, 582.0, 583.0, 584.0, 585.0, 586.0, 587...","[849.0, 852.0, 856.0, 857.0, 858.0, 859.0, 860...",[],"[581.0, 582.0, 583.0, 584.0, 585.0, 586.0, 587...",76
9,10,[520.0],"[520.0, 839.0, 840.0, 841.0, 842.0, 843.0, 56....","[520.0, 839.0, 840.0, 841.0, 842.0, 843.0, 56....","[520.0, 839.0, 840.0, 841.0, 842.0, 843.0, 56....",[520.0],[],"[520.0, 839.0, 840.0, 841.0, 842.0, 843.0, 56....","[703.0, 613.0, 614.0, 615.0, 620.0, 621.0, 622...",[],"[520.0, 839.0, 840.0, 841.0, 842.0, 843.0, 56....",125


In [42]:
# Save the ranking table as a ';'-separated CSV without the index column.
# NOTE(review): the list-valued columns are written in their string form, so a
# reader has to parse them back into lists -- confirm downstream consumers do.
final_df.to_csv(absolute_path_rank, sep=';', index=False)

In [43]:
# process_result is defined outside this cell; it receives the ranking frame
# together with the `sdg` and `data` objects.
# NOTE(review): judging by the displayed output it yields one row per
# (goal, attribute) with chapter/table metadata -- confirm in the helper.
all_goals = process_result(final_df, sdg, data)

In [44]:
# Save the unrolled goal/attribute table as a ';'-separated CSV without the index column.
all_goals.to_csv(absolute_path_rank_unroll, sep=';', index=False)

In [45]:
# Show the unrolled table that was just written to disk.
all_goals

Unnamed: 0,Attr_id,Chapter_id,Chapter_name,Table_id,Table_name,Description,Goal,top_n_goal_id
0,28.0,1,General Information,1.6,Ration Shops And B.p.l. Ration Card Holders on...,"Urban,Ration shops",End poverty in all its forms everywhere,1
1,29.0,1,General Information,1.6,Ration Shops And B.p.l. Ration Card Holders on...,"Rural,Ration shops",End poverty in all its forms everywhere,1
2,30.0,1,General Information,1.6,Ration Shops And B.p.l. Ration Card Holders on...,"Total,Ration shops",End poverty in all its forms everywhere,1
3,31.0,1,General Information,1.6,Ration Shops And B.p.l. Ration Card Holders on...,"Urban,Anthyodaya,Bpl Card Holders",End poverty in all its forms everywhere,1
4,32.0,1,General Information,1.6,Ration Shops And B.p.l. Ration Card Holders on...,"Rural,Anthyodaya,Bpl Card Holders",End poverty in all its forms everywhere,1
...,...,...,...,...,...,...,...,...
3858,632.0,10,Education,10.14,No.of Polytechnic Colleges and Students Year- ...,"No.of Institutions,Polytechnic- Government","Data, monitoring and accountability",17
3859,633.0,10,Education,10.14,No.of Polytechnic Colleges and Students Year- ...,"Male,Istyear,Polytechnic- Government","Data, monitoring and accountability",17
3860,634.0,10,Education,10.14,No.of Polytechnic Colleges and Students Year- ...,"Female,Istyear,Polytechnic- Government","Data, monitoring and accountability",17
3861,635.0,10,Education,10.14,No.of Polytechnic Colleges and Students Year- ...,"Total,Istyear,Polytechnic- Government","Data, monitoring and accountability",17


In [46]:
# Number of distinct Table_id values within each Chapter_id, as {Chapter_id: count}.
# The chapter keys are strings in the displayed output ('1', '10', ...), not ints.
table_map = data_new.groupby('Chapter_id')['Table_id'].nunique().to_dict()
table_map

{'1': 10,
 '10': 15,
 '11': 6,
 '12': 7,
 '13': 6,
 '14': 5,
 '15': 7,
 '16': 3,
 '17': 3,
 '18': 1,
 '2': 11,
 '3': 2,
 '4': 18,
 '5': 4,
 '6': 5,
 '7': 2,
 '8': 4,
 '9': 4}

In [47]:
# Number of distinct Attr_id values within each Table_id, as {Table_id: count}.
# The table keys are strings in the displayed output ('1.1', '1.10', ...), so
# lookups into this dict must use the same key type.
attr_map = data_new.groupby('Table_id')['Attr_id'].nunique().to_dict()
attr_map

{'1.1': 5,
 '1.10': 6,
 '1.2': 4,
 '1.3': 6,
 '1.4': 6,
 '1.5': 4,
 '1.6': 6,
 '1.7': 6,
 '1.8': 6,
 '1.9': 6,
 '10.1': 9,
 '10.10': 10,
 '10.11': 11,
 '10.12': 8,
 '10.13': 12,
 '10.14': 8,
 '10.15': 8,
 '10.2': 10,
 '10.3': 4,
 '10.4': 4,
 '10.5': 4,
 '10.6': 5,
 '10.7': 5,
 '10.8': 6,
 '10.9': 10,
 '11.1': 12,
 '11.2': 7,
 '11.3': 10,
 '11.4': 6,
 '11.5': 32,
 '11.6': 8,
 '12.1': 18,
 '12.2': 24,
 '12.3': 24,
 '12.4': 6,
 '12.5': 18,
 '12.6': 21,
 '13.1': 5,
 '13.2': 4,
 '13.3': 3,
 '13.4': 4,
 '13.5': 8,
 '13.6': 3,
 '14.1': 6,
 '14.2': 4,
 '14.3': 7,
 '14.4': 6,
 '14.5': 6,
 '15.1': 7,
 '15.2': 6,
 '15.3': 9,
 '15.4': 8,
 '15.5': 4,
 '15.6': 3,
 '15.7': 5,
 '16.1': 7,
 '16.2': 6,
 '16.3': 8,
 '17.1': 4,
 '17.2': 7,
 '17.3': 13,
 '18.3': 192,
 '2.1': 12,
 '2.11': 9,
 '2.12': 11,
 '2.2': 6,
 '2.3': 6,
 '2.4': 9,
 '2.5': 6,
 '2.6': 6,
 '2.7': 6,
 '2.8': 6,
 '2.9': 6,
 '3.1': 5,
 '3.2': 11,
 '4.1': 15,
 '4.10': 9,
 '4.11': 9,
 '4.12': 18,
 '4.13': 9,
 '4.14': 3,
 '4.15': 5,
 '4.16': 4

In [48]:
# Show all_goals again before it is copied for filtering (not modified since it was saved).
all_goals

Unnamed: 0,Attr_id,Chapter_id,Chapter_name,Table_id,Table_name,Description,Goal,top_n_goal_id
0,28.0,1,General Information,1.6,Ration Shops And B.p.l. Ration Card Holders on...,"Urban,Ration shops",End poverty in all its forms everywhere,1
1,29.0,1,General Information,1.6,Ration Shops And B.p.l. Ration Card Holders on...,"Rural,Ration shops",End poverty in all its forms everywhere,1
2,30.0,1,General Information,1.6,Ration Shops And B.p.l. Ration Card Holders on...,"Total,Ration shops",End poverty in all its forms everywhere,1
3,31.0,1,General Information,1.6,Ration Shops And B.p.l. Ration Card Holders on...,"Urban,Anthyodaya,Bpl Card Holders",End poverty in all its forms everywhere,1
4,32.0,1,General Information,1.6,Ration Shops And B.p.l. Ration Card Holders on...,"Rural,Anthyodaya,Bpl Card Holders",End poverty in all its forms everywhere,1
...,...,...,...,...,...,...,...,...
3858,632.0,10,Education,10.14,No.of Polytechnic Colleges and Students Year- ...,"No.of Institutions,Polytechnic- Government","Data, monitoring and accountability",17
3859,633.0,10,Education,10.14,No.of Polytechnic Colleges and Students Year- ...,"Male,Istyear,Polytechnic- Government","Data, monitoring and accountability",17
3860,634.0,10,Education,10.14,No.of Polytechnic Colleges and Students Year- ...,"Female,Istyear,Polytechnic- Government","Data, monitoring and accountability",17
3861,635.0,10,Education,10.14,No.of Polytechnic Colleges and Students Year- ...,"Total,Istyear,Polytechnic- Government","Data, monitoring and accountability",17


In [49]:
# Work on a copy: rows are later dropped from filter_df in place, while
# all_goals is still read afterwards for the unfiltered per-goal counts.
filter_df = all_goals.copy()

In [50]:
# Table-level filter. For every goal id in filter_df:
#   table_id_rm[goal] -> tables whose row count for the goal is below half of
#                        the table's distinct-attribute count (attr_map)
#   tab_result[goal]  -> tables kept for the goal
tab_result = {}
table_id_rm = {}
for goal_id in filter_df['top_n_goal_id'].unique():
    goal_rows = filter_df[filter_df['top_n_goal_id'] == goal_id]

    # Rows per table for this goal (value_counts() lists them by descending count).
    table_counts = goal_rows['Table_id'].value_counts().to_dict()

    # Split the tables into those below the half-coverage threshold and the rest.
    weak_tables = {}
    strong_tables = {}
    for table_id, n_rows in table_counts.items():
        if n_rows < (attr_map[table_id] / 2):
            weak_tables[table_id] = n_rows
        else:
            strong_tables[table_id] = n_rows
    table_id_rm[goal_id] = weak_tables

    if len(strong_tables) > 4:
        tab_result[goal_id] = strong_tables
    else:
        # Four or fewer tables cleared the threshold: keep the six most
        # frequent tables for this goal instead, whether or not they cleared it.
        ranked = sorted(table_counts.items(), key=lambda item: item[1], reverse=True)
        tab_result[goal_id] = dict(ranked[:6])

print(tab_result)

{'1': {'4.2': 19, '11.6': 8, '1.6': 6, '1.7': 6, '1.8': 6, '14.5': 6, '3.1': 5, '14.3': 4, '13.6': 3}, '2': {'11.5': 32, '4.3': 20, '4.18': 19, '4.2': 19, '4.12': 18, '4.6': 18, '4.17': 16, '4.1': 15, '11.1': 12, '4.4': 10, '11.3': 10, '4.7': 9, '4.10': 9, '4.11': 9, '4.13': 9, '11.6': 8, '13.5': 8, '15.4': 8, '14.3': 7, '11.2': 7, '14.5': 6, '14.1': 6, '11.4': 6, '1.8': 6, '14.4': 6, '1.6': 6, '4.9': 6, '1.7': 6, '13.1': 5, '7.1': 5, '4.15': 5, '13.2': 4, '13.4': 4, '4.16': 4, '14.2': 4, '4.8': 3, '4.14': 3, '4.5': 3, '13.6': 3, '13.3': 3}, '3': {'11.5': 32, '11.1': 12, '11.3': 10, '2.11': 9, '11.6': 8, '13.5': 8, '11.2': 7, '14.3': 7, '1.7': 6, '11.4': 6, '14.4': 6, '14.1': 6, '1.8': 6, '2.4': 6, '14.5': 6, '10.8': 6, '13.1': 5, '3.1': 5, '13.4': 4, '14.2': 4, '13.2': 4, '13.6': 3, '13.3': 3}, '4': {'11.5': 17, '4.17': 16, '12.5': 12, '12.3': 12, '12.2': 12, '7.2': 12, '10.13': 12, '10.11': 11, '10.9': 10, '10.10': 10, '10.2': 10, '10.1': 9, '15.3': 9, '15.4': 8, '13.5': 8, '10.15': 

In [51]:
# Number of goal ids that have an entry in tab_result.
len(tab_result)

17

In [52]:
# Per goal: the tables that fell below the half-of-attributes threshold, with their row counts.
table_id_rm

{'1': {'9.1': 5, '5.4': 4, '4.11': 1, '15.3': 1, '5.3': 1},
 '2': {'6.4': 2, '15.3': 2, '5.3': 1},
 '3': {'4.2': 6, '9.1': 5, '5.4': 2, '17.2': 2, '5.1': 1, '15.3': 1},
 '4': {'18.3': 24, '12.1': 6, '9.1': 6, '6.3': 2, '5.4': 1},
 '5': {'18.3': 48, '11.5': 8, '6.1': 6, '1.4': 1, '14.1': 1},
 '6': {'4.2': 3, '1.5': 1, '5.4': 1},
 '7': {'4.1': 3, '8.4': 2, '9.4': 1},
 '8': {'18.3': 24,
  '4.2': 6,
  '9.1': 5,
  '11.6': 2,
  '5.4': 2,
  '6.3': 2,
  '3.1': 2,
  '15.3': 1,
  '16.2': 1,
  '4.18': 1,
  '1.5': 1},
 '9': {},
 '10': {'11.5': 6,
  '12.1': 6,
  '10.13': 4,
  '10.11': 2,
  '14.1': 2,
  '16.2': 1,
  '9.2': 1},
 '11': {'4.2': 6,
  '11.5': 6,
  '9.1': 5,
  '9.3': 2,
  '1.2': 1,
  '4.1': 1,
  '15.3': 1,
  '1.5': 1},
 '12': {'4.18': 1, '16.2': 1},
 '13': {'11.5': 6, '5.4': 3, '14.1': 2, '8.1': 1, '8.2': 1, '4.1': 1},
 '14': {'11.5': 6, '4.13': 1, '2.1': 1, '4.4': 1, '1.5': 1, '4.12': 1},
 '15': {'11.5': 8, '12.6': 6, '12.1': 6, '10.13': 4, '10.11': 2, '9.1': 1},
 '16': {'11.5': 6, '5.4'

In [53]:
# Total row count of the flagged tables (table_id_rm) that were not kept in
# tab_result for the same goal, i.e. how many rows the table filter discards.
sum_ind = sum(
    n_rows
    for goal_id, flagged_tables in table_id_rm.items()
    for table_id, n_rows in flagged_tables.items()
    if table_id not in tab_result[goal_id]
)

print(sum_ind)

304


In [54]:
# Collect the index labels of rows whose table was flagged for their goal
# (table_id_rm) and is not among that goal's kept tables (tab_result).
indices_to_remove = []
row_fields = zip(filter_df.index, filter_df['top_n_goal_id'], filter_df['Table_id'])
for index, goal_id, table_id in row_fields:
    if goal_id not in table_id_rm:
        continue
    flagged = table_id in table_id_rm[goal_id]
    if flagged and table_id not in tab_result[goal_id]:
        indices_to_remove.append(index)

indices_to_remove

[18,
 19,
 22,
 42,
 43,
 44,
 69,
 70,
 71,
 72,
 73,
 74,
 270,
 271,
 305,
 318,
 319,
 446,
 447,
 448,
 449,
 450,
 451,
 460,
 461,
 462,
 463,
 464,
 465,
 466,
 467,
 484,
 558,
 559,
 747,
 748,
 759,
 770,
 837,
 838,
 839,
 840,
 841,
 842,
 843,
 844,
 845,
 846,
 847,
 848,
 849,
 850,
 851,
 852,
 853,
 854,
 855,
 856,
 857,
 858,
 859,
 910,
 911,
 912,
 913,
 914,
 921,
 922,
 923,
 924,
 925,
 926,
 963,
 1044,
 1048,
 1088,
 1089,
 1090,
 1091,
 1092,
 1093,
 1103,
 1104,
 1105,
 1106,
 1107,
 1108,
 1109,
 1110,
 1111,
 1112,
 1113,
 1114,
 1115,
 1116,
 1117,
 1118,
 1119,
 1120,
 1121,
 1122,
 1123,
 1124,
 1125,
 1126,
 1127,
 1128,
 1129,
 1130,
 1131,
 1132,
 1133,
 1134,
 1135,
 1136,
 1137,
 1138,
 1139,
 1140,
 1141,
 1142,
 1143,
 1144,
 1145,
 1146,
 1147,
 1148,
 1149,
 1150,
 1151,
 1152,
 1153,
 1154,
 1155,
 1156,
 1157,
 1158,
 1352,
 1541,
 1542,
 1543,
 1544,
 1602,
 1603,
 1604,
 1878,
 1933,
 1934,
 1958,
 1959,
 1960,
 1967,
 1968,
 1969,
 1970,


In [55]:
# Sanity check: expected to match sum_ind printed above (both count rows in flagged, non-kept tables).
len(indices_to_remove)

304

In [56]:
# Apply the table-level filter: discard the collected row labels from filter_df in place.
filter_df.drop(index=indices_to_remove, inplace=True)

filter_df

Unnamed: 0,Attr_id,Chapter_id,Chapter_name,Table_id,Table_name,Description,Goal,top_n_goal_id
0,28.0,1,General Information,1.6,Ration Shops And B.p.l. Ration Card Holders on...,"Urban,Ration shops",End poverty in all its forms everywhere,1
1,29.0,1,General Information,1.6,Ration Shops And B.p.l. Ration Card Holders on...,"Rural,Ration shops",End poverty in all its forms everywhere,1
2,30.0,1,General Information,1.6,Ration Shops And B.p.l. Ration Card Holders on...,"Total,Ration shops",End poverty in all its forms everywhere,1
3,31.0,1,General Information,1.6,Ration Shops And B.p.l. Ration Card Holders on...,"Urban,Anthyodaya,Bpl Card Holders",End poverty in all its forms everywhere,1
4,32.0,1,General Information,1.6,Ration Shops And B.p.l. Ration Card Holders on...,"Rural,Anthyodaya,Bpl Card Holders",End poverty in all its forms everywhere,1
...,...,...,...,...,...,...,...,...
3858,632.0,10,Education,10.14,No.of Polytechnic Colleges and Students Year- ...,"No.of Institutions,Polytechnic- Government","Data, monitoring and accountability",17
3859,633.0,10,Education,10.14,No.of Polytechnic Colleges and Students Year- ...,"Male,Istyear,Polytechnic- Government","Data, monitoring and accountability",17
3860,634.0,10,Education,10.14,No.of Polytechnic Colleges and Students Year- ...,"Female,Istyear,Polytechnic- Government","Data, monitoring and accountability",17
3861,635.0,10,Education,10.14,No.of Polytechnic Colleges and Students Year- ...,"Total,Istyear,Polytechnic- Government","Data, monitoring and accountability",17


In [57]:
# Persist the table-filtered frame as a ';'-separated CSV, without the index column.
filter_df.to_csv(absolute_path_tab_filter, sep=';', index=False)

In [58]:
# Rows remaining per goal after the table-level filter, as {goal id: row count}.
tab_count_att = filter_df['top_n_goal_id'].value_counts().to_dict()

In [59]:
# Write the same table-filtered frame to absolute_path_final_map as well.
# NOTE(review): this runs before the chapter-level filter below; confirm the
# final mapping is meant to be the table-filtered frame, not chapter_df.
filter_df.to_csv(absolute_path_final_map, sep=';', index=False)

In [60]:
# Start the chapter-level filter from a copy of the table-filtered frame,
# so the in-place drop on chapter_df does not touch filter_df.
chapter_df = filter_df.copy()

In [61]:
# Chapter-level filter. For every goal id in chapter_df:
#   chap_id_rm[goal]  -> chapters whose number of matched tables is below half
#                        of the chapter's distinct-table count (table_map)
#   chap_result[goal] -> chapters kept for the goal
chap_result = {}
chap_id_rm = {}

for goal_id in chapter_df['top_n_goal_id'].unique():
    goal_rows = chapter_df[chapter_df['top_n_goal_id'] == goal_id]

    # One entry per (table, chapter) pair seen for this goal, so counting the
    # entries per chapter gives the number of matched tables in that chapter.
    chap_list = [
        chapter_id
        for table_id in goal_rows['Table_id'].unique()
        for chapter_id in goal_rows.loc[goal_rows['Table_id'] == table_id, 'Chapter_id'].unique().tolist()
    ]
    chap_dict = pd.Series(chap_list).value_counts().to_dict()

    # Split the chapters into those below the half-coverage threshold and the rest.
    weak_chapters = {}
    strong_chapters = {}
    for chapter_id, n_tables in chap_dict.items():
        if n_tables < (table_map[chapter_id] / 2):
            weak_chapters[chapter_id] = n_tables
        else:
            strong_chapters[chapter_id] = n_tables
    chap_id_rm[goal_id] = weak_chapters

    if len(strong_chapters) > 1:
        chap_result[goal_id] = strong_chapters
    else:
        # At most one chapter cleared the threshold: keep the four chapters
        # with the most matched tables instead, whether or not they cleared it.
        ranked = sorted(chap_dict.items(), key=lambda item: item[1], reverse=True)
        chap_result[goal_id] = dict(ranked[:4])

print(chap_result)

{'1': {'1': 3, '14': 2, '3': 1, '11': 1}, '2': {'4': 18, '13': 6, '11': 6, '14': 5, '7': 1}, '3': {'11': 6, '13': 6, '14': 5, '3': 1}, '4': {'10': 15, '13': 6, '14': 5, '12': 4, '7': 1}, '5': {'10': 15, '12': 7, '2': 7, '13': 6}, '6': {'11': 6, '13': 6, '14': 5, '8': 4}, '7': {'1': 10, '15': 7, '13': 6, '14': 5, '16': 3, '18': 1}, '8': {'13': 6, '14': 5}, '9': {'13': 6, '14': 5}, '10': {'1': 4, '12': 4, '10': 3, '13': 2}, '11': {'13': 6, '14': 5, '18': 1}, '12': {'13': 6, '14': 5}, '13': {'2': 3, '13': 2, '1': 2, '15': 1}, '14': {'13': 6, '14': 5, '18': 1}, '15': {'1': 10, '13': 6, '14': 5, '12': 4, '8': 2, '18': 1}, '16': {'13': 6, '14': 5, '6': 5}, '17': {'10': 15, '7': 1}}


In [62]:
# Per goal: the chapters that fell below the half-of-tables threshold, with their matched-table counts.
chap_id_rm

{'1': {'1': 3, '14': 2, '11': 1, '4': 1, '13': 1},
 '2': {'1': 3, '15': 1},
 '3': {'2': 2, '1': 2, '10': 1},
 '4': {'15': 3, '1': 2, '4': 2, '11': 1, '9': 1},
 '5': {'1': 2, '14': 1, '17': 1},
 '6': {'10': 2, '1': 2, '9': 1, '15': 1},
 '7': {'9': 1, '11': 1},
 '8': {'6': 1, '17': 1, '16': 1},
 '9': {'10': 2},
 '10': {'1': 4, '10': 3, '13': 2, '16': 1, '17': 1, '14': 1},
 '11': {'2': 2, '11': 2, '15': 1},
 '12': {'4': 7, '11': 2, '1': 2, '5': 1, '16': 1, '17': 1},
 '13': {'2': 3, '13': 2, '1': 2, '15': 1},
 '14': {'1': 3, '2': 3, '4': 1},
 '15': {'10': 5, '5': 1, '9': 1, '11': 1},
 '16': {'2': 3},
 '17': {'11': 2}}

In [63]:
# Goal ids whose chap_result entry is empty, i.e. goals left without any chapter.
missing = [goal_id for goal_id, kept_chapters in chap_result.items() if not kept_chapters]
missing

[]

In [64]:
# List to store indices to remove
chap_indices_to_remove = []
# Iterate over the rows of the DataFrame
for index, row in chapter_df.iterrows():
    if row['top_n_goal_id'] in table_id_rm:
        inner_dict = chap_id_rm[row['top_n_goal_id']]
        if row['Chapter_id'] in inner_dict:
            if row['top_n_goal_id'] not in chap_result or row['Chapter_id'] not in chap_result[row['top_n_goal_id']]:
                # Add index to remove list
                chap_indices_to_remove.append(index)

chap_indices_to_remove

[28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 62,
 63,
 64,
 293,
 294,
 295,
 296,
 297,
 298,
 299,
 300,
 301,
 302,
 320,
 321,
 322,
 323,
 324,
 325,
 326,
 327,
 429,
 430,
 431,
 432,
 433,
 434,
 435,
 436,
 437,
 438,
 439,
 440,
 441,
 442,
 443,
 444,
 445,
 452,
 453,
 454,
 455,
 456,
 457,
 485,
 486,
 487,
 488,
 489,
 490,
 611,
 612,
 613,
 614,
 615,
 616,
 617,
 618,
 619,
 620,
 621,
 622,
 737,
 738,
 739,
 740,
 741,
 742,
 755,
 756,
 757,
 758,
 760,
 761,
 762,
 763,
 764,
 765,
 766,
 767,
 768,
 769,
 773,
 774,
 775,
 776,
 777,
 778,
 779,
 780,
 781,
 782,
 783,
 784,
 785,
 786,
 787,
 788,
 789,
 790,
 791,
 792,
 793,
 794,
 795,
 796,
 797,
 798,
 799,
 800,
 801,
 814,
 815,
 816,
 817,
 818,
 819,
 820,
 821,
 822,
 823,
 824,
 825,
 826,
 827,
 828,
 829,
 830,
 831,
 832,
 833,
 834,
 835,
 836,
 904,
 905,
 906,
 907,
 908,
 909,
 1045,
 1047,
 1062,
 1063,
 1064,
 1065,
 1066,
 1183,
 1184,
 1185

In [65]:
# Number of rows the chapter-level filter is about to drop.
len(chap_indices_to_remove)

618

In [66]:
# Apply the chapter-level filter: discard the collected row labels from chapter_df in place.
chapter_df.drop(index=chap_indices_to_remove, inplace=True)

chapter_df

Unnamed: 0,Attr_id,Chapter_id,Chapter_name,Table_id,Table_name,Description,Goal,top_n_goal_id
0,28.0,1,General Information,1.6,Ration Shops And B.p.l. Ration Card Holders on...,"Urban,Ration shops",End poverty in all its forms everywhere,1
1,29.0,1,General Information,1.6,Ration Shops And B.p.l. Ration Card Holders on...,"Rural,Ration shops",End poverty in all its forms everywhere,1
2,30.0,1,General Information,1.6,Ration Shops And B.p.l. Ration Card Holders on...,"Total,Ration shops",End poverty in all its forms everywhere,1
3,31.0,1,General Information,1.6,Ration Shops And B.p.l. Ration Card Holders on...,"Urban,Anthyodaya,Bpl Card Holders",End poverty in all its forms everywhere,1
4,32.0,1,General Information,1.6,Ration Shops And B.p.l. Ration Card Holders on...,"Rural,Anthyodaya,Bpl Card Holders",End poverty in all its forms everywhere,1
...,...,...,...,...,...,...,...,...
3858,632.0,10,Education,10.14,No.of Polytechnic Colleges and Students Year- ...,"No.of Institutions,Polytechnic- Government","Data, monitoring and accountability",17
3859,633.0,10,Education,10.14,No.of Polytechnic Colleges and Students Year- ...,"Male,Istyear,Polytechnic- Government","Data, monitoring and accountability",17
3860,634.0,10,Education,10.14,No.of Polytechnic Colleges and Students Year- ...,"Female,Istyear,Polytechnic- Government","Data, monitoring and accountability",17
3861,635.0,10,Education,10.14,No.of Polytechnic Colleges and Students Year- ...,"Total,Istyear,Polytechnic- Government","Data, monitoring and accountability",17


In [67]:
# Number of distinct goal ids still present after the chapter-level filter.
chapter_df['top_n_goal_id'].nunique()

17

In [68]:
# Spot-check: the rows that remain mapped to goal '3' after both filters.
is_goal_3 = chapter_df['top_n_goal_id'] == '3'
chapter_df[is_goal_3]

Unnamed: 0,Attr_id,Chapter_id,Chapter_name,Table_id,Table_name,Description,Goal,top_n_goal_id
458,889.0,14,Rural Development & Panchayat Raj,14.5,Different Schemes of drinking water supply fac...,"Provided During the current year,Bore Wells",Ensure healthy lives and promote well-being fo...,3
459,890.0,14,Rural Development & Panchayat Raj,14.5,Different Schemes of drinking water supply fac...,"Total,Bore Wells",Ensure healthy lives and promote well-being fo...,3
468,701.0,11,Health & Family welfare Services,11.5,Tb Control Programmes on 31-03-2016 in Nos.,"Total,Identified Leprosy Patients,Leprosy Cont...",Ensure healthy lives and promote well-being fo...,3
469,704.0,11,Health & Family welfare Services,11.5,Tb Control Programmes on 31-03-2016 in Nos.,"Total,No.of Patients Cured,Leprosy Control Pro...",Ensure healthy lives and promote well-being fo...,3
470,699.0,11,Health & Family welfare Services,11.5,Tb Control Programmes on 31-03-2016 in Nos.,"Male,Identified Leprosy Patients,Leprosy Contr...",Ensure healthy lives and promote well-being fo...,3
...,...,...,...,...,...,...,...,...
606,717.0,11,Health & Family welfare Services,11.6,Different Health Facilities As on 31-03-2016 i...,"Nos.,24 hours and 7 days working Hospitals",Ensure healthy lives and promote well-being fo...,3
607,718.0,11,Health & Family welfare Services,11.6,Different Health Facilities As on 31-03-2016 i...,"No.of delivaries in this Hospitals,24 hours an...",Ensure healthy lives and promote well-being fo...,3
608,719.0,11,Health & Family welfare Services,11.6,Different Health Facilities As on 31-03-2016 i...,No.of benificiaries in Jananni Suraksh Yojana,Ensure healthy lives and promote well-being fo...,3
609,720.0,11,Health & Family welfare Services,11.6,Different Health Facilities As on 31-03-2016 i...,No.of benificiaries medical kit,Ensure healthy lives and promote well-being fo...,3


In [69]:
# Rows remaining per goal after the chapter-level filter, as {goal id: row count}.
chap_count_att = chapter_df['top_n_goal_id'].value_counts().to_dict()

In [70]:
# Per-goal row counts after the chapter-level filter.
chap_count_att

{'7': 366,
 '15': 361,
 '2': 331,
 '5': 298,
 '14': 248,
 '11': 248,
 '4': 224,
 '6': 159,
 '3': 136,
 '17': 126,
 '16': 109,
 '10': 87,
 '9': 56,
 '12': 56,
 '8': 56,
 '1': 41,
 '13': 39}

In [71]:
# Per-goal row counts after the table-level filter, for comparison with chap_count_att.
tab_count_att

{'15': 402,
 '7': 389,
 '2': 357,
 '5': 324,
 '4': 302,
 '14': 299,
 '11': 286,
 '6': 206,
 '12': 183,
 '3': 169,
 '17': 163,
 '16': 127,
 '10': 103,
 '9': 76,
 '8': 71,
 '1': 63,
 '13': 39}

In [72]:
# Rows per goal in the unfiltered all_goals frame, as {goal id: row count}.
rank_count_att = all_goals['top_n_goal_id'].value_counts().to_dict()

In [73]:
# Turn the per-goal counts from the chapter-level filter into a two-column frame.
count_att_list = list(chap_count_att.items())

count_att_df = pd.DataFrame(count_att_list, columns=['Goal No.', 'Chap_count'])
count_att_df

Unnamed: 0,Goal No.,Chap_count
0,7,366
1,15,361
2,2,331
3,5,298
4,14,248
5,11,248
6,4,224
7,6,159
8,3,136
9,17,126


In [74]:
# Add each goal's row count after the table-level filter, looked up in tab_count_att.
count_att_df['Tab_count'] = count_att_df['Goal No.'].map(tab_count_att)

In [75]:
# Add each goal's row count in the unfiltered frame, looked up in rank_count_att.
count_att_df['Rank_count'] = count_att_df['Goal No.'].map(rank_count_att)

In [76]:
# Per-goal comparison of row counts: chapter-filtered, table-filtered, and unfiltered.
count_att_df

Unnamed: 0,Goal No.,Chap_count,Tab_count,Rank_count
0,7,366,389,395
1,15,361,402,429
2,2,331,357,362
3,5,298,324,388
4,14,248,299,310
5,11,248,286,309
6,4,224,302,341
7,6,159,206,211
8,3,136,169,186
9,17,126,163,164


In [77]:
# Persist the chapter-filtered frame as a ';'-separated CSV, without the index column.
chapter_df.to_csv(absolute_path_chap_filter, sep=';', index=False)

In [78]:
# Persist the per-goal count comparison as a ';'-separated CSV, without the index column.
count_att_df.to_csv(absolute_path_count, sep=';', index=False)