In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import re
import matplotlib.pyplot as plt
import nltk

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer() 
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer


[nltk_data] Downloading package punkt to /home/riya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/riya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/riya/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  "class": algorithms.Blowfish,


In [2]:
# Define a custom function to remove word before 1st space and excess white spaces
def process_string(input_string):
    # Find the index of the first space
    first_space_index = input_string.find(' ')
    
    # Remove the word before the first space
    if first_space_index != -1:
        input_string = input_string[first_space_index+1:]
    
    # Remove excess white spaces
    input_string = ' '.join(input_string.split())
    
    return input_string

In [3]:
# Creating our tokenizer function
def cleaning(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    stopwords = nltk.corpus.stopwords.words('english')

    cleaned = []
    for i in range(len(sentence)):
        #non English character
        letters = re.sub('[^a-zA-Z]', ' ', sentence[i])
        #print("Non English : ",letters)

        #lowercase
        letters = letters.lower().split() 

        wn = nltk.WordNetLemmatizer()
        letters = [wn.lemmatize(w) for w in letters if w not in stopwords ]

        #ignore_words
        letters = ' '.join(letters)
        cleaned.append(letters)

    # return preprocessed list of tokens
    return cleaned

In [4]:
sdg = pd.read_csv("sdg_data_excel.csv")

In [5]:
sdg

Unnamed: 0,Goal No.,Goal,Nodal Department,Targets,Other Related Major Departments,Tentative Indicators,Target_id,Indicator_id
0,1,End poverty in all its forms everywhere,Rural Development,"1.1 By 2030, eradicate extreme poverty for ...","Urban Development, Agriculture, Horticulture, ...",1.1.1 Proportion of the population below...,1.1,1.1.1
1,1,End poverty in all its forms everywhere,Rural Development,"1.2 By 2030, reduce at least by ...","Urban Development, Agriculture, Horticulture, ...",1.2.1 Proportion of the population livin...,1.2,1.2.1
2,1,End poverty in all its forms everywhere,Rural Development,"1.2 By 2030, reduce at least by ...","Urban Development, Agriculture, Horticulture, ...","1.2.2 Proportion of men, women and...",1.2,1.2.2
3,1,End poverty in all its forms everywhere,Rural Development,1.3 Implement nationally appropriate so...,"Urban Development, Agriculture, Horticulture, ...",1.3.1 Percentage of the populati...,1.3,1.3.1
4,1,End poverty in all its forms everywhere,Rural Development,"1.4 By 2030, ensure that all men and women,...","Urban Development, Agriculture, Horticulture, ...",1.4.1 Proportion of the population living in ...,1.4,1.4.1
...,...,...,...,...,...,...,...,...
164,16,Strengthen the means of implementation and rev...,Finance,"16.4 Promote the development, tran...",Environment and Scientific Technology.,16.4.1 Total amount of approved funding t...,16.4,16.4.1
165,16,Strengthen the means of implementation and rev...,Finance,16.5 Fully operationalize the technology ...,"Environment and Scientific Technology, Inform...",16.5.1 Proportion of individuals using th...,16.5,16.5.1
166,17,"Data, monitoring and accountability",Finance,"17.1 By 2020, enhance capacity-buil...","Planning, Finance, Economic and Statistics.",17.1.1 Proportion of sustain...,17.1,17.1.1
167,17,"Data, monitoring and accountability",Finance,"17.2 By 2030, build on existing initiati...","Planning, Finance, Economic and Statistics.",17.2.1 Dollar value of all resources made avai...,17.2,17.2.1


In [6]:
sdg.columns

Index(['Goal No.', 'Goal', 'Nodal Department', 'Targets',
       'Other Related Major Departments', 'Tentative Indicators', 'Target_id',
       'Indicator_id'],
      dtype='object')

In [7]:
sdg.drop(['Nodal Department',
       'Other Related Major Departments',
       'Indicator_id'],axis=1,inplace=True)

In [8]:
# Remove duplicate rows
sdg = sdg.drop_duplicates()

In [9]:
sdg.reset_index(inplace = True, drop = True)

In [10]:
sdg

Unnamed: 0,Goal No.,Goal,Targets,Tentative Indicators,Target_id
0,1,End poverty in all its forms everywhere,"1.1 By 2030, eradicate extreme poverty for ...",1.1.1 Proportion of the population below...,1.1
1,1,End poverty in all its forms everywhere,"1.2 By 2030, reduce at least by ...",1.2.1 Proportion of the population livin...,1.2
2,1,End poverty in all its forms everywhere,"1.2 By 2030, reduce at least by ...","1.2.2 Proportion of men, women and...",1.2
3,1,End poverty in all its forms everywhere,1.3 Implement nationally appropriate so...,1.3.1 Percentage of the populati...,1.3
4,1,End poverty in all its forms everywhere,"1.4 By 2030, ensure that all men and women,...",1.4.1 Proportion of the population living in ...,1.4
...,...,...,...,...,...
164,16,Strengthen the means of implementation and rev...,"16.4 Promote the development, tran...",16.4.1 Total amount of approved funding t...,16.4
165,16,Strengthen the means of implementation and rev...,16.5 Fully operationalize the technology ...,16.5.1 Proportion of individuals using th...,16.5
166,17,"Data, monitoring and accountability","17.1 By 2020, enhance capacity-buil...",17.1.1 Proportion of sustain...,17.1
167,17,"Data, monitoring and accountability","17.2 By 2030, build on existing initiati...",17.2.1 Dollar value of all resources made avai...,17.2


In [11]:
# Apply the custom function to each cell in the 'Column' column
sdg['Tentative Indicators'] = sdg['Tentative Indicators'].apply(process_string)
sdg['Targets'] = sdg['Targets'].apply(process_string) 
sdg

Unnamed: 0,Goal No.,Goal,Targets,Tentative Indicators,Target_id
0,1,End poverty in all its forms everywhere,"By 2030, eradicate extreme poverty for all peo...",Proportion of the population below the interna...,1.1
1,1,End poverty in all its forms everywhere,"By 2030, reduce at least by half the proportio...",Proportion of the population living below the ...,1.2
2,1,End poverty in all its forms everywhere,"By 2030, reduce at least by half the proportio...","Proportion of men, women and children of all a...",1.2
3,1,End poverty in all its forms everywhere,Implement nationally appropriate social protec...,Percentage of the population covered by social...,1.3
4,1,End poverty in all its forms everywhere,"By 2030, ensure that all men and women, in par...",Proportion of the population living in househo...,1.4
...,...,...,...,...,...
164,16,Strengthen the means of implementation and rev...,"Promote the development, transfer, disseminati...",Total amount of approved funding to promote th...,16.4
165,16,Strengthen the means of implementation and rev...,Fully operationalize the technology bank and s...,Proportion of individuals using the Internet.,16.5
166,17,"Data, monitoring and accountability","By 2020, enhance capacity-building support to ...",Proportion of sustainable development indicato...,17.1
167,17,"Data, monitoring and accountability","By 2030, build on existing initiatives to deve...",Dollar value of all resources made available t...,17.2


In [12]:
sdg_new = sdg.groupby(['Goal No.', 'Goal', 'Targets',  'Target_id'])['Tentative Indicators'].apply(lambda x: ', '.join(x)).reset_index()
sdg_new

Unnamed: 0,Goal No.,Goal,Targets,Target_id,Tentative Indicators
0,1,End poverty in all its forms everywhere,"By 2030, build the resilience of the poor and ...",1.5,"Number of deaths, missing people, injured, rel..."
1,1,End poverty in all its forms everywhere,"By 2030, ensure that all men and women, in par...",1.4,Proportion of the population living in househo...
2,1,End poverty in all its forms everywhere,"By 2030, eradicate extreme poverty for all peo...",1.1,Proportion of the population below the interna...
3,1,End poverty in all its forms everywhere,"By 2030, reduce at least by half the proportio...",1.2,Proportion of the population living below the ...
4,1,End poverty in all its forms everywhere,Create sound policy frameworks at the state le...,1.6,Number of state action plans related to multil...
...,...,...,...,...,...
127,16,Strengthen the means of implementation and rev...,Fully operationalize the technology bank and s...,16.5,Proportion of individuals using the Internet.
128,16,Strengthen the means of implementation and rev...,"Promote the development, transfer, disseminati...",16.4,Total amount of approved funding to promote th...
129,16,Strengthen the means of implementation and rev...,"Strengthen domestic resource mobilization, inc...",16.1,Total government revenue (by source) as a perc...
130,17,"Data, monitoring and accountability","By 2020, enhance capacity-building support to ...",17.1,Proportion of sustainable development indicato...


In [13]:
sdg_new['concat_value'] = sdg_new['Targets'] + ' ' + sdg_new['Tentative Indicators']
sdg_new.drop(['Tentative Indicators'], axis=1,inplace=True)

sdg_new.rename(columns = {'concat_value':'Tentative Indicators'}, inplace = True) 
sdg_new

Unnamed: 0,Goal No.,Goal,Targets,Target_id,Tentative Indicators
0,1,End poverty in all its forms everywhere,"By 2030, build the resilience of the poor and ...",1.5,"By 2030, build the resilience of the poor and ..."
1,1,End poverty in all its forms everywhere,"By 2030, ensure that all men and women, in par...",1.4,"By 2030, ensure that all men and women, in par..."
2,1,End poverty in all its forms everywhere,"By 2030, eradicate extreme poverty for all peo...",1.1,"By 2030, eradicate extreme poverty for all peo..."
3,1,End poverty in all its forms everywhere,"By 2030, reduce at least by half the proportio...",1.2,"By 2030, reduce at least by half the proportio..."
4,1,End poverty in all its forms everywhere,Create sound policy frameworks at the state le...,1.6,Create sound policy frameworks at the state le...
...,...,...,...,...,...
127,16,Strengthen the means of implementation and rev...,Fully operationalize the technology bank and s...,16.5,Fully operationalize the technology bank and s...
128,16,Strengthen the means of implementation and rev...,"Promote the development, transfer, disseminati...",16.4,"Promote the development, transfer, disseminati..."
129,16,Strengthen the means of implementation and rev...,"Strengthen domestic resource mobilization, inc...",16.1,"Strengthen domestic resource mobilization, inc..."
130,17,"Data, monitoring and accountability","By 2020, enhance capacity-building support to ...",17.1,"By 2020, enhance capacity-building support to ..."


In [14]:
texto_data=sdg_new['Tentative Indicators'].tolist()
process_text=cleaning(texto_data)

cleaned_sent =pd.DataFrame(process_text)
sdg_new['new_targets']=cleaned_sent

In [15]:
type(sdg_new['new_targets'].loc[0])

str

In [16]:
sdg_new

Unnamed: 0,Goal No.,Goal,Targets,Target_id,Tentative Indicators,new_targets
0,1,End poverty in all its forms everywhere,"By 2030, build the resilience of the poor and ...",1.5,"By 2030, build the resilience of the poor and ...",build resilience poor vulnerable situation red...
1,1,End poverty in all its forms everywhere,"By 2030, ensure that all men and women, in par...",1.4,"By 2030, ensure that all men and women, in par...",ensure men woman particular poor vulnerable eq...
2,1,End poverty in all its forms everywhere,"By 2030, eradicate extreme poverty for all peo...",1.1,"By 2030, eradicate extreme poverty for all peo...",eradicate extreme poverty people everywhere cu...
3,1,End poverty in all its forms everywhere,"By 2030, reduce at least by half the proportio...",1.2,"By 2030, reduce at least by half the proportio...",reduce least half proportion men woman child a...
4,1,End poverty in all its forms everywhere,Create sound policy frameworks at the state le...,1.6,Create sound policy frameworks at the state le...,create sound policy framework state level base...
...,...,...,...,...,...,...
127,16,Strengthen the means of implementation and rev...,Fully operationalize the technology bank and s...,16.5,Fully operationalize the technology bank and s...,fully operationalize technology bank science t...
128,16,Strengthen the means of implementation and rev...,"Promote the development, transfer, disseminati...",16.4,"Promote the development, transfer, disseminati...",promote development transfer dissemination dif...
129,16,Strengthen the means of implementation and rev...,"Strengthen domestic resource mobilization, inc...",16.1,"Strengthen domestic resource mobilization, inc...",strengthen domestic resource mobilization incl...
130,17,"Data, monitoring and accountability","By 2020, enhance capacity-building support to ...",17.1,"By 2020, enhance capacity-building support to ...",enhance capacity building support increase sig...


In [18]:
data = pd.read_csv("Attributes_2019-20.csv", sep=";")

In [19]:
data.columns

Index(['Attr_id', 'Chapter_id', 'Chapter_name', 'Table_id', 'Table_name',
       'Description'],
      dtype='object')

In [20]:
data.drop(['Attr_id', 'Table_id'],axis=1,inplace=True)

In [21]:
data

Unnamed: 0,Chapter_id,Chapter_name,Table_name,Description
0,1,General Information,Taluks Hoblies Grama Panchayath Village Accoun...,Taluks
1,1,General Information,Taluks Hoblies Grama Panchayath Village Accoun...,Hoblies
2,1,General Information,Taluks Hoblies Grama Panchayath Village Accoun...,Grama Panchayaths
3,1,General Information,Taluks Hoblies Grama Panchayath Village Accoun...,Circles
4,1,General Information,Inhabited Un-inhabited and Total Villages per ...,"Un-Inhabited Inhabited,Villages"
...,...,...,...,...
1548,17,Other Information,Registration of Births and Deaths,"Total,Births,Late Registered events 2019"
1549,17,Other Information,Registration of Births and Deaths,"Male,Deaths,Late Registered events 2019"
1550,17,Other Information,Registration of Births and Deaths,"Female,Deaths,Late Registered events 2019"
1551,17,Other Information,Registration of Births and Deaths,"Transgender,Deaths,Late Registered events 2019"


In [22]:
data_new = data.groupby(['Chapter_id', 'Chapter_name', 'Table_name'])['Description'].apply(lambda x: ', '.join(x)).reset_index()
data_new

Unnamed: 0,Chapter_id,Chapter_name,Table_name,Description
0,1,General Information,Anthyodaya Card Holders with and without gas c...,"Urban,with cylinder, Rural,with cylinder, Tota..."
1,1,General Information,District Income and Per Capita Income,Gross District Domestic Product Gddp Rs. in La...
2,1,General Information,Inhabited Un-inhabited and Total Villages per ...,"Un-Inhabited Inhabited,Villages, Un-Inhabited ..."
3,1,General Information,No. of Fire Stations on 31-3-2020,"Fire Brigade Stations, Working Fire Brigade Wa..."
4,1,General Information,Non-Priority Card Holders with and without gas...,"Urban,with cylinder, Rural,with cylinder, Tota..."
...,...,...,...,...
140,16,Energy,Sectorwise number of Electricity Consumers as ...,"Lt,Water Supply, Ht,Others, Lt,Others, Ht,Tota..."
141,16,Energy,Sectorwise number of Electricity Consumers on ...,"Ht,Domestic, Lt,Domestic, Ht,Industrial, Lt,In..."
142,17,Other Information,Excise Shops and Consumption of Liquor 2019-20,"Excise Shops In Nos, Iml Liquor,Consumption of..."
143,17,Other Information,Number of Pensioners under Social Security Sch...,"Pensioners,Indira Gandhi National Old Age Pens..."


In [23]:
data_new['concat_value'] = data_new['Table_name'] + ' ' + data_new['Description']

# Create a new DataFrame with selected columns
new_df = data_new[['Chapter_id', 'Chapter_name', 'Description']]
new_df

Unnamed: 0,Chapter_id,Chapter_name,Description
0,1,General Information,"Urban,with cylinder, Rural,with cylinder, Tota..."
1,1,General Information,Gross District Domestic Product Gddp Rs. in La...
2,1,General Information,"Un-Inhabited Inhabited,Villages, Un-Inhabited ..."
3,1,General Information,"Fire Brigade Stations, Working Fire Brigade Wa..."
4,1,General Information,"Urban,with cylinder, Rural,with cylinder, Tota..."
...,...,...,...
140,16,Energy,"Lt,Water Supply, Ht,Others, Lt,Others, Ht,Tota..."
141,16,Energy,"Ht,Domestic, Lt,Domestic, Ht,Industrial, Lt,In..."
142,17,Other Information,"Excise Shops In Nos, Iml Liquor,Consumption of..."
143,17,Other Information,"Pensioners,Indira Gandhi National Old Age Pens..."


In [24]:
new_df = new_df.groupby(['Chapter_id', 'Chapter_name',])['Description'].apply(lambda x: ', '.join(x)).reset_index()
new_df

Unnamed: 0,Chapter_id,Chapter_name,Description
0,1,General Information,"Urban,with cylinder, Rural,with cylinder, Tota..."
1,2,Area and Population,"Male,0-14 2011, Female,0-14 2011, Total,0-14 2..."
2,3,Rainfall,"2009,Actual Annual Rainfall from 2009 to 2019 ..."
3,4,"Agriculture, Horticulture & Sericulture","Male,Numbers,Marginal Agricultural Land Holder..."
4,5,Fisheries,"Indigenous Breed,Cattle, Exotic Breed,Cattle, ..."
5,6,Industries,"Working Factories Nos,No. of Working Factories..."
6,7,Banks,"No. of Branches,Regional Rural Banks In Lakhs,..."
7,8,Co-Operation and Agricultural Marketing,"Society,Agricultural, Members,Agricultural, So..."
8,9,Transport and Communication,"Pucca Road,Panchayat Roads,Length of Rural Roa..."
9,10,Education,"No.of High Schools, Boys Toilet, Girls Toilet,..."


In [25]:
texto_data1=new_df['Description'].tolist()
process_text1=cleaning(texto_data1)

cleaned_sent1 =pd.DataFrame(process_text1)
new_df['new_description']=cleaned_sent1

In [26]:
new_df

Unnamed: 0,Chapter_id,Chapter_name,Description,new_description
0,1,General Information,"Urban,with cylinder, Rural,with cylinder, Tota...",urban cylinder rural cylinder total cylinder s...
1,2,Area and Population,"Male,0-14 2011, Female,0-14 2011, Total,0-14 2...",male female total male female total male femal...
2,3,Rainfall,"2009,Actual Annual Rainfall from 2009 to 2019 ...",actual annual rainfall mm actual annual rainfa...
3,4,"Agriculture, Horticulture & Sericulture","Male,Numbers,Marginal Agricultural Land Holder...",male number marginal agricultural land holder ...
4,5,Fisheries,"Indigenous Breed,Cattle, Exotic Breed,Cattle, ...",indigenous breed cattle exotic breed cattle cr...
5,6,Industries,"Working Factories Nos,No. of Working Factories...",working factory no working factory cane crushe...
6,7,Banks,"No. of Branches,Regional Rural Banks In Lakhs,...",branch regional rural bank lakh atm regional r...
7,8,Co-Operation and Agricultural Marketing,"Society,Agricultural, Members,Agricultural, So...",society agricultural member agricultural socie...
8,9,Transport and Communication,"Pucca Road,Panchayat Roads,Length of Rural Roa...",pucca road panchayat road length rural road km...
9,10,Education,"No.of High Schools, Boys Toilet, Girls Toilet,...",high school boy toilet girl toilet electricity...


In [27]:
cleaned_sent

Unnamed: 0,0
0,build resilience poor vulnerable situation red...
1,ensure men woman particular poor vulnerable eq...
2,eradicate extreme poverty people everywhere cu...
3,reduce least half proportion men woman child a...
4,create sound policy framework state level base...
...,...
127,fully operationalize technology bank science t...
128,promote development transfer dissemination dif...
129,strengthen domestic resource mobilization incl...
130,enhance capacity building support increase sig...


In [28]:
all_text = pd.concat([cleaned_sent, cleaned_sent1])
all_text.reset_index(drop=True)

Unnamed: 0,0
0,build resilience poor vulnerable situation red...
1,ensure men woman particular poor vulnerable eq...
2,eradicate extreme poverty people everywhere cu...
3,reduce least half proportion men woman child a...
4,create sound policy framework state level base...
...,...
144,current year beneficiary bhagyalakshmi scheme ...
145,borewells drilled bore well drilled total bore...
146,tota census house census house vacanthouses ce...
147,beneficiries bhagyajyoti kutirajyoti village e...


In [29]:
# Create a TfidfVectorizer instance
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the text data from df_1 and df_2
tfidf_matrix_1 = tfidf_vectorizer.fit_transform(all_text[0])
tfidf_matrix_2 = tfidf_vectorizer.transform(new_df['new_description'])
print(tfidf_matrix_1.shape)
print(tfidf_matrix_2.shape)


(149, 1611)
(17, 1611)


In [30]:
tfidf_matrix_0 = tfidf_matrix_1[0:132, :] 
tfidf_matrix_0.shape

(132, 1611)

In [31]:
# Compute cosine similarity between vectors of df_1 and df_2
similarity_matrix_sdg = cosine_similarity(tfidf_matrix_2, tfidf_matrix_0)

vec_val = new_df['Chapter_id'].tolist()
sdg_val = sdg_new['Target_id'].tolist()

# Convert cosine similarities to DataFrame
similarity_df_sdg = pd.DataFrame(similarity_matrix_sdg, columns=sdg_val, index=vec_val)


In [32]:
similarity_df_sdg

Unnamed: 0,1.5,1.4,1.1,1.2,1.6,1.3,2.8,2.5,2.3,2.2,...,15.3,15.1,15.5,16.2,16.3,16.5,16.4,16.1,17.1,17.2
1,0.003533,0.003971,0.079481,0.0,0.020326,0.0,0.049752,0.009631,0.041429,0.0,...,0.018858,0.002832,0.0,0.0,0.0,0.0,0.025241,0.048132,0.016659,0.03034
2,0.0,0.03886,0.154769,0.042542,0.0,0.039562,0.002683,0.0,0.050717,0.035423,...,0.030933,0.069965,0.036743,0.0,0.0,0.0,0.048202,0.066856,0.0,0.0
3,0.0,0.0,0.006477,0.0,0.0,0.0,0.0,0.0,0.00169,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.001767,0.002451,0.0,0.0
4,0.014983,0.066137,0.002741,0.0,0.011949,0.000155,0.053212,0.012099,0.087631,0.000105,...,0.0,0.042151,0.0,0.018818,0.00019,0.000658,0.020959,0.026391,0.00237,0.019813
5,0.006739,0.0,0.01821,0.0,0.011926,0.0,0.012293,0.095853,0.044948,0.0,...,0.0,0.0,0.0,0.007875,0.0,0.009879,0.019871,0.03921,0.017749,0.018339
6,0.001103,0.013232,0.001834,0.0,0.004231,0.0,0.024967,0.0,0.032297,0.0,...,0.0,0.000884,0.0,0.0,0.0,0.0,0.017912,0.024843,0.004728,0.0
7,0.0,0.01579,0.018202,0.0,0.0,0.0,0.0,0.065072,0.00861,0.0,...,0.0,0.0,0.041027,0.0,0.0,0.087342,0.003887,0.005392,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.023028,0.008652,0.052136,0.0,...,0.0,0.0,0.0,0.0,0.0,0.015703,0.016774,0.023265,0.0,0.0
9,0.009137,0.018258,0.016903,0.007021,0.014181,0.0,0.005358,0.005208,0.012002,0.0,...,0.006093,0.0,0.0,0.010678,0.0108,0.027629,0.010777,0.014947,0.005984,0.0
10,0.0,0.0,0.020155,0.016133,0.001225,0.006023,0.0,0.0,0.016668,0.082729,...,0.0,0.006644,0.010039,0.0,0.0,0.0,0.017427,0.059897,0.008115,0.0


In [33]:
def find_top_n_similarities(similarity_matrix, n, sdg, chapter_df):
    top_n_similarities = []
    top_n_similarity_taget_id = []
    top_n_similarity_goal_id = []
    
    for index, row in similarity_matrix.iterrows():
        top_n_indices = sorted(range(len(row)), key=lambda i: row.values[i], reverse=True)[:n]
        top_n_values = [row.values[i] for i in top_n_indices]
        top_n_target = [similarity_matrix.columns[i] for i in top_n_indices]
        # Fetch Goal_id from sdg DataFrame using Target_id values
        top_n_goals = [sdg['Goal No.'].loc[i] for i in top_n_indices]   
        
        top_n_similarities.append(top_n_values)
        top_n_similarity_taget_id.append(top_n_target)
        top_n_similarity_goal_id.append(top_n_goals)
    
    chapter_df['top_n_similarity_goal_id'] = top_n_similarity_goal_id
    chapter_df['top_n_similarity_target_id'] = top_n_similarity_taget_id
    chapter_df['top_n_similarities'] = top_n_similarities
    
    return chapter_df



In [34]:
# Assuming similarity_matrix is your 18x132 numpy array
n = 3  # Number of top similarities to find
result_lookup = find_top_n_similarities(similarity_df_sdg, n, sdg_new, new_df)

In [35]:
result_lookup

Unnamed: 0,Chapter_id,Chapter_name,Description,new_description,top_n_similarity_goal_id,top_n_similarity_target_id,top_n_similarities
0,1,General Information,"Urban,with cylinder, Rural,with cylinder, Tota...",urban cylinder rural cylinder total cylinder s...,"[11, 1, 8]","[11.8, 1.1, 8.9]","[0.157923117146608, 0.07948052860421147, 0.064..."
1,2,Area and Population,"Male,0-14 2011, Female,0-14 2011, Total,0-14 2...",male female total male female total male femal...,"[11, 4, 10]","[11.8, 4.5, 10.1]","[0.2323934573184987, 0.17249840493091198, 0.15..."
2,3,Rainfall,"2009,Actual Annual Rainfall from 2009 to 2019 ...",actual annual rainfall mm actual annual rainfa...,"[8, 8, 11]","[8.2, 8.1, 11.6]","[0.095461048818236, 0.07765302031127559, 0.062..."
3,4,"Agriculture, Horticulture & Sericulture","Male,Numbers,Marginal Agricultural Land Holder...",male number marginal agricultural land holder ...,"[14, 2, 5]","[14.3, 2.4, 5.7]","[0.3227076488754788, 0.29529472652711486, 0.28..."
4,5,Fisheries,"Indigenous Breed,Cattle, Exotic Breed,Cattle, ...",indigenous breed cattle exotic breed cattle cr...,"[2, 12, 4]","[2.5, 12.8, 4.5]","[0.09585323535678761, 0.07850391606341041, 0.0..."
5,6,Industries,"Working Factories Nos,No. of Working Factories...",working factory no working factory cane crushe...,"[8, 4, 5]","[8.5, 4.5, 5.3]","[0.20345814203500553, 0.0641745525792422, 0.05..."
6,7,Banks,"No. of Branches,Regional Rural Banks In Lakhs,...",branch regional rural bank lakh atm regional r...,"[8, 2, 15]","[8.1, 2.6, 15.6]","[0.17406319691821304, 0.11208654873492278, 0.1..."
7,8,Co-Operation and Agricultural Marketing,"Society,Agricultural, Members,Agricultural, So...",society agricultural member agricultural socie...,"[2, 2, 11]","[2.4, 2.6, 11.3]","[0.10954079083738143, 0.08824982748319578, 0.0..."
8,9,Transport and Communication,"Pucca Road,Panchayat Roads,Length of Rural Roa...",pucca road panchayat road length rural road km...,"[3, 9, 11]","[3.5, 9.1, 11.2]","[0.18990443337377, 0.13222315725616726, 0.0789..."
9,10,Education,"No.of High Schools, Boys Toilet, Girls Toilet,...",high school boy toilet girl toilet electricity...,"[4, 5, 3]","[4.2, 5.2, 3.3]","[0.14812536255151312, 0.12344888029589518, 0.0..."


In [31]:
#result_lookup.to_csv('Lookup_2016-17_TFIDF.csv', sep=";", index=False)

Unwrap lists

In [36]:
# Create a list of dictionaries
lookup_rows = []
for chapter_id, data in result_lookup.iterrows():
    for i in range(len(data['top_n_similarity_target_id'])):
        row = {
            'Chapter_id': data['Chapter_id'],
            'Chapter_name': data['Chapter_name'],
            'Description': data['Description'],
            'new_description': data['new_description'],
            'top_n_similarity_goal_id': data['top_n_similarity_goal_id'][i],
            'top_n_similarity_target_id': data['top_n_similarity_target_id'][i],
            'top_n_similarities': data['top_n_similarities'][i]
        }
        lookup_rows.append(row)


In [37]:
# Convert list of dictionaries to DataFrame
final_lookup = pd.DataFrame(lookup_rows)
final_lookup

Unnamed: 0,Chapter_id,Chapter_name,Description,new_description,top_n_similarity_goal_id,top_n_similarity_target_id,top_n_similarities
0,1,General Information,"Urban,with cylinder, Rural,with cylinder, Tota...",urban cylinder rural cylinder total cylinder s...,11,11.8,0.157923
1,1,General Information,"Urban,with cylinder, Rural,with cylinder, Tota...",urban cylinder rural cylinder total cylinder s...,1,1.1,0.079481
2,1,General Information,"Urban,with cylinder, Rural,with cylinder, Tota...",urban cylinder rural cylinder total cylinder s...,8,8.9,0.064262
3,2,Area and Population,"Male,0-14 2011, Female,0-14 2011, Total,0-14 2...",male female total male female total male femal...,11,11.8,0.232393
4,2,Area and Population,"Male,0-14 2011, Female,0-14 2011, Total,0-14 2...",male female total male female total male femal...,4,4.5,0.172498
5,2,Area and Population,"Male,0-14 2011, Female,0-14 2011, Total,0-14 2...",male female total male female total male femal...,10,10.1,0.155031
6,3,Rainfall,"2009,Actual Annual Rainfall from 2009 to 2019 ...",actual annual rainfall mm actual annual rainfa...,8,8.2,0.095461
7,3,Rainfall,"2009,Actual Annual Rainfall from 2009 to 2019 ...",actual annual rainfall mm actual annual rainfa...,8,8.1,0.077653
8,3,Rainfall,"2009,Actual Annual Rainfall from 2009 to 2019 ...",actual annual rainfall mm actual annual rainfa...,11,11.6,0.062247
9,4,"Agriculture, Horticulture & Sericulture","Male,Numbers,Marginal Agricultural Land Holder...",male number marginal agricultural land holder ...,14,14.3,0.322708


In [38]:
final_lookup.shape

(51, 7)

In [81]:
#final_lookup.to_csv('Lookup_2016-17_TFIDF_expanded.csv', sep=";", index=False)

Goal - Chapter Similarity

In [39]:
similarity_df_sdg_trans = similarity_df_sdg.T

In [40]:
def find_similarity_sdg(similarity_matrix, threshold, chapter_df, sdg):
    top_n_similarities = []
    top_n_similarity_chapter_id = []
    top_n_similarity_chapter_name = []
    n = threshold
    for index, row in similarity_matrix.iterrows():
        top_n_indices = sorted(range(len(row)), key=lambda i: row.values[i], reverse=True)[:n]
        top_n_values = [row.values[i] for i in top_n_indices]
        top_n_chapter = [similarity_matrix.columns[i] for i in top_n_indices]
        top_n_chapter_name = [new_df['Chapter_name'].loc[i] for i in top_n_indices]   
        
        top_n_similarities.append(top_n_values)
        top_n_similarity_chapter_id.append(top_n_chapter)
        top_n_similarity_chapter_name.append(top_n_chapter_name)
    
    sdg['top_n_similarity_chapter_name'] = top_n_similarity_chapter_name
    sdg['top_n_similarity_chapter_id'] = top_n_similarity_chapter_id
    sdg['top_n_similarities'] = top_n_similarities

    return sdg

In [41]:
# Example usage:
threshold_value = 3  # Define the threshold value

# Call the function to find chapter_ids with similarity greater than the threshold
result_sdg = find_similarity_sdg(similarity_df_sdg_trans, threshold_value, new_df, sdg_new)


In [42]:
result_sdg.head(10)

Unnamed: 0,Goal No.,Goal,Targets,Target_id,Tentative Indicators,new_targets,top_n_similarity_chapter_name,top_n_similarity_chapter_id,top_n_similarities
0,1,End poverty in all its forms everywhere,"By 2030, build the resilience of the poor and ...",1.5,"By 2030, build the resilience of the poor and ...",build resilience poor vulnerable situation red...,"[Other Information, Health & Family Welfare Se...","[17, 11, 4]","[0.10870508744286651, 0.0393113751777395, 0.01..."
1,1,End poverty in all its forms everywhere,"By 2030, ensure that all men and women, in par...",1.4,"By 2030, ensure that all men and women, in par...",ensure men woman particular poor vulnerable eq...,"[Agriculture, Horticulture & Sericulture, Area...","[4, 2, 14]","[0.06613657495546826, 0.03886038848573475, 0.0..."
2,1,End poverty in all its forms everywhere,"By 2030, eradicate extreme poverty for all peo...",1.1,"By 2030, eradicate extreme poverty for all peo...",eradicate extreme poverty people everywhere cu...,"[Area and Population, Rural Development and Pa...","[2, 14, 1]","[0.15476874822527176, 0.09068401386993723, 0.0..."
3,1,End poverty in all its forms everywhere,"By 2030, reduce at least by half the proportio...",1.2,"By 2030, reduce at least by half the proportio...",reduce least half proportion men woman child a...,"[Women & Child Development, Other Information,...","[13, 17, 2]","[0.07441870405196868, 0.044361166895772054, 0...."
4,1,End poverty in all its forms everywhere,Create sound policy frameworks at the state le...,1.6,Create sound policy frameworks at the state le...,create sound policy framework state level base...,"[General Information, Rural Development and Pa...","[1, 14, 9]","[0.020326179910114164, 0.019298878866827254, 0..."
5,1,End poverty in all its forms everywhere,Implement nationally appropriate social protec...,1.3,Implement nationally appropriate social protec...,implement nationally appropriate social protec...,"[Area and Population, Women & Child Developmen...","[2, 13, 17]","[0.039561980985016856, 0.03640826296654204, 0...."
6,2,"End hunger, achieve food security and improved...",Adopt measures to ensure the proper functionin...,2.8,Adopt measures to ensure the proper functionin...,adopt measure ensure proper functioning food c...,"[Agriculture, Horticulture & Sericulture, Wome...","[4, 13, 1]","[0.05321219018019437, 0.05200241313109652, 0.0..."
7,2,"End hunger, achieve food security and improved...","By 2020, maintain the genetic diversity of see...",2.5,"By 2020, maintain the genetic diversity of see...",maintain genetic diversity seed cultivated pla...,"[Fisheries, Banks, Other Information]","[5, 7, 17]","[0.09585323535678761, 0.0650723309001409, 0.02..."
8,2,"End hunger, achieve food security and improved...","By 2030, double the agricultural productivity ...",2.3,"By 2030, double the agricultural productivity ...",double agricultural productivity income small ...,"[Agriculture, Horticulture & Sericulture, Ener...","[4, 16, 8]","[0.08763072299198071, 0.07688641458656917, 0.0..."
9,2,"End hunger, achieve food security and improved...","By 2030, end all forms of malnutrition, includ...",2.2,"By 2030, end all forms of malnutrition, includ...",end form malnutrition including achieving inte...,"[Women & Child Development, Education, Social ...","[13, 10, 12]","[0.10452473156146246, 0.08272896000536745, 0.0..."


In [43]:
# Create a dictionary to store chapter details
chapter_details = {}

# Iterate over the rows of the SDG DataFrame
for index, row in result_sdg.iterrows():
    chapter_id = row['top_n_similarity_chapter_id']
    values = row['top_n_similarities']
    goal_no = row['Goal No.']
    target_id = row['Target_id']
    #print(chapter_id, values, goal_no, target_id)
    for ids in range(len(chapter_id)):
        #print(ids)
        
        #print(chapter_id[ids])
        #if index<9:
            #print(chapter_details)
        if chapter_id[ids] not in chapter_details:
            chapter_details[chapter_id[ids]] = { 'Chapter_id' : chapter_id[ids], 'top_n_similarity_goal_id' : [goal_no], 'top_n_similarity_target_id': [target_id], 'top_n_similarities' : [values[ids]] }
            #print(chapter_details[chapter_id[ids]])
        else:
            #print(chapter_details[chapter_id[ids]])
            g_id = chapter_details[chapter_id[ids]].get('top_n_similarity_goal_id')
            t_id = chapter_details[chapter_id[ids]].get('top_n_similarity_target_id')
            val = chapter_details[chapter_id[ids]].get('top_n_similarities')
            g_id.append(goal_no)
            #print(g_id)
            t_id.append(target_id)
            val.append(values[ids])
            #new_g_id = list(set(g_id))
            #print(new_g_id)
            #new_t_id = list(set(t_id))
            chapter_details[chapter_id[ids]].update({'top_n_similarity_goal_id' : g_id})
            chapter_details[chapter_id[ids]].update({'top_n_similarity_target_id' : t_id})
            chapter_details[chapter_id[ids]].update({'top_n_similarities' : val})
            

In [44]:
chapter_details

{17: {'Chapter_id': 17,
  'top_n_similarity_goal_id': [1,
   1,
   1,
   2,
   3,
   3,
   3,
   3,
   3,
   3,
   6,
   8,
   8,
   8,
   9,
   9,
   10,
   10,
   11,
   12,
   12,
   12,
   13,
   13,
   14,
   15,
   15,
   15,
   15,
   17],
  'top_n_similarity_target_id': [1.5,
   1.2,
   1.3,
   2.5,
   3.7,
   3.5,
   3.2,
   3.6,
   3.1,
   3.12,
   6.8,
   8.6,
   8.1,
   8.1,
   9.4,
   9.6,
   10.2,
   10.3,
   11.4,
   12.2,
   12.4,
   12.6,
   13.2,
   13.1,
   14.9,
   15.8,
   15.9,
   15.7,
   15.1,
   17.1],
  'top_n_similarities': [0.10870508744286651,
   0.044361166895772054,
   0.01889432740569135,
   0.022629805845145144,
   0.018306433882522132,
   0.04634784112509026,
   0.09891899806926391,
   0.04104316637128463,
   0.09871496028688698,
   0.02234409133726035,
   0.05868626377406672,
   0.0035027148724235367,
   0.056153155351963885,
   0.015452731726265259,
   0.010472675103065964,
   0.005886764423653437,
   0.021412492255437364,
   0.0021453870115744644,
 

In [45]:
# Convert dictionary to list of dictionaries
rows = []
for chapter_id, values in chapter_details.items():
    row = {'Chapter_id': chapter_id,
           'top_n_similarity_goal_id': values['top_n_similarity_goal_id'],
           'top_n_similarity_target_id': values['top_n_similarity_target_id'],
           'top_n_similarities': values['top_n_similarities']}
    rows.append(row)

# Create DataFrame
goals = pd.DataFrame(rows)


In [46]:
goals

Unnamed: 0,Chapter_id,top_n_similarity_goal_id,top_n_similarity_target_id,top_n_similarities
0,17,"[1, 1, 1, 2, 3, 3, 3, 3, 3, 3, 6, 8, 8, 8, 9, ...","[1.5, 1.2, 1.3, 2.5, 3.7, 3.5, 3.2, 3.6, 3.1, ...","[0.10870508744286651, 0.044361166895772054, 0...."
1,11,"[1, 3, 3, 3, 3, 3, 11, 13, 15, 15, 16]","[1.5, 3.5, 3.2, 3.1, 3.8, 3.11, 11.5, 13.1, 15...","[0.0393113751777395, 0.07154823807231261, 0.05..."
2,4,"[1, 1, 2, 2, 2, 2, 2, 4, 5, 7, 8, 9, 9, 11, 11...","[1.5, 1.4, 2.8, 2.3, 2.4, 2.7, 2.6, 4.4, 5.7, ...","[0.014983367160861686, 0.06613657495546826, 0...."
3,2,"[1, 1, 1, 1, 2, 2, 3, 3, 3, 3, 3, 4, 4, 5, 5, ...","[1.4, 1.1, 1.2, 1.3, 2.1, 2.7, 3.7, 3.3, 3.1, ...","[0.03886038848573475, 0.15476874822527176, 0.0..."
4,14,"[1, 1, 1, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 6, ...","[1.4, 1.1, 1.6, 3.8, 3.11, 4.8, 4.9, 4.3, 4.6,...","[0.03375656975226316, 0.09068401386993723, 0.0..."
5,1,"[1, 1, 2, 3, 3, 3, 6, 6, 7, 7, 8, 8, 9, 9, 10,...","[1.1, 1.6, 2.8, 3.12, 3.9, 3.1, 6.3, 6.8, 7.3,...","[0.07948052860421147, 0.020326179910114164, 0...."
6,13,"[1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, ...","[1.2, 1.3, 2.8, 2.2, 2.4, 3.7, 3.6, 3.9, 3.4, ...","[0.07441870405196868, 0.03640826296654204, 0.0..."
7,9,"[1, 3, 4, 4, 5, 6, 6, 9, 9, 11, 11, 11, 12, 12...","[1.6, 3.5, 4.9, 4.4, 5.8, 6.5, 6.3, 9.1, 9.8, ...","[0.014181057021808618, 0.18990443337377, 0.030..."
8,5,"[2, 3, 3, 4, 5, 6, 8, 9, 12, 13, 13, 13, 14, 1...","[2.5, 3.6, 3.12, 4.5, 5.4, 6.6, 8.4, 9.8, 12.8...","[0.09585323535678761, 0.02806293201751614, 0.0..."
9,7,"[2, 2, 3, 3, 4, 6, 8, 8, 9, 9, 9, 9, 11, 11, 1...","[2.5, 2.6, 3.3, 3.1, 4.9, 6.4, 8.2, 8.1, 9.4, ...","[0.0650723309001409, 0.11208654873492278, 0.04..."


In [47]:
# Convert dictionary to list of dictionaries
all_rows = []
for chapter_id, values in chapter_details.items():
    for i in range(len(values['top_n_similarity_goal_id'])):
        row = {'Chapter_id': values['Chapter_id'],
               'top_n_similarity_goal_id': values['top_n_similarity_goal_id'][i],
               'top_n_similarity_target_id': values['top_n_similarity_target_id'][i],
               'top_n_similarities': values['top_n_similarities'][i]}
        all_rows.append(row)


In [48]:
# Create DataFrame
all_goals = pd.DataFrame(all_rows)

In [49]:
all_goals

Unnamed: 0,Chapter_id,top_n_similarity_goal_id,top_n_similarity_target_id,top_n_similarities
0,17,1,1.5,0.108705
1,17,1,1.2,0.044361
2,17,1,1.3,0.018894
3,17,2,2.5,0.022630
4,17,3,3.7,0.018306
...,...,...,...,...
391,3,8,8.2,0.095461
392,3,8,8.1,0.077653
393,3,10,10.4,0.000000
394,3,11,11.6,0.062247


In [50]:
#creating dictionary for movie id
chapter_names_dict = dict(zip(new_df['Chapter_id'], new_df['Chapter_name']))

In [51]:
all_goals['Chapter_name'] = all_goals['Chapter_id'].map(chapter_names_dict.get)

In [52]:
goal_name = dict(zip(sdg_new['Goal No.'], sdg_new['Goal']))

In [53]:
target_name = dict(zip(sdg_new['Target_id'], sdg_new['Targets']))

In [54]:
all_goals['Goal'] = all_goals['top_n_similarity_goal_id'].map(goal_name.get)

In [55]:
all_goals['Targets'] = all_goals['top_n_similarity_target_id'].map(target_name.get)

In [56]:
all_goals.columns

Index(['Chapter_id', 'top_n_similarity_goal_id', 'top_n_similarity_target_id',
       'top_n_similarities', 'Chapter_name', 'Goal', 'Targets'],
      dtype='object')

In [57]:
all_goals = all_goals.loc[:,['Chapter_id', 'Chapter_name', 'Goal', 'Targets', 'top_n_similarity_goal_id', 'top_n_similarity_target_id','top_n_similarities']]

In [58]:
all_goals

Unnamed: 0,Chapter_id,Chapter_name,Goal,Targets,top_n_similarity_goal_id,top_n_similarity_target_id,top_n_similarities
0,17,Other Information,End poverty in all its forms everywhere,"By 2030, build the resilience of the poor and ...",1,1.5,0.108705
1,17,Other Information,End poverty in all its forms everywhere,"By 2030, reduce at least by half the proportio...",1,1.2,0.044361
2,17,Other Information,End poverty in all its forms everywhere,Implement nationally appropriate social protec...,1,1.3,0.018894
3,17,Other Information,"End hunger, achieve food security and improved...","By 2020, maintain the genetic diversity of see...",2,2.5,0.022630
4,17,Other Information,Ensure healthy lives and promote well-being fo...,"Achieve universal health coverage, including f...",3,3.7,0.018306
...,...,...,...,...,...,...,...
391,3,Rainfall,"Promote sustained, inclusive and sustainable e...",Achieve higher levels of economic productivity...,8,8.2,0.095461
392,3,Rainfall,"Promote sustained, inclusive and sustainable e...",Sustain per capita economic growth in accordan...,8,8.1,0.077653
393,3,Rainfall,Reduce inequality within the State,"Adopt policies, especially fiscal, wage and so...",10,10.4,0.000000
394,3,Rainfall,"Make cities and human settlements inclusive, s...","By 2030, reduce the adverse per capita environ...",11,11.6,0.062247


In [59]:
all_goals.to_csv('Lookup_2019-20.csv', sep=";", index=False)

In [62]:
#let's create the vectorizer and fit the corpus and transform them accordingly
v = TfidfVectorizer()
v.fit(corpus)
transform_output = v.transform(corpus)

In [63]:
#let's print the vocabulary

print(v.vocabulary_)



In [64]:
#let's print the idf of each word:

all_feature_names = v.get_feature_names_out()

for word in all_feature_names:
    
    #let's get the index in the vocabulary
    indx = v.vocabulary_.get(word)
    
    #get the score
    idf_score = v.idf_[indx]
    
    print(f"{word} : {idf_score}")

abuse : 4.799227511282801
accelerated : 5.204692619390966
accepted : 5.204692619390966
access : 2.463852595465765
accessible : 4.799227511282801
accident : 5.204692619390966
accordance : 3.2587824703356527
according : 4.799227511282801
account : 4.511545438831021
accountable : 5.204692619390966
achieve : 3.332890442489375
achieving : 4.511545438831021
acquire : 5.204692619390966
across : 4.799227511282801
action : 3.700615222614692
activity : 4.799227511282801
adaptation : 4.288401887516811
adapted : 5.204692619390966
adaptive : 5.204692619390966
added : 4.106080330722856
addition : 4.799227511282801
address : 4.511545438831021
adequate : 4.288401887516811
adolescent : 4.799227511282801
adopt : 4.106080330722856
adopted : 5.204692619390966
adopting : 5.204692619390966
adoption : 4.799227511282801
adult : 4.288401887516811
advance : 5.204692619390966
advanced : 5.204692619390966
adverse : 4.799227511282801
affect : 5.204692619390966
affected : 4.511545438831021
affirms : 5.2046926193909

revenue : 5.204692619390966
review : 5.204692619390966
right : 3.8183982582710754
risk : 4.106080330722856
river : 5.204692619390966
road : 4.511545438831021
round : 4.799227511282801
rti : 5.204692619390966
rule : 5.204692619390966
run : 5.204692619390966
rural : 4.106080330722856
safe : 3.499944527152541
safeguard : 4.799227511282801
safely : 4.511545438831021
safety : 5.204692619390966
sanitation : 4.288401887516811
satisfied : 4.799227511282801
scale : 4.511545438831021
scarcity : 5.204692619390966
scheme : 5.204692619390966
scholarship : 5.204692619390966
school : 4.799227511282801
science : 4.511545438831021
scientific : 4.511545438831021
season : 5.204692619390966
seat : 5.204692619390966
secondary : 4.288401887516811
sector : 3.951929650895598
secure : 4.511545438831021
seed : 5.204692619390966
seized : 5.204692619390966
selection : 5.204692619390966
sendai : 5.204692619390966
sensitive : 4.799227511282801
service : 2.7623455840217614
settlement : 4.511545438831021
severe : 5.2

In [65]:
#let's print the transformed output from tf-idf
print(transform_output.shape)

(133, 1128)


In [66]:
type(transform_output)

scipy.sparse._csr.csr_matrix

In [29]:
sdg_new_copy = sdg_new.copy()

In [30]:
y = sdg_new_copy[['Goal No.']]

In [32]:
from sklearn.model_selection import GridSearchCV #importing GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.naive_bayes import MultinomialNB


In [48]:
# Create transformers for text features
text_transformer = TfidfVectorizer()

# Create a column transformer to apply transformers to text and numerical columns only
preprocessor = ColumnTransformer(
    transformers=[
        ('text', text_transformer, 'new_targets')],
    remainder='drop'  # Drop any other columns not specified above
)

# Create a pipeline with the column transformer and Multinomial Naive Bayes
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', MultinomialNB())
])

# Define the parameter grid with different values for max_features
param_grid = {
    'preprocessor__text__max_features': [500, 1000, 5000],  
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(sdg_new_copy, y)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [49]:
print("Best Max Features:", grid_search.best_params_['preprocessor__text__max_features'])
print("Best Accuracy:", grid_search.best_score_)

Best Max Features: 500
Best Accuracy: 0.41396011396011395


In [51]:
# Create transformers for text features with specified max features
text_transformer1 = TfidfVectorizer(max_features=500)

# Create a column transformer to apply transformers to text and numerical columns only
preprocessor1 = ColumnTransformer(
    transformers=[
        ('text', text_transformer, 'new_targets')],
    remainder='drop' # Drop any other columns not specified above
)

# Create a pipeline with the column transformer and Logistic Regression
pipeline1 = Pipeline([
    ('preprocessor', preprocessor1),
    ('clf', LogisticRegression(solver='saga', max_iter=1000))
])

# Define the parameter grid with different values for C (regularization parameter)
param_grid1 = {
    'clf__C': [0.001, 0.01, 0.1, 1, 10, 100]  
}

# Perform grid search with cross-validation
grid_search1 = GridSearchCV(pipeline1, param_grid1, cv=5, scoring='accuracy')
grid_search1.fit(sdg_new_copy, y)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)


In [52]:
print("Best Parameters:", grid_search1.best_params_)        
print("Best Accuracy:", grid_search1.best_score_)           

Best Parameters: {'clf__C': 10}
Best Accuracy: 0.5037037037037038


In [30]:
sdg_new['vec'] = sdg_new['tokens'].apply(sent_vec)

In [31]:
sdg_new

Unnamed: 0,Goal No.,Goal,Targets,Target_id,Tentative Indicators,new_targets,tokens,vec
0,1,End poverty in all its forms everywhere,"By 2030, build the resilience of the poor and ...",1.5,"By 2030, build the resilience of the poor and ...","[2030, build, resilience, poor, vulnerable, si...","[situation, vulnerable, miss, extreme, evacuat...","[0.05813743954613095, 0.062311808268229164, -0..."
1,1,End poverty in all its forms everywhere,"By 2030, ensure that all men and women, in par...",1.4,"By 2030, ensure that all men and women, in par...","[2030, ensure, man, woman, particular, poor, v...","[woman, economic, appropriate, microfinance, p...","[0.07795173891129033, 0.0019354051159274194, -..."
2,1,End poverty in all its forms everywhere,"By 2030, eradicate extreme poverty for all peo...",1.1,"By 2030, eradicate extreme poverty for all peo...","[2030, eradicate, extreme, poverty, people, cu...","[population, international, day, sex, eradicat...","[0.021107549252717392, 0.0068206787109375, -0...."
3,1,End poverty in all its forms everywhere,"By 2030, reduce at least by half the proportio...",1.2,"By 2030, reduce at least by half the proportio...","[2030, reduce, half, proportion, man, woman, c...","[woman, population, sex, dimension, disaggrega...","[0.05279862253289474, 0.02122096011513158, 0.0..."
4,1,End poverty in all its forms everywhere,Create sound policy frameworks at the state le...,1.6,Create sound policy frameworks at the state le...,"[create, sound, policy, framework, state, leve...","[multilateral, sustainably, policy, eradicate,...","[-0.0144439697265625, 0.039288838704427086, 0...."
...,...,...,...,...,...,...,...,...
128,16,Strengthen the means of implementation and rev...,Fully operationalize the technology bank and s...,16.5,Fully operationalize the technology bank and s...,"[fully, operationalize, technology, bank, scie...","[enhance, innovation, enable, science, capacit...","[-0.01899157072368421, -0.030575400904605265, ..."
129,16,Strengthen the means of implementation and rev...,"Promote the development, transfer, disseminati...",16.4,"Promote the development, transfer, disseminati...","[promote, development, transfer, dissemination...","[funding, promote, environmentally, disseminat...","[-0.04981231689453125, 0.037671407063802086, 0..."
130,16,Strengthen the means of implementation and rev...,"Strengthen domestic resource mobilization, inc...",16.1,"Strengthen domestic resource mobilization, inc...","[strengthen, domestic, resource, mobilization,...","[international, capacity, percentage, domestic...","[-0.04270765516493055, 0.016059027777777776, 0..."
131,17,"Data, monitoring and accountability","By 2020, enhance capacity-building support to ...",7.1,"By 2020, enhance capacity-building support to ...","[2020, enhance, capacity, build, support, incr...","[enhance, disaggregation, reliable, target, re...","[-0.012115478515625, -0.04031553722563244, -0...."


In [61]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Sample DataFrames with text columns
data_1 = {
    'id': [1, 2, 3],
    'new_target': ['This is the first document.', 'This document is the second document.', 'And this is the third one.']
}

data_2 = {
    'id': [4, 5, 6],
    'description': ['This is the fourth document.', 'This document is the fifth document.', 'And this is the sixth one.']
}

df_1 = pd.DataFrame(data_1)
df_2 = pd.DataFrame(data_2)

# Create a TfidfVectorizer instance
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the text data from df_1 and df_2
tfidf_matrix_1 = tfidf_vectorizer.fit_transform(df_1['new_target'])
tfidf_matrix_2 = tfidf_vectorizer.transform(df_2['description'])

# Compute cosine similarity between vectors of df_1 and df_2
cosine_similarities = cosine_similarity(tfidf_matrix_1, tfidf_matrix_2)

# Convert cosine similarities to DataFrame
cosine_similarities_df = pd.DataFrame(cosine_similarities, columns=df_2['id'], index=df_1['id'])

print(cosine_similarities_df)


id         4         5         6
id                              
1   0.786785  0.741891  0.370065
2   0.827774  0.877865  0.287134
3   0.408115  0.283804  0.867682


In [52]:
import pandas as pd

# Sample dictionary
data = {
    17: {'Chapter_id': 17, 'top_n_similarity_goal_id': [1, 3, 3], 'top_n_similarity_target_id': [1.5, 3.7, 3.5]},
    9: {'Chapter_id': 9, 'top_n_similarity_goal_id': [1, 1, 3], 'top_n_similarity_target_id': [1.5, 1.6, 3.5]}
}

# Convert dictionary to list of dictionaries
rows = []
for chapter_id, values in data.items():
    for i in range(len(values['top_n_similarity_goal_id'])):
        row = {'Chapter_id': values['Chapter_id'],
               'top_n_similarity_goal_id': values['top_n_similarity_goal_id'][i],
               'top_n_similarity_target_id': values['top_n_similarity_target_id'][i]}
        rows.append(row)

# Create DataFrame
df = pd.DataFrame(rows)
print(df)


   Chapter_id  top_n_similarity_goal_id  top_n_similarity_target_id
0          17                         1                         1.5
1          17                         3                         3.7
2          17                         3                         3.5
3           9                         1                         1.5
4           9                         1                         1.6
5           9                         3                         3.5
