# Import libraries

In [None]:
import re
import subprocess
import glob
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer 

# Defining file paths

In [None]:
# Plain-text file with the SDG search terms; it is read line by line further
# down, each line being split on "&" into code, category name and terms.
keywordsFile = "../input/sdgs-keywords/terms.txt"

In [None]:
# Output directories: one for the exported CSV tables, one for the heatmap
# images. Both are used as string prefixes, hence the trailing slash.
filePathResultsCSV = './CSV/'
filePathResultsFigs = './figs/'

In [None]:
# Target files for the scaled sum and average tables.
filepathCSVSumNorm = f"{filePathResultsCSV}sumNorm.csv"
filepathCSVAverageNorm = f"{filePathResultsCSV}averageNorm.csv"

In [None]:
# Target files for the sum and average heatmap images.
filepathHMSumNorm = f"{filePathResultsFigs}sumNorm.png"
filepathHMAverageNorm = f"{filePathResultsFigs}averageNorm.png"

# Housekeeping between runs

Deletes all output between the runs; Kaggle requires this.

In [None]:
import os
import shutil

# Start every run from empty output directories. Each directory is handled
# independently: if only one of them is missing, the other one is still
# cleared, so the directory creation below cannot trip over a stale directory.
# Only "directory does not exist" is tolerated; any other failure (for example
# a permission problem) is reported instead of being silently swallowed.
for outputDir in (filePathResultsCSV, filePathResultsFigs):
    try:
        shutil.rmtree(outputDir)
    except FileNotFoundError:
        print("Directories missing")
    os.makedirs(outputDir, exist_ok=True)

# Lemmatization and Stemming Functions

In [None]:
def termsTokenizer(str_input):
    """Lower-case ``str_input``, split it into words with TextBlob and stem them.

    Returns the stemmed words as a list, in their order of appearance.
    """
    lowered = str_input.lower()
    return [word.stem() for word in TextBlob(lowered).words]


In [None]:
def termsStemming(searchTerms):
    """Return the stemmed form of every search term, in the same order.

    Each term is passed through ``termsTokenizer`` exactly once. A term with
    a single word becomes that word's stem; a longer term becomes its first
    two stems joined by one space (further words are ignored). A term that
    produces no tokens at all, such as an empty string, maps to ``""``
    rather than raising ``IndexError``.
    """
    searchTermsStemmed = []
    for term in searchTerms:
        stems = termsTokenizer(term)
        # Slicing keeps at most two stems and copes with zero or one stem.
        searchTermsStemmed.append(" ".join(str(stem) for stem in stems[:2]))
    return searchTermsStemmed

# Search Terms Extraction

The search terms are given as a LaTeX table saved in a plaintext file format. We need to separate the search terms:
> Term's category code & Term's category name & search term foo, bar \\\

For example:
> G9 & Industry, Innovation, Infrastructure & technological innovations \\\

In [None]:
def termsSeperator(rawTerms):
    """Clean the comma-separated search terms taken from one table row.

    Every term is stripped of surrounding whitespace. The final term also
    has all of its backslashes removed, which drops the LaTeX row terminator
    (a double backslash) found at the end of the line.

    The final term is recognised by its position. Comparing by object
    identity would also match an earlier term that happens to be the very
    same string object.

    Returns a new list; ``rawTerms`` itself is left untouched.
    """
    lastIndex = len(rawTerms) - 1
    termsArray = []
    for index, term in enumerate(rawTerms):
        if index == lastIndex:
            # Removing every single backslash also removes every pair.
            term = term.replace("\\", "")
        termsArray.append(term.strip())
    return termsArray

In [None]:
def pdfFiletypeTrimmer(filename):
    """Return ``filename`` without a trailing ``.pdf`` extension.

    The extension is matched case-insensitively, so ``.PDF`` files are
    trimmed as well. Only the end of the name is touched; a ``.pdf`` that
    appears in the middle of the name is kept. Names without the extension
    are returned unchanged.
    """
    extension = ".pdf"
    if filename.lower().endswith(extension):
        return filename[:-len(extension)]
    return filename

# TFIDF Vectors

In [None]:
def getTIDFVectorsAsArray(tfidf_vectorizer_vectors):
    """Convert the given matrix to a dense array through its ``toarray()`` method."""
    denseVectors = tfidf_vectorizer_vectors.toarray()
    return denseVectors

In [None]:
def getTDIDFVector(vectors, index):
    """Return row ``index`` of the dense array built from ``vectors``."""
    denseVectors = getTIDFVectorsAsArray(vectors)
    return denseVectors[index]

In [None]:
def mapTFIDFLabelsValues(vectorizer, vectors, index):
    """Map every feature name of ``vectorizer`` to its value in row ``index``.

    The values come from ``getTDIDFVector(vectors, index)``; names and values
    are paired positionally.
    """
    # Newer scikit-learn releases only offer get_feature_names_out(); older
    # ones only get_feature_names(). Use whichever the vectorizer provides.
    if hasattr(vectorizer, "get_feature_names_out"):
        featureNames = vectorizer.get_feature_names_out()
    else:
        featureNames = vectorizer.get_feature_names()
    return dict(zip(featureNames, getTDIDFVector(vectors, index)))

In [None]:
def getTDIDFMatrix(tfidf_vectorizer_vectors, i):
    """Return row ``i`` of the matrix, transposed and converted with ``todense()``."""
    row = tfidf_vectorizer_vectors[i]
    return row.T.todense()

# Util Functions

In [None]:
def getDFWithSearchTerms(df, searchTerms):
    """Return the rows of ``df`` whose index label is one of ``searchTerms``.

    The filter uses the ``searchTerms`` argument itself, so the function does
    not depend on any module-level variable being defined.
    """
    return df.loc[df.index.isin(searchTerms)]

In [None]:
def filterDictionary(dictionary, filterTerm):
    """Return the entries of ``dictionary`` whose key is contained in ``filterTerm``.

    The entries keep the order they have in ``dictionary``.
    """
    kept = {}
    for key, value in dictionary.items():
        if key in filterTerm:
            kept[key] = value
    return kept

# Main Program

Initialise the lemmatizer and stemmers. Also, load the stopwords to be used.

In [None]:
# NOTE(review): `stopwords` is not imported anywhere in the visible part of
# this notebook (presumably nltk.corpus.stopwords — confirm). `stopWords` is
# also not referenced again below; the vectorizer is configured with
# stop_words='english' instead.
stopWords = set(stopwords.words('english'))

## Corpus Reading & Processing

In [None]:
# Collect the PDF paths: lower-case extension matches first, then upper-case.
pdfFiles = (
    glob.glob("../input/policyguidelines/*.pdf")
    + glob.glob("../input/policyguidelines/*.PDF")
)

Sorting based on file name. 

In [None]:
# Put the paths into ascending string order and show them.
pdfFiles.sort()
print(pdfFiles)

Create the corpus by converting PDFs into plaintext

In [None]:
# Build {<file name without ".pdf">: <extracted text>}. The text is whatever
# the external `pdftotext -layout` command writes to standard output.
corpus = {}
for pdfPath in pdfFiles:
    command = ['pdftotext', '-layout', pdfPath, '-']
    output = subprocess.check_output(command).decode()
    pdfName = pdfPath.split("/")[-1]  # last path component
    corpus[pdfFiletypeTrimmer(pdfName)] = output

Lower-case the text, replace non-word characters (separators) with spaces, and collapse runs of whitespace.

In [None]:
# Normalise every document in three chained steps. Each step must work on the
# result of the previous one; applying them all to the untouched `value`
# would keep only the effect of the last substitution.
for key, value in corpus.items():
    text = value.lower()
    text = re.sub(r'\W', ' ', text)  # non-word characters become spaces
    corpus[key] = re.sub(r'\s+', ' ', text)  # collapse whitespace runs

## Reading our search terms

Import and process our SDG-related terms.

In [None]:
searchCategoriesDic = {}  # category code -> category name
searchDic = {}            # category code -> list of cleaned search terms

# Every row has the form: <category code> & <category name> & <term>, <term>, ...
with open(keywordsFile, encoding='utf8') as termsFile:
    for rawLine in termsFile:
        fields = rawLine.split("&")
        if len(fields) < 3:
            # Blank or incomplete row (for example an empty line at the end of
            # the file): there is nothing to extract, so skip it.
            continue
        categoryCode = fields[0].replace(" ", "")
        name = fields[1]
        rawTerms = fields[2].split(",")
        searchCategoriesDic[categoryCode] = name
        searchDic[categoryCode] = termsSeperator(rawTerms)
        

In [None]:
# Add the words of each category name to that category's search terms.
for key in searchDic:
    # Strings are immutable, so the cleaned name has to be stored back.
    # Whitespace runs are collapsed to one space and the padding around the
    # name is removed before the name is split into its parts.
    searchCategoriesDic[key] = " ".join(searchCategoriesDic[key].split())
    searchDic[key] = searchDic[key] + searchCategoriesDic[key].split(", ")
    print(searchDic[key])

## Constructing our TF-IDF Matrix

Init the vectorizer and vectors, the former contains all the labels and the other the values.

In [None]:
# Unigrams and bigrams, tokenised and stemmed by termsTokenizer.
# min_df=1 keeps every term that occurs in at least one document, which is
# what an integer 0 would mean as well; recent scikit-learn releases reject
# an integer min_df below 1 during parameter validation.
tfidf_vectorizer = TfidfVectorizer(
    use_idf=True,
    analyzer='word',
    ngram_range=(1, 2),
    min_df=1,
    stop_words='english',
    tokenizer=termsTokenizer,
)
# One row per document, in the iteration order of `corpus`.
tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(corpus.values())

Mapping labels to values and storing them in our `tfidfDic` dictionary. The structure is: `{<corpusFileName> : {<searchCategoryCode> : {<stemmedSearchTerm> : <tfidfValue>}}}`

Note: The `searchTerms` are stemmed. If `filterDictionary(tfidfValues, termsStemming(terms))` returns an empty dictionary, none of that category's stemmed terms occur in the vectorizer's vocabulary.

In [None]:
# The feature names, the dense matrix and the stemmed search terms do not
# depend on the document, so they are computed once instead of per document.
# Newer scikit-learn releases only offer get_feature_names_out(); older ones
# only get_feature_names().
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    featureNames = tfidf_vectorizer.get_feature_names_out()
else:
    featureNames = tfidf_vectorizer.get_feature_names()
denseVectors = tfidf_vectorizer_vectors.toarray()
stemmedSearchDic = {code: termsStemming(terms) for code, terms in searchDic.items()}

# tfidfDic[<document>][<category code>] = {<stemmed term>: <TF-IDF value>}
tfidfDic = {}
for i, corpusName in enumerate(corpus):
    # Row i belongs to the i-th document because the vectorizer was fitted on
    # corpus.values(), which iterates in the same order as the keys.
    tfidfValues = dict(zip(featureNames, denseVectors[i]))
    tempDic = {}
    print(corpusName)
    for code, stemmedTerms in stemmedSearchDic.items():
        tempDic[code] = filterDictionary(tfidfValues, stemmedTerms)
        print(tempDic[code])
    tfidfDic[corpusName] = tempDic
    

## Results gathering

Produces the results for each keyword individually.

In [None]:
# One single-column frame per document (rows: stemmed terms, column: the
# document name), later placed side by side.
dfsPerFile = []
for fileName, searchTermsDic in tfidfDic.items():
    dfsSearchCategory = [
        pd.DataFrame.from_dict(searchCategory, columns=[fileName], orient='index')
        for searchCategory in searchTermsDic.values()
    ]
    if dfsSearchCategory:  # pd.concat raises on an empty list
        categoryDF = pd.concat(dfsSearchCategory)
        dfsPerFile.append(categoryDF)

# A single document is a valid result as well, so resultsDF is always defined;
# it is an empty frame when there was nothing to combine.
resultsDF = pd.concat(dfsPerFile, axis=1) if dfsPerFile else pd.DataFrame()

Results aggregated per sum per category and average per category

In [None]:
categorySumResults = {}
categoryAverageResults = {}
fileSumDFArray = []
fileAverageDFArray = []

for fileName, searchTermsDic in tfidfDic.items():
    # Fresh dictionaries per document, so that a category missing from one
    # document cannot inherit the value computed for the previous document.
    categorySumResults = {}
    categoryAverageResults = {}
    for categoryCode, searchCategory in searchTermsDic.items():
        currentSum = sum(searchCategory.values())
        # A category without any matching term has an average of 0.
        average = currentSum / len(searchCategory) if searchCategory else 0
        categorySumResults[categoryCode] = currentSum
        categoryAverageResults[categoryCode] = average
    fileSumDFArray.append(pd.DataFrame.from_dict(categorySumResults, columns=[fileName], orient='index'))
    fileAverageDFArray.append(pd.DataFrame.from_dict(categoryAverageResults, columns=[fileName], orient='index'))

# Rows: category codes, columns: documents. Both frames are always defined,
# also for a single document; they are empty when there were no documents.
sumDF = pd.concat(fileSumDFArray, axis=1) if fileSumDFArray else pd.DataFrame()
averageDF = pd.concat(fileAverageDFArray, axis=1) if fileAverageDFArray else pd.DataFrame()

**Normalisation** of the sum and average values by a factor of 100:

In [None]:
# Scale copies, not the source frames: sumDF and averageDF keep their raw
# values, and running this cell again does not multiply by 100 a second time.
# Both result names are always bound, even when the inputs are empty.
normalisedSumDF = sumDF.copy()
if not normalisedSumDF.empty:
    normalisedSumDF[normalisedSumDF.select_dtypes(include=['number']).columns] *= 100

normalisedAverageDF = averageDF.copy()
if not normalisedAverageDF.empty:
    normalisedAverageDF[normalisedAverageDF.select_dtypes(include=['number']).columns] *= 100

## Export Results

### Export to CSV

Create the csv with the sum value.

In [None]:
# Write the scaled sums to filepathCSVSumNorm, including the row index and
# the header row. Nothing is written when the frame is empty.
if not normalisedSumDF.empty:
    normalisedSumDF.to_csv(filepathCSVSumNorm, index=True, header=True)

Create the csv with the average value.

In [None]:
# Write the scaled averages to filepathCSVAverageNorm, including the row
# index and the header row. Nothing is written when the frame is empty.
if not normalisedAverageDF.empty:
    normalisedAverageDF.to_csv(filepathCSVAverageNorm, index=True, header=True)

### Export to Heatmap

Create the heatmap with the sum value.

In [None]:
# Draw the annotated heatmap of the scaled sums and save it as an image.
# An empty frame is skipped, mirroring the guard used for the CSV export.
# NOTE(review): `plt` and `sns` are not imported in the visible part of this
# notebook (presumably matplotlib.pyplot and seaborn — confirm).
if not normalisedSumDF.empty:
    plt.figure(figsize=(15, 8))

    sns.heatmap(normalisedSumDF, annot=True, cmap="YlGnBu")
    plt.savefig(filepathHMSumNorm, dpi=300)

Create the heatmap with the average value.

In [None]:
# Draw the annotated heatmap of the scaled averages and save it as an image.
# An empty frame is skipped, mirroring the guard used for the CSV export.
# NOTE(review): `plt` and `sns` are not imported in the visible part of this
# notebook (presumably matplotlib.pyplot and seaborn — confirm).
if not normalisedAverageDF.empty:
    plt.figure(figsize=(15, 8))

    sns.heatmap(normalisedAverageDF, annot=True, cmap="YlGnBu")
    plt.savefig(filepathHMAverageNorm, dpi=300)