# Loading the cleaned data

In [3]:
import os
from pathlib import Path
import pandas as pd

In [4]:
DirPpath = Path(os.path.abspath('')).parent # Fetching the current directory path - Specific for ipynb file - For .py: Path(os.path.dirname(os.path.realpath(__file__)).replace("\\", "/"))

PledgesCsvPath = str(DirPpath.absolute()) + "/CleanedData.csv"  

PledgesDf = pd.read_csv(PledgesCsvPath, index_col=0)

# Preprocessing the texts

In [6]:
import string
import re

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

In [7]:
def first_clean(text):
    return " ".join(text.split()).replace("_x000D_","")

first_clean(PledgesDf.iloc[3,1]) # Delete the returns from excel

'Achieve sustainable and flexible solutions for multimodal transport and develop policies to protect natural heritage and biodiversity, respecting the socio-cultural authenticity of host communities.  CNA Turismo e Commercio, over the three-year period (between autumn 2022 and throughout 2023), will organize training seminars for businesses - with the involvement of public and private stakeholders - aimed at the implementation of concrete solutions for the development of good practices for a supply of multimodal transport and protocols for the respect and protection of the natural heritage and biodiversity.'

In [8]:
def preprocess(text):
    text = text.lower() # Lowercase all the characters from the string
    text = text.strip() # Remove the leading and trailing whitespaces
    text = re.compile('<.*?>').sub('', text)
    text = re.compile('[%s]' % re.escape(string.punctuation)).sub(' ', text) # Removing Punctuation
    text = re.sub(r'\[[0-9]*\]', ' ', text)
    text = re.sub(r'[^\w\s]', '', str(text)) # Remove non alphanumeric characters
    text = re.sub(r'\d', '', text) # Removing digits
    text = re.sub(r'\s+', ' ', str(text).strip()) # Replacing "double, triple, etc" whitespaces by one
    return text

preprocess(first_clean(PledgesDf.iloc[3,1])) # Testing on pledge 4

'achieve sustainable and flexible solutions for multimodal transport and develop policies to protect natural heritage and biodiversity respecting the socio cultural authenticity of host communities cna turismo e commercio over the three year period between autumn and throughout will organize training seminars for businesses with the involvement of public and private stakeholders aimed at the implementation of concrete solutions for the development of good practices for a supply of multimodal transport and protocols for the respect and protection of the natural heritage and biodiversity'

In [10]:
def stopword(string):
    a = [i for i in string.split() if i not in stopwords.words('english')]
    return ' '.join(a)

stopword(preprocess(first_clean(PledgesDf.iloc[3,1])))

'achieve sustainable flexible solutions multimodal transport develop policies protect natural heritage biodiversity respecting socio cultural authenticity host communities cna turismo e commercio three year period autumn throughout organize training seminars businesses involvement public private stakeholders aimed implementation concrete solutions development good practices supply multimodal transport protocols respect protection natural heritage biodiversity'

In [11]:
# LEMMATIZATION
# Initialize the lemmatizer
wl = WordNetLemmatizer()


# This is a helper function to map NTLK position tags
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

# Tokenize the sentence
def lemmatizer(string):
    word_pos_tags = nltk.pos_tag(nltk.word_tokenize(string))  # Get position tags
    a = [wl.lemmatize(tag[0], get_wordnet_pos(tag[1])) for idx, tag in
         enumerate(word_pos_tags)]  # Map the position tag and lemmatize the word/token
    return " ".join(a)

lemmatizer(stopword(preprocess(first_clean(PledgesDf.iloc[3,1]))))

'achieve sustainable flexible solution multimodal transport develop policy protect natural heritage biodiversity respect socio cultural authenticity host community cna turismo e commercio three year period autumn throughout organize training seminar business involvement public private stakeholder aim implementation concrete solution development good practice supply multimodal transport protocol respect protection natural heritage biodiversity'

In [12]:
def preprocessing(text):

    global n
    n = n +1
    
    print("**************")
    print("n is : ")
    print(n)
    print("length of the text is : ")
    print(len(first_clean(text)))
    return lemmatizer(stopword(preprocess(first_clean(text))))

In [None]:
n=0
print("begin pre-processing")
PledgesDf['clean_text'] = PledgesDf['Pledge'].apply(lambda x: preprocessing(x))
print("end pre-processing")

# Analysis of pre-processed data

In [None]:
pd.Series(" ".join(PledgesDf['clean_text']).split()).value_counts()[:30].index.tolist() # Most frequent words

In [None]:
StopWords2 = pd.Series(" ".join(PledgesDf['clean_text']).split()).value_counts()[:30].index.tolist()

def RemoveFrequentWords(string, FrequentWords):
    a = [i for i in string.split() if i not in FrequentWords] # Removing usual english stopwords from the string
    return ' '.join(a) #Output - Same string after all the transformations

PledgesDf['clean_text'] = PledgesDf['clean_text'].apply(lambda x: RemoveFrequentWords(x, StopWords2))

print(PledgesDf.head())

# Tokenization

In [17]:
tokens = [nltk.word_tokenize(i) for i in PledgesDf["clean_text"]]

In [18]:
word_freq = {}
for sent in tokens:
    for i in sent:

        if i not in word_freq.keys():
            word_freq[i] = 1
        else:
            word_freq[i] += 1
len(word_freq)

3237

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidfvectorizer = TfidfVectorizer(analyzer='word',stop_words= 'english')

tfidf_wm = tfidfvectorizer.fit_transform(PledgesDf["clean_text"])

tfidf_tokens = tfidfvectorizer.get_feature_names_out()
df_tfidfvect = pd.DataFrame(data = tfidf_wm.toarray(),columns = tfidf_tokens)

print("\nTD-IDF Vectorizer\n")
print(df_tfidfvect)

# Word2Vec

In [25]:
import numpy as np
import gensim.downloader as api

## Pre-trained

In [29]:
wv2 = api.load('word2vec-google-news-300')

### Mean Embedding

In [None]:
#building Word2Vec model with Average method
class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = len(next(iter(word2vec.values())))
    def fit(self, X, y):
            return self

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

w2v2 = dict(zip(wv2.index_to_key, wv2.vectors))
modelw2 = MeanEmbeddingVectorizer(w2v2)

# converting text to numerical data using Word2Vec
vectors_w2v2 = modelw2.transform(tokens)

print(vectors_w2v2)

In [None]:
DocIndexV1 = pd.DataFrame(vectors_w2v2)
IndexedPath = str(DirPpath.absolute()) + "\IndexedDataV1.csv"
DocIndexV1.to_csv(IndexedPath)

### Tf-Idf Embedding

In [32]:
#building Word2Vec model with TF-IDF method
class TfIdfEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = len(next(iter(word2vec.values())))
    def fit(self, X, y):
            return self

    def transform(self, X, tfidf):

        DocList = []
        i = 0
        
        for words in X:

            WordList = []

            for w in words:
                 
                try:
                    if w in self.word2vec:
                        weight = tfidf[w].iloc[i]
                        WordList.append(self.word2vec[w] * weight)
                    else:
                        WordList.append(np.zeros(self.dim))
                except:
                    WordList.append(np.zeros(self.dim))

            i+=1
            DocList.append(np.sum(np.array(WordList), axis = 0))
        
        return np.array(DocList)


w2v2 = dict(zip(wv2.index_to_key, wv2.vectors))
modelw2 = TfIdfEmbeddingVectorizer(w2v2)

# converting text to numerical data using Word2Vec
vectors_w2v2 = modelw2.transform(tokens, df_tfidfvect)

print(vectors_w2v2)

In [36]:
DocIndexV1 = pd.DataFrame(vectors_w2v2)
IndexedPath = str(DirPpath.absolute()) + "\IndexedDataV1Tf.csv"
DocIndexV1.to_csv(IndexedPath)


## Fine-Tuning Google 300 news Vectors

In [75]:
import gensim
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

# Setting the model with its hyperparameters
#model = Word2Vec(sentences = tokens, vector_size=300, min_count = 1)
model2 = Word2Vec(sentences = tokens, vector_size=300, min_count = 1)

In [62]:
model = KeyedVectors.load_word2vec_format(r"C:\Users\ecaudron001\Downloads\GoogleNews-vectors-negative300.bin",
                                         binary = True)

In [72]:
import numpy as np

In [78]:
model2.build_vocab(tokens)
total_examples = model2.corpus_count
model2.build_vocab([list(model2.wv.key_to_index.keys())], update=True)
model2.wv.intersect_word2vec_format(r"C:\Users\ecaudron001\Downloads\GoogleNews-vectors-negative300.bin", binary=True)
model2.train(tokens, total_examples=total_examples, epochs=model.iter)

IndexError: index 923 is out of bounds for axis 0 with size 1

In [None]:
model2.wv.