<a href="https://colab.research.google.com/github/RiccardoRubini93/ML-AI-cookbook/blob/main/similarity_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install openpyxl
!pip install spacy
!pip install nltk

In [2]:
import pandas as pd
import spacy
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from google.colab import files

In [4]:
# Both inputs are semicolon-delimited CSV files read from the working directory:
# first the business glossary, then the interface field list.
business_glossary_df, interfacce_df = (
    pd.read_csv(csv_name, sep=';')
    for csv_name in ('BUSINESS_GLOSSARY.csv', 'Interfacce_normalized.csv')
)

In [9]:
# word_tokenize relies on the Punkt sentence models. Older NLTK releases ship them
# as 'punkt', recent ones look for 'punkt_tab'; since NLTK is installed unpinned,
# request both. nltk.download() reports an unknown package id without raising, so
# the extra call is harmless on older versions.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Italian stop words, stored as a set for O(1) membership tests during filtering.
stop_words = set(stopwords.words('italian'))

def tokenize_and_remove_stopwords(text, language='italian'):
    """Normalise one cell value into a space-separated string of keywords.

    The text is split with NLTK's ``word_tokenize``; each token is lower-cased and
    kept only if it is purely alphanumeric and not in the module-level
    ``stop_words`` set.

    Args:
        text: Cell value to normalise. Missing values (``None``/NaN, as produced by
            ``pd.read_csv`` for empty cells) yield ``''``; any other non-string
            scalar is converted with ``str`` before tokenizing.
        language: Name of the Punkt model ``word_tokenize`` uses for sentence
            splitting. Defaults to ``'italian'`` to match the Italian stop-word list.

    Returns:
        str: The surviving tokens joined by single spaces, or ``''`` when none survive.
    """
    if not isinstance(text, str):
        # word_tokenize raises on non-string input, so handle missing cells here.
        if pd.isna(text):
            return ''
        text = str(text)

    tokens = word_tokenize(text, language=language)
    # NOTE(review): str.isalnum() is False for tokens containing '_' or '-', so an
    # identifier such as 'COD_CLIENTE' would be dropped entirely. Confirm the input
    # columns are already split on those characters.
    kept = []
    for token in tokens:
        lowered = token.lower()
        if token.isalnum() and lowered not in stop_words:
            kept.append(lowered)
    return ' '.join(kept)

# Overwrite both matching columns with their normalised form (lower-cased keywords,
# stop words removed) so the two sides share one vocabulary.
business_glossary_df['BUSINESS TERM'] = [
    tokenize_and_remove_stopwords(raw_term)
    for raw_term in business_glossary_df['BUSINESS TERM']
]
interfacce_df['CAMPO FISICO'] = [
    tokenize_and_remove_stopwords(raw_field)
    for raw_field in interfacce_df['CAMPO FISICO']
]
# The DESCRIZIONE column is currently left as-is (normalisation disabled):
# interfacce_df['DESCRIZIONE'] = interfacce_df['DESCRIZIONE'].apply(tokenize_and_remove_stopwords)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
# Learn the vocabulary and IDF weights from the glossary terms only, then project
# both the glossary and the interface fields into that same TF-IDF space.
tfidf_vectorizer = TfidfVectorizer().fit(business_glossary_df['BUSINESS TERM'])
tfidf_matrix_business = tfidf_vectorizer.transform(business_glossary_df['BUSINESS TERM'])
tfidf_matrix_interfacce = tfidf_vectorizer.transform(interfacce_df['CAMPO FISICO'])

In [17]:
# Pairwise cosine similarity: one row per interface field, one column per glossary term.
cosine_similarities = cosine_similarity(
    X=tfidf_matrix_interfacce,
    Y=tfidf_matrix_business,
)

In [38]:
# Sanity check: (number of interface fields, number of glossary terms).
cosine_similarities.shape

(4455, 2779)

In [None]:
# Pair every interface field with its most similar glossary term.
# Row i of cosine_similarities belongs to the i-th row of interfacce_df; column j
# belongs to the j-th row of business_glossary_df.
best_term_idx = cosine_similarities.argmax(axis=1)
best_term_score = cosine_similarities.max(axis=1)

results = []
for campo_fisico, term_idx, score in zip(
    interfacce_df['CAMPO FISICO'], best_term_idx, best_term_score
):
    # argmax returns 0 for an all-zero row, which would wrongly pair a field that
    # shares no vocabulary with the first glossary term. Record '' instead so the
    # empty-string filter applied later discards it as "not recognised".
    if score > 0:
        business_term = business_glossary_df['BUSINESS TERM'].iloc[term_idx]
    else:
        business_term = ''
    results.append((campo_fisico, business_term))

In [None]:
# Tabulate the (field, best glossary term) pairs and preview the first rows.
results_df = pd.DataFrame(
    data=results,
    columns=['CAMPO FISICO', 'BUSINESS TERM'],
)
results_df.head()

In [28]:
# Score definition: number of recognised terms / total.
# This cell shows the total, i.e. the row count of results_df.
len(results_df)

4455

In [33]:
# `df` is a second name for results_df (plain assignment, no copy is made).
df = results_df

In [41]:
# Keep only the pairs where neither side is an empty string, i.e. drop a row as
# soon as the field OR the matched term is ''.
df_filtered = df[~((df['CAMPO FISICO'] == '') | (df['BUSINESS TERM'] == ''))]

In [42]:
# Persist the filtered matches to result.csv in the working directory.
# With to_csv's default index=True, the row index is written as an unnamed first column.
df_filtered.to_csv('result.csv')