# Features de TextMining:

---------------------------------

### Importando depend√™ncias

In [59]:
import os
import sys
import re
import datetime
import dateutil

sys.path.insert(0,os.path.dirname(os.getcwd()))
sys.path.insert(0,os.path.join(os.getcwd(),'grobid'))
sys.path.insert(0,os.getcwd())

import numpy as np
import pandas as pd

from grobid import grobid_client
import grobid_tei_xml
from grobid_to_dataframe import grobid_cli, xmltei_to_dataframe

import plotly

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

!pip install nltk --upgrade

!pip install gensim --upgrade

!pip install spacy --upgrade

!pip install stanford-corenlp --upgrade

!pip install corenlp --upgrade

!pip install textblob --upgrade

!pip install transformers --upgrade

In [3]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

---------------------------------

### Comandos Docker

docker run -t --rm --init -p 8080:8070 -p 8081:8071 --memory="9g" lfoppiano/grobid:0.7.0

docker run -t --rm --init -p 8080:8070 -p 8081:8071 lfoppiano/grobid:0.6.2

pip install grobid-tei-xml

### Definindo vari√°veis e caminhos

In [4]:
path = os.path.dirname(os.getcwd())
path_input = os.path.join(path,'artifacts','articles','ml_material','teste')

---------------------------------

### Fun√ß√µes para execu√ß√£o em batch

In [5]:
def get_path(path_input_path):
    """"""
    if os.path.exists(path_input_path):
        return path_input_path
    
    return os.getcwd()


def batch_process_path(path_input_path, n_workers=2,
                       check_cache=True,
                       cache_folder_name='summarticles_cache',
                       config_path="./grobid/config.json"):
    
    """"""
    
    gcli = grobid_cli(config_path=config_path)
    result_batch = gcli.process_pdfs(input_path=path_input_path,
                                     check_cache=check_cache,
                                     cache_folder_name=cache_folder_name,
                                     n_workers=n_workers,
                                     service="processFulltextDocument",
                                     generateIDs=True,
                                     include_raw_citations=True,
                                     include_raw_affiliations=True,
                                     consolidate_header=False,
                                     consolidate_citations=False,
                                     tei_coordinates=False,
                                     segment_sentences=True,
                                     verbose=True)
    return result_batch


def get_dataframes(result_batch):
    
    """"""
    
    xml_to_df = xmltei_to_dataframe()
    dict_dfs, dic_errors = xml_to_df.get_dataframe_articles(result_batch)
    
    return dict_dfs, dic_errors


def files_path(path):
    list_dir = os.listdir(path)
    files = []
    for file in list_dir:
        if os.path.isfile(os.path.join(path,file)):
            files.append(os.path.join(path,file))
    return files

In [6]:
def run_batch_process(path_input, n_workers=6, check_cache=True, 
                      cache_folder_name='summarticles_cache', 
                      config_path="./grobid/config.json"):

    dict_exec = {'path':path_input}
    dict_exec['start_datetime'] = datetime.datetime.now()
    
    # path_input = os.path.join(path,'artifacts','test_article')
    config_path = os.path.join(os.getcwd(),'grobid','config.json')
    dict_exec['grobid_config'] = config_path
    
    gcli = grobid_client.GrobidClient(config_path=config_path, check_server=False)
    
    dict_exec['files'] = gcli.get_input_files(path_input)
    dict_exec['num_files'] = len(dict_exec['files'])
    dict_exec['n_workers'] = n_workers
    
    path_input_path = get_path(path_input)
    result_batch = batch_process_path(path_input_path, n_workers=dict_exec['n_workers'], check_cache=check_cache)
    dict_dfs, dic_errors = get_dataframes(result_batch)
    
    gcli.save_xmltei_files(result_batch, input_folder_path, cache_folder_name=cache_folder_name)
    
    dict_exec['end_datetime'] = datetime.datetime.now()
    dict_exec['time_exec_sec'] = (dict_exec['end_datetime']-dict_exec['start_datetime']).seconds
    dict_exec['time_exec_min'] = (dict_exec['end_datetime']-dict_exec['start_datetime']).seconds
    
    return dict_dfs, dict_exec, dic_errors


def tokenize_data(df_colum):
    """"""
        

In [7]:
input_folder_path = r"""C:\Users\vierb\OneDrive\√Årea de Trabalho\Projetos\PGC\artifacts\articles\ml_material"""

In [8]:
%%time
dict_dfs, dict_exec, dic_errors = run_batch_process(path_input=input_folder_path, 
                                                    n_workers=10, 
                                                    check_cache=True, 
                                                    cache_folder_name='summarticles_cache', 
                                                    config_path="./grobid/config.json")

GROBID server is up and running
587 files to process in current batch
[Input Files] 587
[Cache Files] 587
In the end, we have: 0  new files to process!
And we have : 587  files to back from cache!
Processed articles: 581
Number articles with errors: 6
Wall time: 26.1 s


In [9]:
print(dict_exec.keys())

dict_keys(['path', 'start_datetime', 'grobid_config', 'files', 'num_files', 'n_workers', 'end_datetime', 'time_exec_sec', 'time_exec_min'])


In [10]:
dic_errors

{'number_article_error': 6,
 'list_article_error': [{'file': 'C:\\Users\\vierb\\OneDrive\\√Årea de Trabalho\\Projetos\\PGC\\artifacts\\articles\\ml_material\\S1006-706X(14)60038-8.pdf',
   'error': ValueError,
   'error_text': 'If using all scalar values, you must pass an index',
   'keys_dict': dict_keys(['grobid_version', 'grobid_timestamp', 'header', 'pdf_md5', 'language_code', 'citations', 'body'])},
  {'file': 'C:\\Users\\vierb\\OneDrive\\√Årea de Trabalho\\Projetos\\PGC\\artifacts\\articles\\ml_material\\advs.201903667.pdf',
   'error': ValueError,
   'error_text': 'If using all scalar values, you must pass an index',
   'keys_dict': dict_keys(['grobid_version', 'grobid_timestamp', 'header', 'pdf_md5', 'language_code', 'citations', 'body'])},
  {'file': 'C:\\Users\\vierb\\OneDrive\\√Årea de Trabalho\\Projetos\\PGC\\artifacts\\articles\\ml_material\\j.commatsci.2020.109782.pdf',
   'error': xml.etree.ElementTree.ParseError,
   'error_text': 'syntax error: line 1, column 0',
   'ke

In [11]:
dict_dfs['df_doc_info'].head(3).T

pdf_md5,6332D3DE6B6FF1AC37060940E442E6BF,5327DC684E75934490CDECDAFC576817,F94E4AADF7F858DE005046B0D006E830
grobid_version,0.7.0,0.7.0,0.7.0
grobid_timestamp,2022-05-15 02:08:00,2022-05-15 02:08:00,2022-05-15 02:08:00
language_code,en,en,en
acknowledgement,Acknowledgements Dr. Wilson,Acknowledgements The authors acknowledge helpf...,Acknowledgements The authors acknowledge the s...
abstract,An artificial neural network (ANN) model is de...,Bayesian optimization (BO) has emerged as the ...,"In this paper, a novel lattice constant predic..."
body,Introduction Beta transus √∞b tr √û temperature ...,Introduction Automated high-throughput experim...,Introduction In the study of crystalline mater...
annex,,,
file,C:\Users\vierb\OneDrive\√Årea de Trabalho\Proje...,C:\Users\vierb\OneDrive\√Årea de Trabalho\Proje...,C:\Users\vierb\OneDrive\√Årea de Trabalho\Proje...
status,status 200,status 200,status 200
raw_data,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<TEI x...","<?xml version=""1.0"" encoding=""UTF-8""?>\n<TEI x...","<?xml version=""1.0"" encoding=""UTF-8""?>\n<TEI x..."


In [12]:
dict_dfs['df_doc_head'].head(3).T

pdf_md5,6332D3DE6B6FF1AC37060940E442E6BF,5327DC684E75934490CDECDAFC576817,F94E4AADF7F858DE005046B0D006E830
index_head,,,
id_head,,,
unstructured_head,,,
date_head,,,
title_head,Modelling beta transus temperature of titanium...,Extrapolative Bayesian Optimization with Gauss...,Lattice constant prediction of orthorhombic AB...
book_title_head,,,
series_title_head,,,
journal_head,,,
journal_abbrev_head,,,
publisher_head,,,


In [13]:
dict_dfs['df_doc_authors'].head(3).T

pdf_md5,6332D3DE6B6FF1AC37060940E442E6BF,6332D3DE6B6FF1AC37060940E442E6BF.1,6332D3DE6B6FF1AC37060940E442E6BF.2
full_name_author,Z Guo,S Malinov,W Sha
given_name_author,Z,S,W
middle_name_author,,,
surname_author,Guo,Malinov,Sha
email_author,,,w.sha@qub.ac.uk
orcid_author,,,
institution_author,The Queen's University of Belfast,The Queen's University of Belfast,The Queen's University of Belfast
department_author,School of Civil Engineering,School of Civil Engineering,School of Civil Engineering
laboratory_author,Metals Research Group,Metals Research Group,Metals Research Group
addr_line_author,,,


In [14]:
dict_dfs['df_doc_citations'].head(5)

Unnamed: 0_level_0,index_citation,id_citation,unstructured_citation,date_citation,title_citation,book_title_citation,series_title_citation,journal_citation,journal_abbrev_citation,publisher_citation,...,first_page_citation,last_page_citation,note_citation,doi_citation,pmid_citation,pmcid_citation,arxiv_id_citation,ark_citation,istex_id_citation,url_citation
pdf_md5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6332D3DE6B6FF1AC37060940E442E6BF,0,b0,"J.C. Williams, E.A. Starke, The role of thermo...",1984,The role of thermomechanical processing in tai...,"Deformation, Processing, and Structure",,,,American Society for Metals,...,,,,,,,,,,
6332D3DE6B6FF1AC37060940E442E6BF,1,b1,"V.G. Krishna, Y.V.R.K. Prasad, N.C. Birla, G.S...",1997,Processing map for the hot working of near-a t...,,,Journal of Materials Processing Technology,,,...,377.0,383.0,,,,,,,,
6332D3DE6B6FF1AC37060940E442E6BF,2,b2,"C.F. Yolton, F.H. Froes, R.F. Malone, Alloying...",1979,Alloying element effects in metastable beta ti...,,,Metallurgical Transactions A,,,...,132.0,134.0,,,,,,,,
6332D3DE6B6FF1AC37060940E442E6BF,3,b3,"P.J. Bania, Beta titanium alloys and their rol...",1994,Beta titanium alloys and their role in the tit...,,,"Journal of the Minerals, Metals and Material S...",,,...,16.0,19.0,,,,,,,,
6332D3DE6B6FF1AC37060940E442E6BF,4,b4,"S. Ankem, G.K. Scarr, I.L. Caplan, J.C. Willia...",1988-09,Multiple regression analysis of the effects of...,Proceedings of 6th World Conference on Titanium,,,,Societe Francaise de Metallurgie,...,265.0,268.0,,,,,,,,


In [15]:
dict_dfs['df_doc_authors_citations'].head(5)

Unnamed: 0_level_0,id,index,full_name_citation,given_name_citation,middle_name_citation,surname_citation,email_citation,orcid_citation,institution_citation,department_citation,laboratory_citation,addr_line_citation,post_code_citation,settlement_citation,country_citation
pdf_md5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
6332D3DE6B6FF1AC37060940E442E6BF,b0,0,J C Williams,J,C,Williams,,,,,,,,,
6332D3DE6B6FF1AC37060940E442E6BF,b0,0,E A Starke,E,A,Starke,,,,,,,,,
6332D3DE6B6FF1AC37060940E442E6BF,b1,1,V G Krishna,V,G,Krishna,,,,,,,,,
6332D3DE6B6FF1AC37060940E442E6BF,b1,1,Y V R K Prasad,Y,V R K,Prasad,,,,,,,,,
6332D3DE6B6FF1AC37060940E442E6BF,b1,1,N C Birla,N,C,Birla,,,,,,,,,


---------------------------------

### Trabalhando no tratamento do texto

!pip install pytorch --upgrade

!pip install tensorflow --upgrade --user

In [16]:
import nltk
import spacy
import corenlp
import textblob
import gensim
#import transformers

In [24]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vierb\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [32]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vierb\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [55]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vierb\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [57]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\vierb\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\omw-1.4.zip.


True

--------------------------------

Estudando a distribui√ß√£o de caracteres especiais:

In [17]:
list_chars = []
for id,row in dict_dfs['df_doc_info'].iterrows():
    for c in row['raw_data']:
        list_chars.append(c)
        
df_counts = pd.DataFrame({'chars':pd.value_counts(list_chars).index.tolist(),'counts':pd.value_counts(list_chars).tolist()})
df_counts = df_counts.sort_values(by='counts',ascending=False)

import plotly.express as px
fig = px.bar(df_counts.head(20), x='chars', y='counts')
fig.show()

In [18]:
df_counts.sort_values(by='counts',ascending=False).tail(30)

Unnamed: 0,chars,counts
741,ùüï,1
740,ùë∞,1
739,ùê®,1
742,–∏,1
749,‚î¨,1
760,‚áë,1
726,‚Ö•,1
750,‚ë•,1
752,·âÄ,1
753,·âÅ,1


--------------------------------

Tratando texto:

In [17]:
def text_tokenize(text, language='english', preserve_line=False):
    return nltk.tokenize.word_tokenize(text, language=language, preserve_line=preserve_line)

def clean_text_regex(words_list, regex="[^a-zA-Z]+", replace='', min_word_len=1):
    """Testado em https://regex101.com/"""
    new_words = []
    for word in words_list:
        word = re.sub(regex, replace, word)
        if len(word) > min_word_len:
            new_words.append(word)
    return new_words

def remove_stopwords(words_list, stopwords_list):
    """"""
    new_words = []
    for word in words_list:
        if word not in stopwords_list:
            new_words.append(word)
    return new_words

def lemmatizer(words_list):
    """"""
    obj_lemmatizer = nltk.stem.WordNetLemmatizer()
    words_lemma = []
    for word in words_list:
        words_lemma.append(obj_lemmatizer.lemmatize(word,pos=nltk.corpus.wordnet.VERB))
    return words_lemma

def stem_text(words_list):
    """"""
    p_stem = nltk.stem.PorterStemmer()
    words_stem = []
    for word in words_list:
        words_stem.append(p_stem.stem(word))
    return words_stem

In [18]:
def text_prep(text, clean_text=True, stopwords_remove=True, exec_lemmatizer=True, exec_stem=False, text_lower=False, stopwords_list=[], language='english',
              preserve_line=False, regex_chars_clean="[^a-zA-Z]+", replace_chars_clean='', min_word_len=1):
    
    """Text preparation."""
    
    text_preparation = text_tokenize(text, language=language, preserve_line=preserve_line)
    if clean_text:
        text_preparation = clean_text_regex(words_list=text_preparation,
                                            regex=regex_chars_clean,
                                            replace=replace_chars_clean,
                                            min_word_len=min_word_len)
    if stopwords_remove:
        text_preparation = remove_stopwords(words_list=text_preparation,
                                            stopwords_list=stopwords_list)
    if exec_lemmatizer:
        text_preparation = lemmatizer(words_list=text_preparation)
    if exec_stem:
        text_preparation = stem_text(words_list=text_preparation)
    text_preparation = ' '.join(text_preparation)
    if text_lower:
        text_preparation = text_preparation.lower()
    return text_preparation

In [19]:
def text_prep_column(colum_df):
    """"""
    f_prep_text = lambda text_data: text_prep(text=text_data, clean_text=True, stopwords_remove=True, exec_lemmatizer=True, exec_stem=False, 
                                            stopwords_list=nltk.corpus.stopwords.words('english'), language='english', preserve_line=False,
                                            regex_chars_clean="[^a-zA-Z]+", replace_chars_clean='', min_word_len=1, text_lower=True)
    colum_df = colum_df.apply(lambda e: e if pd.isna(e) else f_prep_text(e))
    return colum_df

In [20]:
dict_dfs['df_doc_info'].abstract.iat[0]

'An artificial neural network (ANN) model is developed to simulate the non-linear relationship between the beta transus √∞b tr √û temperature of titanium alloys and the alloy chemistry. The input parameters to the model consist of the concentration of nine elements, i.e. Al, Cr, Fe, Mo, Sn, Si, V, Zr and O, whereas the model output is the b tr temperature. Good performance of the ANN model was achieved. The interactions between the alloying elements were estimated based on the obtained ANN model. The results showed good agreement with experimental data. The influence of the database scale on ANN model performance was also discussed. Estimation of b tr temperature through thermodynamic calculation was carried out as a comparison.'

In [21]:
text_prep(text=dict_dfs['df_doc_info'].abstract.iat[0],
          clean_text=True,
          stopwords_remove=True,
          exec_lemmatizer=True,
          exec_stem=False,
          text_lower=True,
          stopwords_list=nltk.corpus.stopwords.words('english'),
          language='english',
          preserve_line=False,
          regex_chars_clean="[^a-zA-Z]+",
          replace_chars_clean='',
          min_word_len=1)

'an artificial neural network ann model develop simulate nonlinear relationship beta transus tr temperature titanium alloy alloy chemistry the input parameters model consist concentration nine elements ie al cr fe mo sn si zr whereas model output tr temperature good performance ann model achieve the interactions alloy elements estimate base obtain ann model the result show good agreement experimental data the influence database scale ann model performance also discuss estimation tr temperature thermodynamic calculation carry comparison'

In [22]:
dict_dfs['df_doc_info'].head(3).T

pdf_md5,6332D3DE6B6FF1AC37060940E442E6BF,5327DC684E75934490CDECDAFC576817,F94E4AADF7F858DE005046B0D006E830
grobid_version,0.7.0,0.7.0,0.7.0
grobid_timestamp,2022-05-15 02:08:00,2022-05-15 02:08:00,2022-05-15 02:08:00
language_code,en,en,en
acknowledgement,Acknowledgements Dr. Wilson,Acknowledgements The authors acknowledge helpf...,Acknowledgements The authors acknowledge the s...
abstract,An artificial neural network (ANN) model is de...,Bayesian optimization (BO) has emerged as the ...,"In this paper, a novel lattice constant predic..."
body,Introduction Beta transus √∞b tr √û temperature ...,Introduction Automated high-throughput experim...,Introduction In the study of crystalline mater...
annex,,,
file,C:\Users\vierb\OneDrive\√Årea de Trabalho\Proje...,C:\Users\vierb\OneDrive\√Årea de Trabalho\Proje...,C:\Users\vierb\OneDrive\√Årea de Trabalho\Proje...
status,status 200,status 200,status 200
raw_data,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<TEI x...","<?xml version=""1.0"" encoding=""UTF-8""?>\n<TEI x...","<?xml version=""1.0"" encoding=""UTF-8""?>\n<TEI x..."


In [24]:
dict_dfs['df_doc_info']['acknowledgement_prep'] = text_prep_column(dict_dfs['df_doc_info']['acknowledgement'])
dict_dfs['df_doc_info']['abstract_prep'] = text_prep_column(dict_dfs['df_doc_info']['abstract'])
dict_dfs['df_doc_info']['body_prep'] = text_prep_column(dict_dfs['df_doc_info']['body'])

In [25]:
dict_dfs['df_doc_info'].head(3).T

pdf_md5,6332D3DE6B6FF1AC37060940E442E6BF,5327DC684E75934490CDECDAFC576817,F94E4AADF7F858DE005046B0D006E830
grobid_version,0.7.0,0.7.0,0.7.0
grobid_timestamp,2022-05-15 02:08:00,2022-05-15 02:08:00,2022-05-15 02:08:00
language_code,en,en,en
acknowledgement,Acknowledgements Dr. Wilson,Acknowledgements The authors acknowledge helpf...,Acknowledgements The authors acknowledge the s...
abstract,An artificial neural network (ANN) model is de...,Bayesian optimization (BO) has emerged as the ...,"In this paper, a novel lattice constant predic..."
body,Introduction Beta transus √∞b tr √û temperature ...,Introduction Automated high-throughput experim...,Introduction In the study of crystalline mater...
annex,,,
file,C:\Users\vierb\OneDrive\√Årea de Trabalho\Proje...,C:\Users\vierb\OneDrive\√Årea de Trabalho\Proje...,C:\Users\vierb\OneDrive\√Årea de Trabalho\Proje...
status,status 200,status 200,status 200
raw_data,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<TEI x...","<?xml version=""1.0"" encoding=""UTF-8""?>\n<TEI x...","<?xml version=""1.0"" encoding=""UTF-8""?>\n<TEI x..."


--------------------------------

Criando BOW e TFIDF:

In [34]:
documents = dict_dfs['df_doc_info']['abstract_prep'].fillna(' ').tolist()

In [44]:
obj_bow = CountVectorizer(encoding="utf-8",
                          stop_words="english",
                          strip_accents="ascii",
                          lowercase=True, 
                          preprocessor=None,
                          tokenizer=None,
                          token_pattern=r"""(?u)\b\w\w+\b""",
                          ngram_range=(1,2), # Unigram and bigram
                          analyzer="word",
                          max_df=1.0,
                          min_df=2, # May have at least 2 frequency
                          max_features=None, 
                          vocabulary= None, 
                          binary=False, 
                          dtype=np.int64)

obj_tfidf = TfidfVectorizer(encoding="utf-8",
                            stop_words="english",
                            strip_accents="ascii",
                            lowercase=True, 
                            preprocessor=None,
                            tokenizer=None,
                            token_pattern=r"""(?u)\b\w\w+\b""",
                            ngram_range=(1,2), # Unigram and bigram
                            analyzer="word",
                            max_df=1.0,
                            min_df=2, # May have at least 2 frequency
                            max_features=None, 
                            vocabulary= None, 
                            binary=False, 
                            dtype=np.float64, 
                            norm='l2', 
                            use_idf=True, 
                            smooth_idf=True, 
                            sublinear_tf=False)


In [50]:
obj_bow = obj_bow.fit(raw_documents=documents)
bow_matrix = obj_bow.transform(documents)

obj_tfidf = obj_tfidf.fit(raw_documents=documents)
tfidf_matrix = obj_tfidf.transform(documents)

type(bow_matrix), type(tfidf_matrix)

(scipy.sparse.csr.csr_matrix, scipy.sparse.csr.csr_matrix)

In [51]:
bow_matrix = bow_matrix.todense()
tfidf_matrix = tfidf_matrix.todense()

df_bow = pd.DataFrame(bow_matrix, columns=obj_bow.get_feature_names())
df_tfidf = pd.DataFrame(tfidf_matrix, columns=obj_tfidf.get_feature_names())

In [54]:
df_bow.head(3)

Unnamed: 0,aa,aa aluminum,aare,ab,ab initio,abaqus,abilities,ability,ability feasible,ability gfa,...,zirconia,zn,zn alloy,zn coat,zncl,zone,zone fz,zr,zr hf,zro
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [55]:
df_tfidf.head(3)

Unnamed: 0,aa,aa aluminum,aare,ab,ab initio,abaqus,abilities,ability,ability feasible,ability gfa,...,zirconia,zn,zn alloy,zn coat,zncl,zone,zone fz,zr,zr hf,zro
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.116469,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [56]:
df_bow.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 581 entries, 0 to 580
Columns: 7503 entries, aa to zro
dtypes: int64(7503)
memory usage: 33.3 MB


In [57]:
df_tfidf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 581 entries, 0 to 580
Columns: 7503 entries, aa to zro
dtypes: float64(7503)
memory usage: 33.3 MB


--------------------------------

Trabalhando com a similaridade de Cosseno:

In [72]:
cos_bow_sim = cosine_similarity(df_bow, df_bow)
cos_tfidf_sim = cosine_similarity(df_tfidf, df_tfidf)

df_cos_bow_sim = pd.DataFrame(cos_bow_sim,
                              columns=dict_dfs['df_doc_info'].index.tolist(),
                              index=dict_dfs['df_doc_info'].index.tolist())

df_cos_tfidf_sim = pd.DataFrame(cos_tfidf_sim,
                                columns=dict_dfs['df_doc_info'].index.tolist(),
                                index=dict_dfs['df_doc_info'].index.tolist())

In [73]:
df_cos_bow_sim.head()

Unnamed: 0,6332D3DE6B6FF1AC37060940E442E6BF,5327DC684E75934490CDECDAFC576817,F94E4AADF7F858DE005046B0D006E830,2F1301D499B3BDB11DFBE8201686EA18,A33351B001C1500CB2BF1227F8AC9D9E,70673FBB0089E04DB5D6CC611A0D8F0D,5EEB701057AB0C125663AC0D254871E9,5B37679352BC406F00BB87D4F9CF28A5,C1B73FDF87CDE6D214185A958012F223,0745B34B603E61EC4149D12BC1B56D17,...,9B2CF9059DC905609B59344759B50BD4,2B0A6DCFA10AEA8ED139D4DE6F42C187,618B4E34BF9BB899076ABE0EFC7A9958,10F53F194680D7F2C761D68F84EE8F05,E30261EE71921D1C94A40713191D2468,13EF01368C4B64913BAC3B6B06D855FF,30FBD2C323795A71AA5B542BD2F995E4,55180AD55B60329BA62EF0C69787B7D0,16084FC6B5C25AF4A5F11957425A9396,02484B814DFB99BFC7284F0A29BAE85A
6332D3DE6B6FF1AC37060940E442E6BF,1.0,0.191937,0.230269,0.161723,0.156277,0.092677,0.173883,0.0,0.114355,0.038244,...,0.305873,0.04018,0.101259,0.410715,0.0,0.0,0.168433,0.0,0.0,0.044319
5327DC684E75934490CDECDAFC576817,0.191937,1.0,0.124747,0.112436,0.150869,0.134125,0.118618,0.0,0.159009,0.176627,...,0.121019,0.107749,0.128624,0.138128,0.007339,0.006833,0.116683,0.0,0.007339,0.171668
F94E4AADF7F858DE005046B0D006E830,0.230269,0.124747,1.0,0.143066,0.163528,0.180702,0.10409,0.0,0.144518,0.028999,...,0.353599,0.082261,0.127294,0.185007,0.0,0.0,0.210731,0.0,0.0,0.061609
2F1301D499B3BDB11DFBE8201686EA18,0.161723,0.112436,0.143066,1.0,0.091547,0.122798,0.147429,0.0,0.086838,0.065344,...,0.102808,0.082381,0.062439,0.148883,0.0,0.0,0.123335,0.0,0.0,0.165867
A33351B001C1500CB2BF1227F8AC9D9E,0.156277,0.150869,0.163528,0.091547,1.0,0.130405,0.079262,0.0,0.069049,0.162368,...,0.178823,0.040941,0.086885,0.099443,0.008366,0.007789,0.141588,0.0,0.008366,0.110386


In [74]:
df_cos_tfidf_sim.head()

Unnamed: 0,6332D3DE6B6FF1AC37060940E442E6BF,5327DC684E75934490CDECDAFC576817,F94E4AADF7F858DE005046B0D006E830,2F1301D499B3BDB11DFBE8201686EA18,A33351B001C1500CB2BF1227F8AC9D9E,70673FBB0089E04DB5D6CC611A0D8F0D,5EEB701057AB0C125663AC0D254871E9,5B37679352BC406F00BB87D4F9CF28A5,C1B73FDF87CDE6D214185A958012F223,0745B34B603E61EC4149D12BC1B56D17,...,9B2CF9059DC905609B59344759B50BD4,2B0A6DCFA10AEA8ED139D4DE6F42C187,618B4E34BF9BB899076ABE0EFC7A9958,10F53F194680D7F2C761D68F84EE8F05,E30261EE71921D1C94A40713191D2468,13EF01368C4B64913BAC3B6B06D855FF,30FBD2C323795A71AA5B542BD2F995E4,55180AD55B60329BA62EF0C69787B7D0,16084FC6B5C25AF4A5F11957425A9396,02484B814DFB99BFC7284F0A29BAE85A
6332D3DE6B6FF1AC37060940E442E6BF,1.0,0.060729,0.059053,0.103321,0.032481,0.028514,0.081574,0.0,0.04358,0.005874,...,0.121544,0.009373,0.016515,0.222639,0.0,0.0,0.081921,0.0,0.0,0.019239
5327DC684E75934490CDECDAFC576817,0.060729,1.0,0.027218,0.037283,0.049344,0.043895,0.049959,0.0,0.064412,0.063724,...,0.024795,0.047578,0.033143,0.041287,0.002157,0.001891,0.036318,0.0,0.002157,0.045884
F94E4AADF7F858DE005046B0D006E830,0.059053,0.027218,1.0,0.046784,0.040517,0.080607,0.026487,0.0,0.052786,0.004541,...,0.131502,0.025427,0.028411,0.048982,0.0,0.0,0.074905,0.0,0.0,0.019175
2F1301D499B3BDB11DFBE8201686EA18,0.103321,0.037283,0.046784,1.0,0.037026,0.042007,0.059164,0.0,0.027523,0.019241,...,0.035141,0.026259,0.013156,0.061338,0.0,0.0,0.03957,0.0,0.0,0.086019
A33351B001C1500CB2BF1227F8AC9D9E,0.032481,0.049344,0.040517,0.037026,1.0,0.067369,0.027093,0.0,0.024666,0.059218,...,0.041285,0.010179,0.020252,0.019537,0.005228,0.004583,0.055891,0.0,0.005228,0.056576
