<a href="https://colab.research.google.com/github/Technical-Debt-Large-Scale/qualification/blob/main/python/analysis/extractionatd.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Configuração

In [None]:
!git clone https://github.com/Technical-Debt-Large-Scale/qualification.git

In [3]:
import os
import pandas as pd

pd.set_option('display.max_colwidth', 500)
file_xls_to_scan = 'qualification/xls/mergepatdsp.xls'

# Leitura dos dados

In [None]:
!pip install --upgrade xlrd

In [4]:
df_my_xls_data = pd.read_excel(file_xls_to_scan)

In [5]:
# Coerce the 'year' column to a numeric dtype and order the papers chronologically.
# .assign() works on a copy, so the frame read from the workbook is left untouched.
df_bib_sort_by_year = (
    df_my_xls_data
    .assign(year=lambda frame: pd.to_numeric(frame['year']))
    .sort_values('year')
)

# Working copy used by the rest of the notebook.
df_data = df_bib_sort_by_year.copy()

In [None]:
df_data.head(3)

# Analisa os conteúdos dos Abstracts

## Análise via NLTK

In [None]:
# Keep only the columns used by the analysis and drop incomplete rows.
# NOTE(review): dropna() without `subset` removes rows missing ANY of these
# columns (e.g. a missing link), not only rows with an empty abstract - confirm
# that this is the intended filter.
columns_of_interest = ['key', 'year', 'list_authors', 'title', 'abstract', 'link']
df_data = df_data[columns_of_interest].dropna()

# Snapshot of every complete paper, reused later for the ATD-only analysis.
df_all_papers = df_data.copy()
df_data.head(3)

In [None]:
# Instala a suite NLTK https://www.nltk.org/
!pip install nltk

In [None]:
import nltk

In [None]:
nltk.download('punkt')

In [None]:
# Join every abstract into one long, space-separated string.
list_abstract = df_data['abstract'].tolist()
all_abstract_in_one = ' '.join(list_abstract)

# Split the joined text into word-level tokens and display them.
tokens_abstract = nltk.word_tokenize(all_abstract_in_one)
tokens_abstract

In [None]:
# Converte todos os tokens de todos os abstracts em minusculo
[item.lower() for item in tokens_abstract]

In [None]:
# Recupera regras de frases em ingles
tokenizer = nltk.data.load('nltk:tokenizers/punkt/english.pickle')
tokenizer.tokenize('Hello.  This is a test.  It works!')

In [None]:
# Collect every sentence (from any abstract) that mentions
# "Architectural Technical Debt", ignoring letter case.
list_atd_aux = []
atd_lower = 'Architectural Technical Debt'.lower()
list_atd = [
    sentence
    for abstract in list_abstract
    for sentence in tokenizer.tokenize(abstract)  # split the abstract into sentences
    if atd_lower in sentence.lower()
]

In [None]:
# Merge every ATD sentence into a single space-separated text.
all_atd_abstract_in_one = ' '.join(list_atd)

# Word-level tokenization of the merged text.
tokens_atd_abstract = nltk.word_tokenize(all_atd_abstract_in_one)

# Lower-cased version of every token found in the ATD sentences.
list_of_tokens_atd_abstract = list(map(str.lower, tokens_atd_abstract))

In [None]:
# NOTE(review): this cell repeats the previous cell statement for statement;
# it recomputes the same three variables and can probably be deleted.

# Merge every sentence that mentions ATD into a single text
all_atd_abstract_in_one = ' '.join(list_atd)

# Tokenization
tokens_atd_abstract = nltk.word_tokenize(all_atd_abstract_in_one)

# Lower-cased list of every token from the ATD sentences
list_of_tokens_atd_abstract = [item.lower() for item in tokens_atd_abstract]

In [None]:
# Count how often each lower-cased token occurs in the ATD sentences.
from collections import Counter

counts_atd_words = Counter(list_of_tokens_atd_abstract)

# Plain-dict view of the same counts.
dict_counts_atd_words = {word: total for word, total in counts_atd_words.items()}

In [None]:
# Ordena o dicionario de palavras por valor crescente
# Mostra as palavras que mais se repetem
sorted(dict_counts_atd_words.items(), key=lambda x: x[1])

## Remove as stop words

In [None]:
all_atd_abstract_in_one

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
nltk.download('stopwords')

In [None]:
# English stop words as a set, so each membership test below is a hash lookup.
stop_words = set(stopwords.words('english'))

# Word tokenizer that keeps only runs of word characters, i.e. drops punctuation.
# It is bound to its own name on purpose: the sentence-collection cell above
# relies on the global `tokenizer` still being the Punkt sentence tokenizer, and
# rebinding that name here would change its result when cells are re-run.
word_tokenizer = nltk.RegexpTokenizer(r"\w+")
new_words = word_tokenizer.tokenize(all_atd_abstract_in_one)
word_tokens = new_words

# Keep the original casing of each word, but compare lower-cased against the stop list.
filtered_sentence = [w for w in word_tokens if w.lower() not in stop_words]

print(word_tokens)
print(filtered_sentence)

In [None]:
# Lower-case every token that survived the stop-word filter.
list_of_tokens_atd_abstract_no_ponctuation = list(map(str.lower, filtered_sentence))

# Count the occurrences of each remaining word.
from collections import Counter

counts_atd_words_no_ponctuation = Counter(list_of_tokens_atd_abstract_no_ponctuation)

# Plain-dict view of the counts, consumed by the sorting and word-cloud cells.
dict_counts_atd_words_no_ponctuation = {
    word: total for word, total in counts_atd_words_no_ponctuation.items()
}

In [None]:
# Ordena o dicionario de palavras, sem os stop words, por valor crescente 
sorted(dict_counts_atd_words_no_ponctuation.items(), key=lambda x: x[1])

## Word *cloud*

In [None]:
!git clone https://github.com/amueller/word_cloud.git

In [None]:
!cd word_cloud && pip install .

In [None]:
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt

In [None]:
# Options shared by the two styled word clouds; random_state=1 seeds the layout.
_STYLED_CLOUD_OPTIONS = dict(
    width=1200,
    height=1000,
    random_state=1,
    background_color='black',
    colormap='Set2',
    collocations=False,
)


def _save_wordcloud(wordcloud, name, my_path):
    """Write an already generated word cloud to `<my_path>/<name>.png` and report it."""
    import os

    # Create the output directory on demand so a fresh clone does not fail on save.
    if my_path:
        os.makedirs(my_path, exist_ok=True)
    fileName = my_path + '/' + name + ".png"
    wordcloud.to_file(fileName)
    print(f'Arquivo {fileName} gerado com sucesso!')


def generateWordCloud(name, counterWithFrequency, my_path='qualification/wordcloud'):
    """Render a styled word cloud from a word -> frequency mapping and save it as PNG.

    Args:
        name: file name without extension.
        counterWithFrequency: mapping of word to its frequency.
        my_path: directory that receives the image (created if missing).

    Any failure is reported on stdout instead of being raised.
    """
    try:
        wordcloud = WordCloud(**_STYLED_CLOUD_OPTIONS)
        wordcloud.generate_from_frequencies(frequencies=counterWithFrequency)
        _save_wordcloud(wordcloud, name, my_path)
    except Exception as ex:
        print(f'Erro ao gerar o arquivo {name}: {str(ex)}')


def generateWordCloud2(name, my_text, my_path='qualification/wordcloud'):
    """Render a styled word cloud from raw text and save it as PNG.

    Args:
        name: file name without extension.
        my_text: text whose words are drawn.
        my_path: directory that receives the image (created if missing).

    Any failure is reported on stdout instead of being raised.
    """
    try:
        wordcloud = WordCloud(**_STYLED_CLOUD_OPTIONS)
        wordcloud.generate(my_text)
        _save_wordcloud(wordcloud, name, my_path)
    except Exception as ex:
        print(f'Erro ao gerar o arquivo {name}: {str(ex)}')


def generateWordCloud3(name, my_text, my_path='qualification/wordcloud'):
    """Render a word cloud from raw text with the library's default styling.

    Args:
        name: file name without extension.
        my_text: text whose words are drawn.
        my_path: directory that receives the image (created if missing).

    Any failure is reported on stdout instead of being raised.
    """
    try:
        # The text is laid out exactly once; a second generate() call on the
        # same text would only repeat the work.
        wordcloud = WordCloud(width=1200, height=1000).generate(my_text)
        _save_wordcloud(wordcloud, name, my_path)
    except Exception as ex:
        print(f'Erro ao gerar o arquivo {name}: {str(ex)}')

In [None]:
generateWordCloud('word_cloud_atd_no_ponctuation', dict_counts_atd_words_no_ponctuation)
generateWordCloud2('word_cloud_atd_all_in_one', all_atd_abstract_in_one)
generateWordCloud3('word_cloud_atd_all_in_one2', all_atd_abstract_in_one)

# Analisa apenas os papers que tenham ATD no Abstract

df_all_papers.info()

In [None]:
abstract_with_atd = df_all_papers.abstract.str.contains('Architectural Technical Debt', case=False)
df_atd_papers = df_all_papers[abstract_with_atd]

In [None]:
df_atd_papers.info()

In [None]:
df_atd_papers.groupby(['year']).size()

In [None]:
pd.set_option('display.max_colwidth', 1000)

In [None]:
df_atd_papers[['key', 'year', 'title', 'list_authors']]

In [None]:
# Pair every ATD sentence with the key of each paper whose abstract contains it.
# The (key, abstract) pairs are extracted once instead of calling iterrows()
# again for every sentence.
key_abstract_pairs = list(zip(df_atd_papers['key'], df_atd_papers['abstract']))
list_key_abstract_atd = [
    (paper_key, sentence)
    for sentence in list_atd
    for paper_key, abstract in key_abstract_pairs
    if sentence in abstract
]

# Group the sentences by paper key. A dict keeps insertion order, so the keys
# come out in first-appearance order on every run; de-duplicating through a
# set() would make the order depend on string hashing.
dict_key_abstract_atd = {}
for paper_key, sentence in list_key_abstract_atd:
    dict_key_abstract_atd.setdefault(paper_key, []).append(sentence)

list_of_key = list(dict_key_abstract_atd)
set_of_key = set(list_of_key)

# Build parallel columns (key, authors, year, title, ATD sentences, link) with
# one entry per paper row whose key has at least one ATD sentence.
list_atd_key = []
list_atd_authors = []
list_atd_year = []
list_atd_title = []
list_atd_concept = []
list_atd_link = []
for key, value in dict_key_abstract_atd.items():
    for _, row in df_atd_papers.iterrows():
        if key == row['key']:
            list_atd_key.append(key)
            list_atd_authors.append(row['list_authors'])
            list_atd_year.append(row['year'])
            list_atd_title.append(row['title'])
            list_atd_concept.append(value)
            list_atd_link.append(row['link'])

# Column-oriented dict, ready to be turned into a DataFrame.
dict_key_authors_year_title_atd = {
    'key': list_atd_key,
    'year': list_atd_year,
    'title': list_atd_title,
    'authors': list_atd_authors,
    'atd': list_atd_concept,
    'link': list_atd_link,
}

In [None]:
df_key_authors_year_title_atd = pd.DataFrame(dict_key_authors_year_title_atd)
df_key_authors_year_title_atd.sort_values(by='year')[['year', 'title', 'authors', 'atd']]

In [1]:
def create_html_file(arquivo_html, df_data):
    """Write an HTML report with one section per paper in `df_data`.

    Args:
        arquivo_html: path of the HTML file to create (overwritten if present).
        df_data: DataFrame with the columns 'title', 'authors', 'year', 'atd'
            and 'link'. 'atd' may be a single value or a list/tuple of
            sentences; 'atd' and 'link' may be None/NaN.

    Sections are numbered 1..n in row order, independent of the DataFrame
    index, so a frame that was sorted beforehand is still numbered
    sequentially. Every value is HTML-escaped before it is written.

    Any failure is reported on stdout instead of being raised.
    """
    import html

    def _is_missing(value):
        """Return True for None and for float NaN."""
        return value is None or (isinstance(value, float) and value != value)

    def _as_text(value):
        """Return the HTML-escaped string form of a value ('' when missing)."""
        return '' if _is_missing(value) else html.escape(str(value), quote=True)

    print('Criando o arquivo {}'.format(arquivo_html))
    try:
        with open(arquivo_html, 'w', encoding='utf-8') as file_html:
            file_html.write('<html>')
            file_html.write('<head>')
            file_html.write('<meta charset="utf-8">')
            file_html.write('<title>Lista de Papers mais importantes sobre ATD</title>')
            file_html.write('</head>')
            file_html.write('<body>')
            # enumerate() gives the position in the report; the index label of
            # a sorted frame is not sequential and must not be used as counter.
            for position, (_, row) in enumerate(df_data.iterrows(), start=1):
                file_html.write('<h2>Título ' + str(position) + ': ' + _as_text(row['title']) + '</h2>')
                file_html.write('<h3>Autores: ' + _as_text(row['authors']) + '</h3>')
                file_html.write('<h4>Ano: ' + _as_text(row['year']) + '</h4>')

                atd = row['atd']
                if isinstance(atd, (list, tuple)):
                    # One paragraph per sentence instead of the Python list repr.
                    abstract = ''.join('<p>' + _as_text(sentence) + '</p>' for sentence in atd)
                elif _is_missing(atd):
                    abstract = ''
                else:
                    abstract = '<p>' + _as_text(atd) + '</p>'
                file_html.write(abstract)
                file_html.write('<br>')

                if _is_missing(row['link']):
                    link = ''
                else:
                    link = '<a href="' + _as_text(row['link']) + '">link</a>'
                file_html.write(link)
                file_html.write('<br>')
            file_html.write('</body>')
            file_html.write('</html>')
            print('Arquivo criado com sucesso!')
    except Exception as e:
        print('Erro {} ao criar o arquivo {}'.format(str(e), arquivo_html))

In [None]:
df_key_authors_year_title_atd = df_key_authors_year_title_atd.sort_values(by='year')

In [None]:
create_html_file(arquivo_html='qualification/html/atd_concepts.html', df_data=df_key_authors_year_title_atd)