<a href="https://colab.research.google.com/github/Technical-Debt-Large-Scale/qualification/blob/main/python/analysis/extractionatd.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Configuração

In [None]:
# Clone the repository that contains the spreadsheet analysed below;
# the data path used later is relative to the resulting ./qualification folder.
!git clone https://github.com/Technical-Debt-Large-Scale/qualification.git

In [3]:
import os
import pandas as pd

# Allow up to 500 characters per column when displaying frames, so long
# text columns are not cut off.
pd.options.display.max_colwidth = 500

# Spreadsheet to analyse, located inside the cloned `qualification` repository.
file_xls_to_scan = 'qualification/xls/mergepatdsp.xls'

# Leitura dos dados

In [None]:
!pip install --upgrade xlrd

In [4]:
# Load the spreadsheet referenced by file_xls_to_scan into a DataFrame.
df_my_xls_data = pd.read_excel(file_xls_to_scan)

In [5]:
# Build a year-sorted copy of the raw sheet without touching df_my_xls_data:
# 'year' is coerced to a numeric dtype first so the ordering is numeric.
df_bib_sort_by_year = (
  df_my_xls_data
  .assign(year=lambda frame: pd.to_numeric(frame['year']))
  .sort_values('year')
)

# Working frame for the rest of the notebook (independent copy of the sorted data).
df_data = df_bib_sort_by_year.copy()

In [None]:
# Preview the first three rows of the year-sorted frame.
df_data.head(3)

# Analisa os conteúdos dos Abstracts

In [None]:
# Keep only the bibliographic columns used below and discard every row that
# has a missing value in any of them (this includes rows without an abstract).
df_data = (
  df_data
  .loc[:, ['key', 'year', 'list_authors', 'title', 'abstract']]
  .dropna()
)
df_data.head(3)

In [None]:
# Instala a suite NLTK https://www.nltk.org/
!pip install nltk

In [None]:
import nltk

In [None]:
# Punkt tokenizer data. Older NLTK releases read the 'punkt' package, while
# recent releases (3.9+) look up the pickle-free 'punkt_tab' tables when
# word_tokenize / sentence tokenization is used, so both are fetched.
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
# Gather every abstract and merge them into one space-separated string.
list_abstract = df_data['abstract'].tolist()
all_abstract_in_one = ' '.join(list_abstract)

# Split the merged text into tokens and display them.
tokens_abstract = nltk.word_tokenize(all_abstract_in_one)
tokens_abstract

In [None]:
# Display every token of the merged abstracts in lower case (result is not stored).
list(map(str.lower, tokens_abstract))

In [None]:
# Build the English Punkt sentence splitter used to break abstracts into sentences.
try:
  # Recent NLTK releases (3.9+) replaced the pickled Punkt model with the
  # 'punkt_tab' data and expose it through PunktTokenizer.
  from nltk.tokenize import PunktTokenizer
except ImportError:
  # Older NLTK: load the pickled English model from the 'punkt' package.
  tokenizer = nltk.data.load('nltk:tokenizers/punkt/english.pickle')
else:
  # Make sure the data PunktTokenizer reads is available.
  nltk.download('punkt_tab', quiet=True)
  tokenizer = PunktTokenizer('english')

# Quick check of the sentence splitter on a small sample text.
tokenizer.tokenize('Hello.  This is a test.  It works!')

In [None]:
# Collect every sentence, across all abstracts, that mentions
# "Architectural Technical Debt" (case-insensitive match).
# The original casing of each matching sentence is preserved.
atd_phrase = 'Architectural Technical Debt'.lower()
list_atd = [
  sentence
  for abstract_text in list_abstract
  for sentence in tokenizer.tokenize(abstract_text)
  if atd_phrase in sentence.lower()
]

In [None]:
# Merge all ATD sentences into one space-separated string.
all_atd_abstract_in_one = ' '.join(list_atd)

# Tokenize the merged ATD text.
tokens_atd_abstract = nltk.word_tokenize(all_atd_abstract_in_one)

# Lower-cased version of every ATD token.
list_of_tokens_atd_abstract = list(map(str.lower, tokens_atd_abstract))

In [None]:
from collections import Counter

# Count how often each lower-cased token occurs in the ATD sentences.
counts_atd_words = Counter(list_of_tokens_atd_abstract)

# Plain-dict copy of the tally (same keys, counts and order).
dict_counts_atd_words = {word: total for word, total in counts_atd_words.items()}

In [None]:
# List the (word, count) pairs ordered by ascending count, so the most
# frequent words appear at the end of the output.
sorted(dict_counts_atd_words.items(), key=lambda pair: pair[1])

## Remove os stop words

all_atd_abstract_in_one

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
# Fetch the NLTK stop-word lists read by stopwords.words('english') below.
nltk.download('stopwords')

In [None]:
# English stop words to exclude from the word counts.
stop_words = set(stopwords.words('english'))

# Keep only runs of word characters, which discards punctuation tokens.
# A dedicated name is used so the Punkt sentence splitter bound to `tokenizer`
# is not overwritten: rebinding it would make a re-run of the
# sentence-extraction cell split abstracts into words and find no ATD sentence.
word_only_tokenizer = nltk.RegexpTokenizer(r"\w+")
new_words = word_only_tokenizer.tokenize(all_atd_abstract_in_one)
word_tokens = new_words

# Case-insensitive stop-word filter; the original casing of kept words is preserved.
filtered_sentence = [w for w in word_tokens if w.lower() not in stop_words]

print(word_tokens)
print(filtered_sentence)

In [None]:
from collections import Counter

# Lower-case every ATD token that survived the stop-word filter.
list_of_tokens_atd_abstract_no_ponctuation = list(map(str.lower, filtered_sentence))

# Count occurrences per word, then keep a plain-dict copy of the tally.
counts_atd_words_no_ponctuation = Counter(list_of_tokens_atd_abstract_no_ponctuation)
dict_counts_atd_words_no_ponctuation = {**counts_atd_words_no_ponctuation}

In [None]:
# List the stop-word-free (word, count) pairs ordered by ascending count.
sorted(dict_counts_atd_words_no_ponctuation.items(), key=lambda pair: pair[1])

## Word *cloud*

!git clone https://github.com/amueller/word_cloud.git

In [None]:
!cd word_cloud && pip install .

In [None]:
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt

In [None]:
# Generate a word cloud image from a word -> frequency mapping
def generateWordCloud(name, counterWithFrequency, my_path='qualification/wordcloud'):
  """Render a word cloud from word frequencies and save it as a PNG file.

  Args:
    name: Base name of the output file; ".png" is appended.
    counterWithFrequency: Mapping of word -> frequency, passed to
      WordCloud.generate_from_frequencies.
    my_path: Directory that receives the image. It is created when missing.

  Returns:
    None. The outcome is reported with print(); failures are caught and
    reported (including the error detail) instead of being raised, so one
    failed image does not stop the notebook.
  """
  try:
    # Make sure the output folder exists so saving does not fail because of it.
    if my_path:
      os.makedirs(my_path, exist_ok=True)
    wordcloud = WordCloud(width = 1200, height = 1000, random_state=1, background_color='black', colormap='Set2', collocations=False)
    wordcloud.generate_from_frequencies(frequencies=counterWithFrequency)
    # Save the rendered image to disk
    fileName = my_path + '/' + name + ".png"
    wordcloud.to_file(fileName)
    print(f'Arquivo {fileName} gerado com sucesso!')
  except Exception as ex:
    # Report the cause as well; a bare message hides why the image was not written.
    print(f'Erro ao gerar o arquivo {name}: {ex}')

# Generate a word cloud image from raw text (styled variant)
def generateWordCloud2(name, my_text, my_path='qualification/wordcloud'):
  """Render a word cloud from free text and save it as a PNG file.

  Uses a black background, the 'Set2' colormap, a fixed random_state and
  collocations disabled.

  Args:
    name: Base name of the output file; ".png" is appended.
    my_text: Text passed to WordCloud.generate.
    my_path: Directory that receives the image. It is created when missing.

  Returns:
    None. The outcome is reported with print(); failures are caught and
    reported (including the error detail) instead of being raised.
  """
  try:
    # Make sure the output folder exists so saving does not fail because of it.
    if my_path:
      os.makedirs(my_path, exist_ok=True)
    wordcloud = WordCloud(width = 1200, height = 1000, random_state=1, background_color='black', colormap='Set2', collocations=False)
    wordcloud.generate(my_text)
    # Save the rendered image to disk
    fileName = my_path + '/' + name + ".png"
    wordcloud.to_file(fileName)
    print(f'Arquivo {fileName} gerado com sucesso!')
  except Exception as ex:
    # Report the cause as well; a bare message hides why the image was not written.
    print(f'Erro ao gerar o arquivo {name}: {ex}')

# Generate a word cloud image from raw text (library-default styling)
def generateWordCloud3(name, my_text, my_path='qualification/wordcloud'):
  """Render a word cloud from free text with default styling and save it as PNG.

  Only the canvas size (1200x1000) is set; every other WordCloud option is
  left at the library default.

  Args:
    name: Base name of the output file; ".png" is appended.
    my_text: Text passed to WordCloud.generate.
    my_path: Directory that receives the image. It is created when missing.

  Returns:
    None. The outcome is reported with print(); failures are caught and
    reported (including the error detail) instead of being raised.
  """
  try:
    # Make sure the output folder exists so saving does not fail because of it.
    if my_path:
      os.makedirs(my_path, exist_ok=True)
    wordcloud = WordCloud(width=1200, height=1000)
    # Generate once; a second generate() call would only redo the layout.
    wordcloud.generate(my_text)
    # Save the rendered image to disk
    fileName = my_path + '/' + name + ".png"
    wordcloud.to_file(fileName)
    print(f'Arquivo {fileName} gerado com sucesso!')
  except Exception as ex:
    # Report the cause as well; a bare message hides why the image was not written.
    print(f'Erro ao gerar o arquivo {name}: {ex}')

In [None]:
# Render three word-cloud images into the default output folder:
#  1) from the stop-word-free frequency dictionary,
#  2) from the merged ATD sentences with the styled settings,
#  3) from the merged ATD sentences with default styling (only the size is set).
generateWordCloud('word_cloud_atd_no_ponctuation', dict_counts_atd_words_no_ponctuation)
generateWordCloud2('word_cloud_atd_all_in_one', all_atd_abstract_in_one)
generateWordCloud3('word_cloud_atd_all_in_one2', all_atd_abstract_in_one)