In [175]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from pymongo import MongoClient
from nltk.corpus import stopwords
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
import bson

In [176]:
with open('/home/sabinogs/projetos/tcc/dump/test/reviews.bson', 'rb') as b:
    df = pd.DataFrame(bson.decode_all(b.read()))

In [177]:
comentarios = df[df['requirement_type'].isin(['non-functional requirement', 'other', 'functional requirement'])]
df = df[['reviewText','summary','requirement_type']]
df['pre'] = pd.Series()
comentarios = comentarios[['reviewText','summary','requirement_type']]

In [178]:
comentarios.shape

(4784, 3)

In [179]:
def remove_stopwords_sentences(sentences):
    word_without_stop = list()
    for sent in sentences:
        aux = list()
        for word in sent:
            if word not in stopwords.words('english'):
                aux.append(word)
        word_without_stop.append(aux)
    
    return word_without_stop


def remove_stopwords(words):
    return [word for word in words if word not in stopwords.words('english')]

def lemmatize_sentences(sentences):
    lemma = nltk.WordNetLemmatizer()
    lw = list()
    for sent in sentences:
        aux = list()
        for word in sent:
            aux.append(lemma.lemmatize(word))
        lw.append(aux)
    
    return lw

def lemmatize_it(words):
    lemma = nltk.WordNetLemmatizer()
    return [lemma.lemmatize(word) for word in words]


def remove_pon(doc):
    tokenizer = RegexpTokenizer(r'\w+')
    return tokenizer.tokenize(doc)

def _stem_it(doc):
    stem = PorterStemmer()
    return [stem.stem(word) for word in doc]


In [180]:
pre = comentarios['reviewText'].apply(remove_pon).apply(_stem_it).apply(lemmatize_it)
pre_summary = comentarios['summary'].apply(remove_pon).apply(_stem_it).apply(lemmatize_it)

In [181]:
pre = pre + pre_summary

In [182]:
comentarios['pre'] = pre
comentarios['pre'] = comentarios['pre'].apply(lambda x: ' '.join(x))
comentarios.head()

Unnamed: 0,reviewText,summary,requirement_type,pre
0,How dissapointing..downloaded and found it doe...,Extreme Dissappointment,non-functional requirement,how dissapoint download and found it doe not a...
1,This is great for kids my two year old son lov...,great,other,thi is great for kid my two year old son love ...
2,"Loves the song, so he really couldn't wait to ...",Really cute,non-functional requirement,love the song so he realli couldn t wait to pl...
3,My three year old Plays this game the most he ...,Five little monkeys,other,My three year old play thi game the most he lo...
4,As a Speech language pathology Assistant I hav...,My patients request this app everytime they se...,other,As a speech languag patholog assist I have a v...


In [183]:
c = comentarios[comentarios['requirement_type'] != 'other']
# c['pre'] = c['pre'].apply(lambda x: ' '.join(x))

In [184]:
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html


Para maior referencia, olhe os links acima

In [185]:
X = c['pre'].tolist()
y = c['requirement_type'].tolist()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [186]:
text_classifier = Pipeline([
    ('tfidf',TfidfVectorizer()),
    ('clf',MultinomialNB()),
])

In [187]:
text_classifier.fit(X_train,y_train)
predicted = text_classifier.predict(X_test)

In [188]:
print(classification_report(y_test,predicted))

                            precision    recall  f1-score   support

    functional requirement       0.64      1.00      0.78       273
non-functional requirement       1.00      0.02      0.04       159

                 micro avg       0.64      0.64      0.64       432
                 macro avg       0.82      0.51      0.41       432
              weighted avg       0.77      0.64      0.51       432



# Um Classificador para identificar todas as categorias (Experimento 1)

In [189]:
comentarios['requirement_type'].value_counts()

other                         3347
functional requirement         928
non-functional requirement     509
Name: requirement_type, dtype: int64

In [190]:
X = comentarios['pre'].tolist()
y = comentarios['requirement_type'].tolist()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

text_classifier_exp1 = Pipeline([
    ('tfidf',TfidfVectorizer()),
    ('clf',MultinomialNB()),
])

text_classifier_exp1.fit(X_train,y_train)
predicted_exp1 = text_classifier_exp1.predict(X_test)
print(classification_report(y_test, predicted_exp1))

                            precision    recall  f1-score   support

    functional requirement       1.00      0.01      0.03       288
non-functional requirement       0.00      0.00      0.00       149
                     other       0.70      1.00      0.82       999

                 micro avg       0.70      0.70      0.70      1436
                 macro avg       0.57      0.34      0.28      1436
              weighted avg       0.69      0.70      0.58      1436



  'precision', 'predicted', average, warn_for)


# Nessa seção criarei um classificador para cada Requisito



Isso se dá pela necessidade de aumentar o desempenho do classificador. Para isso, será criado 3 classificadores que: 

1. Identificará se um comentário é do tipo "Funcional" ou não
2. Identificará se um comentário é do tipo "Não Funcional" ou não
3. Identificará se um comentário é do tipo "Outro" ou não


Observe o uso do `DataFrame.copy()`. Isso deve-se ao fato de que ao realizar uma atribuição de dataframe `novoDF = velho` copiamos apenas a referência e isso implicará que mudar uma coisa no `novoDF` muda tbm no `velho`. 



Ref: 

1. https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.copy.html

In [191]:

nao_funcionais = comentarios.copy(deep=True)
outros = comentarios.copy(deep=True)

In [192]:
def transform_dataset(documento, requirement_type):
    if documento == requirement_type:
        return 'sim'
    else:
        return 'nao'

In [193]:



nao_funcionais['requirement_type'] = nao_funcionais['requirement_type'].apply(
    lambda x: transform_dataset(x,'non-functional requirement')
)


outros['requirement_type'] = outros['requirement_type'].apply(
    lambda x: transform_dataset(x,'other')
)


# Experimento 2

## Dataset e Classificador para FUNCIONAIS

In [194]:
funcionais = comentarios.copy(deep=True)
funcionais['requirement_type'] = funcionais['requirement_type'].apply(
    lambda x: transform_dataset(x,'functional requirement')
)

In [195]:
funcionais['requirement_type'].value_counts()

nao    3856
sim     928
Name: requirement_type, dtype: int64

In [196]:
_pre_review = funcionais['reviewText'].apply(remove_pon).apply(_stem_it).apply(lemmatize_it).apply(remove_stopwords)
_pre_summary = funcionais['summary'].apply(remove_pon).apply(_stem_it).apply(lemmatize_it).apply(remove_stopwords)

pre = _pre_review + _pre_summary
funcionais['pre'] = pre
funcionais['pre'] = funcionais['pre'].apply(lambda x: ' '.join(x))

X_funcionais = funcionais['pre'].tolist()
y_funcionais = funcionais['requirement_type'].tolist()

X_train, X_test, y_train, y_test = train_test_split(X_funcionais, y_funcionais, test_size=0.3, random_state=42)

text_classifier_funcionais = Pipeline([
    ('tfidf',TfidfVectorizer()),
    ('clf',MultinomialNB()),
])

text_classifier_funcionais.fit(X_train, y_train)
predicted_funcionais = text_classifier_funcionais.predict(X_test)
print(classification_report(y_test, predicted_funcionais))

              precision    recall  f1-score   support

         nao       0.80      1.00      0.89      1148
         sim       1.00      0.01      0.01       288

   micro avg       0.80      0.80      0.80      1436
   macro avg       0.90      0.50      0.45      1436
weighted avg       0.84      0.80      0.71      1436



## Dataset e Classificador para NAO FUNCIONAIS

In [197]:
nao_funcionais = comentarios.copy(deep=True)
nao_funcionais['requirement_type'] = nao_funcionais['requirement_type'].apply(
    lambda x: transform_dataset(x,'non-functional requirement')
)

In [198]:
nao_funcionais['requirement_type'].value_counts()

nao    4275
sim     509
Name: requirement_type, dtype: int64

In [199]:
_pre_review = nao_funcionais['reviewText'].apply(remove_pon).apply(_stem_it).apply(lemmatize_it).apply(remove_stopwords)
_pre_summary = nao_funcionais['summary'].apply(remove_pon).apply(_stem_it).apply(lemmatize_it).apply(remove_stopwords)

pre = _pre_review + _pre_summary
nao_funcionais['pre'] = pre
nao_funcionais['pre'] = nao_funcionais['pre'].apply(lambda x: ' '.join(x))

X_nao_funcionais = nao_funcionais['pre'].tolist()
y_nao_funcionais = nao_funcionais['requirement_type'].tolist()

X_train, X_test, y_train, y_test = train_test_split(X_nao_funcionais, y_nao_funcionais, test_size=0.3, random_state=42)

text_classifier_nao_funcionais = Pipeline([
    ('tfidf',TfidfVectorizer()),
    ('clf',MultinomialNB()),
])

text_classifier_nao_funcionais.fit(X_train, y_train)
predicted_nao_funcionais = text_classifier_nao_funcionais.predict(X_test)
print(classification_report(y_test, predicted_nao_funcionais))

              precision    recall  f1-score   support

         nao       0.90      1.00      0.95      1287
         sim       0.00      0.00      0.00       149

   micro avg       0.90      0.90      0.90      1436
   macro avg       0.45      0.50      0.47      1436
weighted avg       0.80      0.90      0.85      1436



  'precision', 'predicted', average, warn_for)


# Experimento 3

## Criando datasets balanceados

### Funcionais

In [200]:
index_array = funcionais[funcionais['requirement_type'] == 'nao'].index
funcionais_to_be_removed = np.random.choice(index_array,int(3*len(index_array)/4))

funcionais_balanceado = funcionais.drop(funcionais_to_be_removed)
funcionais_balanceado['requirement_type'].value_counts()

nao    1864
sim     928
Name: requirement_type, dtype: int64

### Não funcionais

In [201]:
index_array = nao_funcionais[nao_funcionais['requirement_type'] == 'nao'].index
nao_funcionais_to_be_removed = np.random.choice(index_array,int(3*len(index_array)/4))

nao_funcionais_balanceado = nao_funcionais.drop(nao_funcionais_to_be_removed)
nao_funcionais_balanceado['requirement_type'].value_counts()

nao    2009
sim     509
Name: requirement_type, dtype: int64

# Criando os classificadores balanceados

### Funcionais

In [202]:
_pre_review = funcionais_balanceado['reviewText'].apply(remove_pon).apply(_stem_it).apply(lemmatize_it).apply(remove_stopwords)
_pre_summary = funcionais_balanceado['summary'].apply(remove_pon).apply(_stem_it).apply(lemmatize_it).apply(remove_stopwords)

pre = _pre_review + _pre_summary
funcionais_balanceado['pre'] = pre
funcionais_balanceado['pre'] = funcionais_balanceado['pre'].apply(lambda x: ' '.join(x))

X_funcionais = funcionais_balanceado['pre'].tolist()
y_funcionais = funcionais_balanceado['requirement_type'].tolist()

X_train, X_test, y_train, y_test = train_test_split(X_funcionais, y_funcionais, test_size=0.3, random_state=42)

text_classifier_funcionais_balanceados = Pipeline([
    ('tfidf',TfidfVectorizer()),
    ('clf',MultinomialNB()),
])

text_classifier_funcionais_balanceados.fit(X_train, y_train)
predicted_funcionais_balanceados = text_classifier_funcionais_balanceados.predict(X_test)
print(classification_report(y_test, predicted_funcionais_balanceados))

              precision    recall  f1-score   support

         nao       0.73      0.99      0.84       577
         sim       0.91      0.19      0.32       261

   micro avg       0.74      0.74      0.74       838
   macro avg       0.82      0.59      0.58       838
weighted avg       0.79      0.74      0.68       838



### Não funcionais

In [203]:
_pre_review = nao_funcionais_balanceado['reviewText'].apply(remove_pon).apply(_stem_it).apply(lemmatize_it).apply(remove_stopwords)
_pre_summary = nao_funcionais_balanceado['summary'].apply(remove_pon).apply(_stem_it).apply(lemmatize_it).apply(remove_stopwords)

pre = _pre_review + _pre_summary
nao_funcionais_balanceado['pre'] = pre
nao_funcionais_balanceado['pre'] = nao_funcionais_balanceado['pre'].apply(lambda x: ' '.join(x))

X_nao_funcionais = nao_funcionais_balanceado['pre'].tolist()
y_nao_funcionais = nao_funcionais_balanceado['requirement_type'].tolist()

X_train, X_test, y_train, y_test = train_test_split(X_nao_funcionais, y_nao_funcionais, test_size=0.3, random_state=42)

text_classifier_nao_funcionais_balanceados = Pipeline([
    ('tfidf',TfidfVectorizer()),
    ('clf',MultinomialNB()),
])

text_classifier_nao_funcionais_balanceados.fit(X_train, y_train)
predicted_nao_funcionais_balanceados = text_classifier_nao_funcionais_balanceados.predict(X_test)
print(classification_report(y_test, predicted_nao_funcionais_balanceados))

              precision    recall  f1-score   support

         nao       0.79      1.00      0.88       594
         sim       1.00      0.01      0.01       162

   micro avg       0.79      0.79      0.79       756
   macro avg       0.89      0.50      0.45       756
weighted avg       0.83      0.79      0.69       756



# Predict com uso dos classificadores balanceados.


# Model Persistence

In [204]:
import pickle

In [205]:
# text_classifier_nao_funcionais_balanceados.predict()
# text_classifier_funcionais_balanceados.predict()
# pi = pickle.dumps(text_classifier_funcionais_balanceados)

with open('../classificadores/text_classifier_nao_funcionais_balanceados', 'wb') as file:
    pickle.dump(text_classifier_nao_funcionais_balanceados, file)
    
with open('../classificadores/text_classifier_funcionais_balanceados', 'wb') as file:
    pickle.dump(text_classifier_funcionais_balanceados, file)

In [206]:
with open('../classificadores/text_classifier_funcionais_balanceados', 'rb') as file:
    functional_model = pickle.load(file)
    
with open('../classificadores/text_classifier_nao_funcionais_balanceados', 'rb') as file:
    nao_functional_model = pickle.load(file)

In [207]:
with open('/home/sabinogs/projetos/tcc/dump/test/reviews.bson', 'rb') as b:
    new_comments = pd.DataFrame(bson.decode_all(b.read()))

In [208]:
comments_manually_classified = new_comments[new_comments['requirement_type'].isin(['non-functional requirement', 'other', 'functional requirement'])]

comments_to_classify = new_comments.drop(comments_manually_classified.index)
comments_to_classify = comments_to_classify[['reviewText','summary','requirement_type']]

comments_to_classify = comments_to_classify[:5000]
_reviewText =  comments_to_classify['reviewText'].apply(remove_pon).apply(_stem_it).apply(lemmatize_it).apply(lambda x: ' '.join(x))
_summary =  comments_to_classify['summary'].apply(remove_pon).apply(_stem_it).apply(lemmatize_it).apply(lambda x: ' '.join(x))

comments_to_classify['pre'] = _reviewText + _summary

In [209]:
np.mean(comments_to_classify['reviewText'].str.len())

245.6634

In [210]:
predict_funcional = functional_model.predict(comments_to_classify['pre'])
predict_nao_funcional = nao_functional_model.predict(comments_to_classify['pre']) 

In [211]:
# predict_nao_funcional
comments_to_classify['predict nao funcional'] = predict_nao_funcional
comments_to_classify['predict funcional'] = predict_funcional

In [212]:
def classification_multilabel(document):
    
    if document['predict nao funcional'] == 'sim' and document['predict funcional'] == 'nao':
              return 'non-functional requirement'

    elif document['predict nao funcional'] == 'nao' and document['predict funcional'] == 'sim':
              return 'functional requirement'
            
    elif document['predict nao funcional'] == 'nao' and document['predict funcional'] == 'nao':
              return 'other'
    else:
        return 'non-functional requirement'

In [213]:
comments_to_classify['Predicao Intermediaria'] = comments_to_classify.apply(classification_multilabel, axis=1)
np.unique(comments_to_classify['Predicao Intermediaria'])

array(['functional requirement', 'other'], dtype=object)

In [214]:
comments_to_classify['Predicao Intermediaria'].value_counts()

other                     4709
functional requirement     291
Name: Predicao Intermediaria, dtype: int64

In [215]:
np.unique(predict_nao_funcional)

array(['nao'], dtype='<U3')