# Importamos las librerías necesarias

In [None]:
import pandas as pd
import numpy as np
import random
import math
import re
import warnings

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm, naive_bayes
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score


import gensim
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Cargamos la base de datos y definimos las categorías existentes en ella

# Definimos función que preprocesa la base de datos

In [None]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loaded only once."""
    return frozenset(nltk.corpus.stopwords.words('english'))


def preprocess(sentence):
    """Normalize a sentence into a space-separated string of stemmed tokens.

    Steps, in order: strip punctuation, strip digits, lowercase, tokenize
    with NLTK, drop English stop words, and apply Porter stemming.

    Args:
        sentence: Raw text to clean. Must be a ``str``; ``re.sub`` raises
            ``TypeError`` for anything else (e.g. a NaN cell).

    Returns:
        The stemmed tokens joined by single spaces (``''`` when no token
        survives the filtering).
    """
    # Remove punctuation: anything that is neither a word char nor whitespace.
    new_text = re.sub(r'[^\w\s]', '', sentence)
    # Remove numbers.
    new_text = re.sub(r'\d+', '', new_text)
    # Lowercase before tokenizing so the stop-word lookup matches.
    new_text = new_text.lower()
    # Split into tokens.
    tokens = nltk.tokenize.word_tokenize(new_text)

    # Drop stop words. The list is fetched once and kept as a set: the
    # original re-read it for every token and did a linear list search.
    stop_words = _english_stopwords()
    tokens = [word for word in tokens if word not in stop_words]

    # Stemming. A WordNet lemmatizer (nltk.stem.WordNetLemmatizer) or no
    # normalization at all are the alternatives that were tried here.
    stemmer = nltk.stem.PorterStemmer()
    return ' '.join(stemmer.stem(word) for word in tokens)

# Definimos función que transforma el texto preprocesado en un vector de números

In [None]:
def useTfIdf(db, i_test, f_test):
    """Split ``db`` into a train and a test fold and vectorize them with TF-IDF.

    The rows at positions ``i_test`` (inclusive) to ``f_test`` (exclusive)
    form the test fold; every other row goes to the training fold. The
    vectorizer is fitted on the training sentences only and then applied to
    the test sentences.

    Args:
        db: DataFrame with a ``sentences`` column holding the text. All the
            remaining columns are returned untouched as the label frames.
        i_test: Position of the first test row.
        f_test: Position one past the last test row.

    Returns:
        A tuple ``(x_train, y_train, x_test, y_test)`` where the ``x_*``
        values are the TF-IDF matrices and the ``y_*`` values are ``db``
        without the ``sentences`` column, restricted to each fold.
    """
    # Select both folds by position with a single mask. The original dropped
    # the training rows by index *label* but sliced the test rows by
    # *position*, which only agrees when the index is exactly 0..n-1 and
    # raised KeyError when the range ran past the end of the frame.
    in_test = np.zeros(len(db), dtype=bool)
    in_test[i_test:f_test] = True
    train = db[~in_test]
    test = db[in_test]

    vectorizer = TfidfVectorizer()

    # Fit on the training text only so no test vocabulary leaks into the model.
    x_train = vectorizer.fit_transform(train['sentences'].tolist())
    y_train = train.drop(labels=['sentences'], axis=1)
    x_test = vectorizer.transform(test['sentences'].tolist())
    y_test = test.drop(labels=['sentences'], axis=1)

    return x_train, y_train, x_test, y_test

# Entrenamos modelo, validamos y mostramos resultados de precisión, sensibilidad y F1

In [None]:
# Train one binary classifier per category with repeated k-fold
# cross-validation and report the mean precision, recall and F1 per category.

# Silences every warning raised from here on (this includes the
# undefined-metric warnings scikit-learn emits when a fold has no positives).
warnings.filterwarnings("ignore")
RUTA_DB = "./drive/My Drive/xlsx/db.xlsx"
db = pd.read_excel(RUTA_DB)
# One label column per category is read from ``db`` below.
# NOTE(review): the columns are presumably 0/1 indicators, since the metrics
# are computed with their default binary averaging -- confirm against the file.
categories = [
        "access control", "audit", "availability", "legal", "look and feel",
        "maintainability", "operational", "privacy", "recoverability", "capacity and performance",
        "reliability", "security", "usability", "other nonfunctional", "functional", "not applicable"
    ]

# Number of folds per cross-validation run.
k = 10
# Number of times the shuffled k-fold run is repeated.
# NOTE(review): the original loop read ``iterations`` without defining it
# anywhere in this file (NameError); 1 is an assumed default -- confirm.
iterations = 1

# Drop incomplete rows *before* preprocessing: ``preprocess`` needs a string
# and would fail on a NaN sentence.
db = db.dropna().reset_index(drop=True)
db['sentences'] = db['sentences'].apply(preprocess)

# One row of per-category scores for every evaluated fold, across all
# repetitions. (The original pre-filled these with aliased rows, hard-coded
# 16 categories and overwrote them on each repetition, so only the last
# repetition was ever averaged.)
mean_precision = []
mean_recall = []
mean_f1score = []

# NOTE: with floor division the last ``len(db) % k`` rows of each shuffle are
# never used as test data; they always stay in the training fold.
fold_size = math.floor(len(db) / k)

for _ in range(iterations):
    # Reshuffle and renumber the rows so the folds differ between repetitions.
    db = db.sample(frac=1).reset_index(drop=True)

    for index_k in range(k):
        i_test = index_k * fold_size
        f_test = i_test + fold_size
        x_train, y_train, x_test, y_test = useTfIdf(db, i_test, f_test)

        # Alternatives that were tried:
        #   KNeighborsClassifier(n_neighbors=1)
        #   naive_bayes.MultinomialNB()
        #   RandomForestClassifier(n_jobs=-1)
        model = svm.LinearSVC()

        precision = []
        recall = []
        f1score = []
        # The same estimator object is refitted for each category.
        for category in categories:
            model.fit(x_train, y_train[category])
            prediction = model.predict(x_test)
            precision.append(precision_score(y_test[category], prediction))
            recall.append(recall_score(y_test[category], prediction))
            f1score.append(f1_score(y_test[category], prediction))

        mean_precision.append(precision)
        mean_recall.append(recall)
        mean_f1score.append(f1score)

# Average over folds (axis 0) to get one value per category.
print("Precisión")
print(np.average(mean_precision, axis=0))

print("\nSensibilidad")
print(np.average(mean_recall, axis=0))

print("\nF1")
print(np.average(mean_f1score, axis=0))

Precision
[0.79861144 0.79543088 0.63333333 0.86223332 0.90833333 0.79518673
 0.72585082 0.81060593 0.69166667 0.90321429 0.25       0.79188775
 0.93571429 0.31666667 0.8718735  0.91403368]

Recall
[0.61873664 0.46420126 0.2424359  0.45668281 0.30889971 0.450104
 0.34703541 0.4524161  0.30095238 0.57819112 0.10333333 0.51603175
 0.40612554 0.06746753 0.83735831 0.92396528]

F1 Score
[0.6955894  0.58225447 0.33864613 0.58114454 0.44056999 0.57307822
 0.4562281  0.57358279 0.38621212 0.68643978 0.14       0.62289839
 0.56196018 0.10621212 0.85416724 0.91891778]
