In [2]:
import pandas as pd
import re
from trieregex import TrieRegEx as TRE
import spacy
nlp = spacy.load('en_core_web_sm')



In [3]:
# lemmatization function
def lemma(lista):
    lemmatiz = []
    for i in lista:
        doc = nlp(i)
        tokens = []
        for token in doc:
            tokens.append(token)
        lemmatized_sentence = " ".join([token.lemma_ for token in doc])
        lemmatiz.append(lemmatized_sentence)
    return lemmatiz

In [52]:
# import the software reviews dataset (already lemmatized)
df = pd.read_excel(r'..\data\software_review.xlsx')
df = df[["text"]]
df = df.reset_index(drop=True)
df.head(5)

Unnamed: 0,text
0,nice and easy but folder create function be no...
1,no don t install this in your mobile not at al...
2,very speedy app easy to use app that be quick ...
3,I absolutely love this app I have try other ap...
4,I be happy with the app but I have one problem...


In [31]:
# import the lexicon of customer needs (the stemming of system quality attributes)
gaz_need = pd.read_excel(r"..\data\system_q_a.xlsx")
gaz_need.head(5)

Unnamed: 0,word
0,access
1,accountab
2,accura
3,adapt
4,administrab


In [32]:
# import the lexicon of user categories
gaz_user = pd.read_excel(r"..\data\user_lexicon.xlsx")

# lemmatization
gaz_user["user"] = lemma(gaz_user["user"])
gaz_user.head(5)

Unnamed: 0,user
0,technical director
1,metal drawing machine operator
2,precision device inspector
3,air traffic safety technician
4,hospitality revenue manager


<h3> Customer needs extraction </h3>

In [45]:
pattern = [x.lower() for x in gaz_need["word"].tolist()]
pattern = list(set(pattern))
# Initialize class instance
tre = TRE()

# Add word(s)
tre = TRE(*pattern)

# Create regex pattern from the trie
regex_for_words = tre.regex()
regex_for_words_with_b = f'\\b{regex_for_words}\\b'

In [51]:
extracted_concepts = []
extracted_concepts_string = []
for i in df["text"]:
    if type(i) == float:
        found_concept = []
    else:
        found_concept = re.findall(regex_for_words, i)
    extracted_concepts_string.append(",".join(list(set(found_concept))))
    extracted_concepts.append(found_concept)

In [50]:
extracted_concepts

[['availab'],
 ['mobil'],
 ['effecti'],
 ['availab'],
 ['secur', 'secur'],
 ['access', 'secur', 'availab'],
 ['availab'],
 ['usab'],
 ['safe'],
 ['access'],
 ['availab'],
 ['access'],
 ['secur'],
 ['simpl'],
 ['understandab'],
 ['simpl',
  'availab',
  'effici',
  'reliab',
  'failure',
  'predictab',
  'reliab',
  'simpl',
  'effici',
  'effecti',
  'failure',
  'simpl'],
 ['access'],
 ['availab'],
 ['reliab'],
 ['availab'],
 ['reliab'],
 ['access'],
 ['traceab',
  'access',
  'access',
  'access',
  'scalab',
  'scalab',
  'resili',
  'scalab',
  'access',
  'access',
  'access'],
 ['effici'],
 ['failure'],
 ['effecti'],
 ['secur'],
 ['simpl'],
 ['access'],
 ['safe'],
 ['stab', 'mobil'],
 ['usab'],
 ['secur', 'access'],
 ['secur'],
 ['availab', 'access'],
 ['secur'],
 ['availab'],
 ['simpl'],
 ['safe', 'secur'],
 ['simpl'],
 ['secur'],
 ['simpl',
  'access',
  'effecti',
  'access',
  'effecti',
  'access',
  'access',
  'access',
  'secur',
  'simpl',
  'secur',
  'access'],
 ['acce

In [53]:
df["extracted_needs"] = extracted_concepts_string
df

Unnamed: 0,text,extracted_needs
0,nice and easy but folder create function be no...,availab
1,no don t install this in your mobile not at al...,mobil
2,very speedy app easy to use app that be quick ...,effecti
3,I absolutely love this app I have try other ap...,availab
4,I be happy with the app but I have one problem...,secur
...,...,...
1045,hi william could you try to access google driv...,access
1046,easy access and compatible within various rang...,"access,compatib"
1047,hi we be very sorry for the trouble first plea...,availab
1048,timing be everything in this game and follow m...,reliab


<h3> User categories extraction </h3>

In [54]:
pattern = [x.lower() for x in gaz_user["user"].tolist()]
pattern = list(set(pattern))
# Initialize class instance
tre = TRE()

# Add word(s)
tre = TRE(*pattern)

# Create regex pattern from the trie
regex_for_words = tre.regex()
regex_for_words_with_b = f'\\b{regex_for_words}\\b'

In [55]:
extracted_concepts = []
extracted_concepts_string = []
for i in df["text"]:
    if type(i) == float:
        found_concept = []
    else:
        found_concept = re.findall(regex_for_words, i)
    extracted_concepts_string.append(",".join(list(set(found_concept))))
    extracted_concepts.append(found_concept)

In [57]:
df["extracted_users"] = extracted_concepts_string
df.head(5)

Unnamed: 0,text,extracted_needs,extracted_users
0,nice and easy but folder create function be no...,availab,man
1,no don t install this in your mobile not at al...,mobil,man
2,very speedy app easy to use app that be quick ...,effecti,
3,I absolutely love this app I have try other ap...,availab,
4,I be happy with the app but I have one problem...,secur,


In [None]:
# output
df.to_excel(r"..\data\lexicon_output.xlsx")