In [10]:
from nltk.corpus import stopwords, wordnet
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
import re
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


In [40]:
# Read the article and cut it into sentence-like pieces at every '.'.
# `corpus` and `original` are bound to the very same list object here.
with open('uiux_reason.txt') as file:
    corpus = original = file.read().split('.')
corpus[0]


'User experience (UX) and user interface (UI) designers are in high demand because they play a critical role in how people interact with technology'

### Text Cleaning

In [11]:
# English stopword list from NLTK, held as a set; remove_stopwords tests
# each lowercased word for membership in it.
stop = set(stopwords.words('english'))

# Remove Punctuation


def remove_punctuation(text):
    """Strip markup and punctuation from ``text`` and tidy its spacing.

    The substitutions run in order:
      1. anything shaped like an ``<...>`` tag is deleted;
      2. common sentence punctuation (``. ? ! , ; : -``) becomes a space so
         the words on either side stay separate;
      3. every remaining character that is neither a word character nor
         whitespace is dropped;
      4. runs of spaces are collapsed to a single space.

    Returns the result with leading and trailing whitespace removed.
    """
    substitutions = (
        (r'<.*?>', ''),
        (r"[.?!,;:-]", " "),
        (r'[^\w\s]', ""),
        (' +', " "),
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Remove Stopwords


def remove_stopwords(text):
    """Lowercase ``text`` and drop every word found in the ``stop`` set.

    Words are split on whitespace and the survivors are joined with single
    spaces. If nothing survives (the text is empty or consists only of
    stopwords), the input is returned exactly as given, without lowercasing.
    """
    kept = []
    for token in text.split():
        lowered = token.lower()
        if lowered not in stop:
            kept.append(lowered)
    return " ".join(kept) if kept else text


# Get word Tag

def get_wordnet_pos(tag):
    """Map a POS tag string to the matching WordNet part-of-speech constant.

    The decision is made on the tag's leading letter: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb. Any other tag yields
    ``None`` so the caller can pick its own default.
    """
    prefix_to_constant = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, constant_name in prefix_to_constant:
        if tag.startswith(prefix):
            # Looked up only on a match, mirroring the original attribute access.
            return getattr(wordnet, constant_name)
    return None

# Word lemmatization


def lemmatize_sentence(sentence):
    """Lemmatize every token of ``sentence`` and return them space-joined.

    The sentence is tokenized and POS-tagged; each token is then lemmatized
    with the WordNet part of speech derived from its tag. Tokens whose tag
    has no WordNet equivalent are treated as nouns.
    """
    tagged_tokens = pos_tag(word_tokenize(sentence))
    lemmatizer = WordNetLemmatizer()

    # Lemmatization: fall back to NOUN when get_wordnet_pos returns None.
    return ' '.join(
        lemmatizer.lemmatize(token, pos=get_wordnet_pos(tag_name) or wordnet.NOUN)
        for token, tag_name in tagged_tokens
    )


def sentence_processing(sentence):
    """Clean one sentence: strip punctuation, drop stopwords, then lemmatize.

    Returns the processed sentence as a single string.
    """
    pipeline = (remove_punctuation, remove_stopwords, lemmatize_sentence)
    for step in pipeline:
        sentence = step(sentence)
    return sentence


In [24]:
# Build the cleaned corpus from the untouched `original` sentences instead of
# from `corpus` itself, so re-running this cell never feeds already processed
# text through the cleaning pipeline a second time. On a fresh kernel the
# result is the same, because `corpus` starts out as an alias of `original`.
corpus = [sentence_processing(sentence) for sentence in original]
corpus[0]


'user experience ux user interface ui designer high demand play critical role people interact technology'

### Summarization

##### Vectorize Sentences with TF-IDF Scores

In [13]:
vectorize = CountVectorizer()
transformer = TfidfTransformer()

# Use the TF-IDF score of each word to build the sentence vectors
# (sentences x unique words), stored as a plain list of lists.
tf_result = transformer.fit_transform(vectorize.fit_transform(corpus))
tf_result = tf_result.toarray().tolist()


In [14]:
# Representative score of a sentence: its average cosine similarity with all sentence vectors
def scoring(main, all):
    """Return the mean cosine similarity between ``main`` and each row of ``all``.

    ``main`` is a single vector and ``all`` a sequence of vectors. The result
    keeps the (1, 1) array shape produced by ``cosine_similarity``, so callers
    read the scalar with ``[0][0]``. An empty ``all`` raises
    ``ZeroDivisionError``.
    """
    query = np.array(main).reshape(1, -1)
    rows = np.array(all)

    # The built-in sum starts at 0 and adds the (1, 1) similarity arrays in row order.
    total = sum(cosine_similarity(query, row.reshape(1, -1)) for row in rows)
    return total / len(rows)


# List with Representative Score from highest to lowest
def get_Score_refer(corpus, tf_result):
    """Rank sentences by how representative they are of the whole corpus.

    Every vector in ``tf_result`` is scored with ``scoring`` against all
    vectors. Returns a list of ``(score, vector, sentence)`` tuples ordered
    by score, highest first; entries with equal scores keep their input order.
    """
    scores = [scoring(vector, tf_result)[0][0] for vector in tf_result]
    return sorted(
        zip(scores, tf_result, corpus),
        key=lambda entry: entry[0],
        reverse=True,
    )


In [38]:

# Rank every sentence by its average similarity to the whole document. Each
# entry of score_refer is (score, tfidf_vector, processed_text), best first.
score_refer = get_Score_refer(corpus, tf_result)

# Seed the summary with the top-ranked sentence. The loop below also visits
# that sentence (i == 0) and compares it against itself.
summa_list = [score_refer[0][2]]
versus = [np.array(score_refer[0][1])]
topics = 0
for i in range(len(corpus)):
    # Stop once `topics` exceeds 3, i.e. after four sentences have been added.
    if topics > 3:
        break

    present_chosen = np.array(score_refer[i][1]).reshape(1, -1)

    # Average similarity between this candidate and everything chosen so far.
    qual_score = 0
    for former_chosen in versus:
        former_chosen = np.array(former_chosen).reshape(1, -1)
        qual_score += cosine_similarity(present_chosen, former_chosen)
    qual_score = qual_score / len(versus)

    # Reject the candidate when it is too similar to the sentences already chosen.
    if qual_score[0][0] <= score_refer[i][0] and qual_score[0][0] <= 0.05:
        text = score_refer[i][2]
        # list.index returns the first match, so identical processed sentences
        # all map back to the first matching entry of `original`.
        original_text = original[corpus.index(text)]

        # The f-string has already interpolated `text`; calling .format() on the
        # result would raise or garble the output if the text contained braces.
        print(
            f'Text :\033[0;33;40m {text}\033[0m',
            f'Original Text: {original_text}',
            f'Similarity of All : \033[0;34;40m{round(score_refer[i][0],4)}\033[0m',
            f'Qulified Score : {qual_score[0][0]}',
            '',
            sep='\n')

        summa_list.append(score_refer[i][2])
        versus.append(score_refer[i][1])
        topics += 1


Text :[0;33;40m  They design how the product looks, making sure that it is visually appealing and easy to navigate[0m
Original Text:  They design how the product looks, making sure that it is visually appealing and easy to navigate
Similarity of All : [0;34;40m0.0666[0m
Qulified Score : 0.0

Text :[0;33;40m  They take into account how users interact with a product, what they need and want from it, and how they would like it to work[0m
Original Text:  They take into account how users interact with a product, what they need and want from it, and how they would like it to work
Similarity of All : [0;34;40m0.0666[0m
Qulified Score : 0.04046643076267294

Text :[0;33;40m  In addition, they are able to communicate with developers to ensure that the final product meets the requirements of both the users and the business[0m
Original Text:  In addition, they are able to communicate with developers to ensure that the final product meets the requirements of both the users and the busines