In [None]:
from nltk.corpus import stopwords, wordnet
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
import re
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


In [None]:
# Read the whole text file, then cut it into fragments at every '.'.
with open('uiux_reason.txt') as file:
    raw_text = file.read()
original = raw_text.split('.')

# Both names refer to the same list at this point.
corpus = original
corpus[0]


In [None]:
# NLTK's English stop-word list, held as a set for the membership checks in remove_stopwords.
stop = set(stopwords.words('english'))

# Remove Punctuation


def remove_punctuation(text):
    """Strip markup and punctuation from ``text`` and normalise its spacing.

    The substitutions run in this order:
      1. delete anything shaped like an ``<...>`` tag;
      2. turn ``. ? ! , ; : -`` into spaces so neighbouring words stay apart;
      3. delete every remaining character that is neither a word character
         nor whitespace;
      4. squeeze runs of spaces into a single space.

    Leading and trailing whitespace is trimmed from the result.
    """
    substitutions = (
        (r'<.*?>', ''),
        (r"[.?!,;:-]", " "),
        (r'[^\w\s]', ""),
        (' +', " "),
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Remove Stopwords


def remove_stopwords(text):
    """Lower-case the words of ``text`` and drop those found in ``stop``.

    Returns the surviving words joined by single spaces. When no word
    survives (including empty input), ``text`` is returned unchanged.
    """
    kept = []
    for token in text.split():
        lowered = token.lower()
        if lowered not in stop:
            kept.append(lowered)
    return " ".join(kept) if kept else text


# Get word Tag

def get_wordnet_pos(tag):
    """Map a part-of-speech tag to the matching WordNet POS constant.

    Only the start of ``tag`` is inspected: ``J`` -> adjective, ``V`` -> verb,
    ``N`` -> noun, ``R`` -> adverb. Any other tag yields ``None``.
    """
    prefix_to_attr = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attr_name in prefix_to_attr:
        if tag.startswith(prefix):
            return getattr(wordnet, attr_name)
    return None

# Word lemmatization


def lemmatize_sentence(sentence):
    """Tokenize, POS-tag and lemmatize ``sentence``.

    Each token is lemmatized with the WordNet POS derived from its tag;
    tokens whose tag has no WordNet equivalent are treated as nouns.
    Returns the lemmas joined by single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    tagged_tokens = pos_tag(word_tokenize(sentence))
    return ' '.join(
        # Lemmatization; fall back to NOUN when the tag maps to nothing.
        lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag) or wordnet.NOUN)
        for word, tag in tagged_tokens
    )


def sentence_processing(sentence):
    """Run the full clean-up pipeline on one sentence.

    Order matters: punctuation removal, then stop-word removal, then
    lemmatization.
    """
    without_punctuation = remove_punctuation(sentence)
    without_stopwords = remove_stopwords(without_punctuation)
    return lemmatize_sentence(without_stopwords)

In [None]:
# Always pre-process from the untouched `original` fragments so this cell is
# idempotent: re-running it no longer feeds already-processed text back through
# the pipeline, and `corpus[i]` stays aligned with `original[i]`.
corpus = [sentence_processing(sentence) for sentence in original]
corpus[0]

In [None]:
from cmath import nan
from gensim.models import Word2Vec,KeyedVectors
# Load the pretrained GoogleNews word2vec vectors from the binary file
# (`limit` caps how many vectors are read).
model = KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin.gz',
    binary=True,
    limit=100000,
)

# Split every processed sentence into its whitespace-separated tokens.
vec_corpus = [processed.split() for processed in corpus]

def sentence2vec(model, sentence, selfTrain=False):
    """Average the word vectors of ``sentence`` into one sentence vector.

    Parameters
    ----------
    model : mapping-like
        Anything supporting ``model[word]`` that raises ``KeyError`` for
        unknown words (e.g. gensim ``KeyedVectors``).
    sentence : iterable of str
        The tokens of one sentence.
    selfTrain : bool, optional
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    numpy.ndarray or float
        The element-wise mean of the vectors of all known words, or NaN when
        no word of ``sentence`` is known to ``model`` (including an empty
        sentence).
    """
    vectors = []
    for word in sentence:
        try:
            vectors.append(model[word])
        except KeyError:
            # Out-of-vocabulary word: skip it. Any other error is a real
            # problem and is allowed to propagate instead of being hidden.
            continue

    if not vectors:
        # Same result the average of an empty array would give, but explicit
        # and without NumPy's "mean of empty slice" warning.
        return np.nan

    return np.average(np.array(vectors), axis=0)

# One averaged vector per processed sentence.
vec_vec = [sentence2vec(model, tokens) for tokens in vec_corpus]

# Drop the last entry *before* converting to an array. The final fragment
# (whatever follows the last '.') is presumably empty, in which case
# sentence2vec gives NaN rather than a vector; converting that ragged list
# first raises ValueError on recent NumPy releases.
Word2vec_google_matrix = np.array(vec_vec[:-1])
Word2vec_google_matrix

In [None]:
# Representative score of a sentence: its average cosine similarity to all vectors
def scoring(main, all):
    """Return the mean cosine similarity between ``main`` and each item of ``all``.

    ``main`` is reshaped to a single row and every element of ``all`` is
    compared against it one at a time. The result keeps the 1x1 array shape
    produced by ``cosine_similarity``, so callers read it as ``result[0][0]``.
    """
    query = np.array(main).reshape(1, -1)
    candidates = np.array(all)

    total = 0
    for candidate in candidates:
        total += cosine_similarity(query, candidate.reshape(1, -1))

    return total / len(candidates)


# Sentences ranked by representative score, highest first
def get_matrix(corpus, matrix):
    """Score every vector in ``matrix`` and return the ranked results.

    Each vector is scored with ``scoring`` against the whole ``matrix``.
    The return value is a list of ``(score, vector, sentence)`` tuples,
    pairing vectors with ``corpus`` entries by position and sorted by score
    in descending order.
    """
    scores = [
        scoring(np.array(vector).reshape(1, -1), matrix)[0][0]
        for vector in matrix
    ]
    ranked = sorted(
        zip(scores, matrix, corpus),
        key=lambda entry: entry[0],
        reverse=True,
    )
    return ranked

In [None]:
# Rank every sentence by how representative it is of the whole text.
score_refer = get_matrix(corpus, Word2vec_google_matrix)

# Selection settings (previously inline magic numbers).
MAX_EXTRA_TOPICS = 4        # sentences accepted in addition to the top-ranked seed
SIMILARITY_THRESHOLD = 0.5  # accept only if mean similarity to the picks is <= this

# Seed the selection with the most representative sentence.
summa_list = [score_refer[0][2]]
versus = [np.array(score_refer[0][1])]
topics = 0

for i in range(len(score_refer)):
    if topics >= MAX_EXTRA_TOPICS:
        break

    present_chosen = np.array(score_refer[i][1]).reshape(1, -1)
    # Mean cosine similarity between this candidate and everything picked so far.
    qual_score = scoring(present_chosen, versus)

    # Accept only candidates that are not too similar to the earlier picks.
    # (The seed itself is visited at i == 0 and is compared with its own vector.)
    if qual_score[0][0] <= SIMILARITY_THRESHOLD:
        text = score_refer[i][2]
        # NOTE: list.index returns the first match, so identical processed
        # sentences all map back to the first matching original fragment.
        original_text = original[corpus.index(text)]

        print(
            f'Text :\033[0;33;40m {text}\033[0m',
            f'Original Text: {original_text}',
            f'Similarity of All : \033[0;34;40m{round(score_refer[i][0],4)}\033[0m',
            f'Qulified Score : {qual_score[0][0]}',
            '',
            sep='\n')

        summa_list.append(score_refer[i][2])
        versus.append(score_refer[i][1])
        topics += 1