# Keywords extraction

In [18]:
import nltk
import re

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sveta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Sveta\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Sveta\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
from rake_nltk import Rake

In [3]:
r = Rake()

r.extract_keywords_from_text("Motorcyclist goes for a ride")

r.get_ranked_phrases()

['motorcyclist goes', 'ride']

In [40]:
from rake_nltk import Metric, Rake

r = Rake(language='english')

# If you want to provide your own set of stop words and punctuations to
r = Rake()

# If you want to control the metric for ranking. Paper uses d(w)/f(w) as the
# metric. You can use this API with the following metrics:
# 1. d(w)/f(w) (Default metric) Ratio of degree of word to its frequency.
# 2. d(w) Degree of word only.
# 3. f(w) Frequency of word only.

r = Rake(ranking_metric=Metric.DEGREE_TO_FREQUENCY_RATIO)
r = Rake(ranking_metric=Metric.WORD_DEGREE)
r = Rake(ranking_metric=Metric.WORD_FREQUENCY)

# If you want to control the max or min words in a phrase, for it to be
# considered for ranking you can initialize a Rake instance as below:

r = Rake(min_length=1, max_length=2)

r.extract_keywords_from_text("I wish I'd discovered this show earlier")

r.get_ranked_phrases()

['show earlier', 'wish', 'discovered']

# Lemmatization

In [2]:
from nltk.stem import WordNetLemmatizer 
  
lemmatizer = WordNetLemmatizer() 
  
print("rocks :", lemmatizer.lemmatize("rocks")) 
print("corpora :", lemmatizer.lemmatize("corpora")) 
  
# a denotes adjective in "pos" 
print("better :", lemmatizer.lemmatize("better", pos ="a"))

print("Random word :", lemmatizer.lemmatize("being", pos = 'r'))

rocks : rock
corpora : corpus
better : good
Random word : being


In [11]:
from nltk.corpus import wordnet as wn

def is_noun(tag):
    return tag in ['NN', 'NNS', 'NNP', 'NNPS']


def is_verb(tag):
    return tag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']


def is_adverb(tag):
    return tag in ['RB', 'RBR', 'RBS']


def is_adjective(tag):
    return tag in ['JJ', 'JJR', 'JJS']


def penn_to_wn(tag):
    if is_adjective(tag):
        return wn.ADJ
    elif is_noun(tag):
        return wn.NOUN
    elif is_adverb(tag):
        return wn.ADV
    elif is_verb(tag):
        return wn.VERB
    return None

In [29]:
from nltk.tokenize import word_tokenize

text = word_tokenize("Hello welcome to the world of to learn Categorizing and POS Tagging with NLTK and Python")

tagged = nltk.pos_tag(text)

for tag in tagged:
    print(tag)

('Hello', 'NNP')
('welcome', 'NN')
('to', 'TO')
('the', 'DT')
('world', 'NN')
('of', 'IN')
('to', 'TO')
('learn', 'VB')
('Categorizing', 'NNP')
('and', 'CC')
('POS', 'NNP')
('Tagging', 'NNP')
('with', 'IN')
('NLTK', 'NNP')
('and', 'CC')
('Python', 'NNP')


In [61]:
def preprocess_lemmatize(text):
    
    text=text.lower()
    
    # remove special characters and digits
    text=re.sub("(\\d|\\W)+"," ",text)
    
    # words = text.split()
    
    tokenized = word_tokenize(text)
    
    tagged_words = nltk.pos_tag(tokenized)
    # print(tagged_words)
    
    text = ""
    
    for tagged_word in tagged_words:
        
        pos_tag = penn_to_wn(tagged_word[1])
        
        # print(pos_tag)
        
        if pos_tag == None:
            text += tagged_word[0] + " "
            continue
        
        lemmatized_word = lemmatizer.lemmatize(tagged_word[0], pos_tag)
        
        text += lemmatized_word + " "
    
    return text

In [62]:
preprocess_lemmatize("I was an expert at master waiting.")

'i be an expert at master wait '

# TF-IDF

## Loading data

In [63]:
import mysql.connector

connection = mysql.connector.connect(host='localhost',
                                     database='9gag',
                                     user='root',
                                     password='root',)
if connection.is_connected():
    db_Info = connection.get_server_info()
    print("Connected to MySQL Server version ", db_Info)
    cursor = connection.cursor()
    cursor.execute("select database();")
    record = cursor.fetchone()
    print("You're connected to database: ", record)


Connected to MySQL Server version  8.0.16
You're connected to database:  ('9gag',)


In [64]:
import pandas as pd

sql_query = "SELECT id, title FROM actual_post"

df_idf=pd.read_sql(sql_query, connection)
 
# print schema
print("Schema:\n\n",df_idf.dtypes)
print("Number of questions,columns=",df_idf.shape)

Schema:

 id       object
title    object
dtype: object
Number of questions,columns= (6038, 2)


In [65]:
df_idf

Unnamed: 0,id,title
0,a0Nv0Pd,Hat trick
1,a0Nv1bq,What happened?
2,a0Nv1jn,Do you miss the Neinties ?
3,a0Nv3ed,A different perspective
4,a0Nv4Yv,One of the worst moments of pc building
...,...,...
6033,aZyqYGX,Doomguy teaches Isabelle his techniques
6034,aZyqYm9,Clear lemon pie
6035,aZyqYNn,This entire thing was made with color pencils!
6036,aZyqYVn,Quarantine hacks gone wrong


## Preprocessing

In [66]:
import re
def preprocess(text):
    
    text=text.lower()
    
    # remove special characters and digits
    text=re.sub("(\\d|\\W)+"," ",text)
    
    return text

In [67]:
df_idf['text'] = df_idf['title']
df_idf['text'] = df_idf['text'].apply(lambda x: preprocess_lemmatize(x))

df_idf['text'][20]

'gager detect '

## Create vocabulary and word count

In [68]:
from sklearn.feature_extraction.text import CountVectorizer
import re

def read_stopwords(stopwords_file='english_stopwords.txt'):
    
    with open(stopwords_file, 'r', encoding="utf-8") as f:
        stopwords = f.readlines()
        stop_set = set(m.strip() for m in stopwords)
        return frozenset(stop_set)

stopwords = read_stopwords()
    
docs=df_idf['text'].tolist()

cv=CountVectorizer(max_df=0.25,stop_words=stopwords)
# cv=CountVectorizer(max_df=0.25,stop_words='english') # use max_features=<limit> to limit vocabulary size
word_count_vector=cv.fit_transform(docs)

word_count_vector.shape

(6038, 6009)

In [69]:
list(cv.vocabulary_.keys())[:10]

['hat',
 'trick',
 'happen',
 'miss',
 'neinties',
 'different',
 'perspective',
 'one',
 'bad',
 'moment']

In [70]:
from sklearn.feature_extraction.text import TfidfTransformer
 
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(word_count_vector)

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [71]:
docs_test=df_idf['text'].tolist()

In [72]:
def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
 
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    
    if topn == -1:
        topn = len(sorted_items)
    
    sorted_items = sorted_items[:topn]
 
    score_vals = []
    feature_vals = []
    
    for idx, score in sorted_items:
        
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])
 
    results= {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]]=score_vals[idx]
    
    return results

In [73]:
feature_names=cv.get_feature_names()
 
doc=docs_test[20]
doc = "I wish I'd discovered this show earlier"
 
tf_idf_vector=tfidf_transformer.transform(cv.transform([doc]))
 
sorted_items=sort_coo(tf_idf_vector.tocoo())
 
keywords=extract_topn_from_vector(feature_names,sorted_items,-1)
 
print("\nTitle:")
print(doc)
print("\nKeywords list:")
for k in keywords:
    print(k,keywords[k])


Title:
I wish I'd discovered this show earlier

Keywords list:
earlier 0.659
wish 0.572
show 0.487


In [74]:
def extract_keywords_tfidf(original, lemmatized):
    
    tf_idf_vector=tfidf_transformer.transform(cv.transform([lemmatized]))
 
    sorted_items=sort_coo(tf_idf_vector.tocoo())
 
    keywords=extract_topn_from_vector(feature_names,sorted_items,-1)
    
    # print("\nTitle:")
    # print(original)
    # print("\nKeywords list:")
    # for k in keywords:
        # print(k,keywords[k])
        
    return list(keywords.keys())

In [78]:
def extract_keywords_tfidf_as_map(original, lemmatized):
    
    tf_idf_vector=tfidf_transformer.transform(cv.transform([lemmatized]))
 
    sorted_items=sort_coo(tf_idf_vector.tocoo())
 
    keywords=extract_topn_from_vector(feature_names,sorted_items,-1)
    
    # print("\nTitle:")
    # print(original)
    # print("\nKeywords list:")
    # for k in keywords:
        # print(k,keywords[k])
        
    return keywords

In [90]:
def map_keywords(keywords):
    
    keywords_with_scores = []
    
    for key, value in keywords.items():
        keywords_with_scores.append(key + '|' + str(value))
    
    return keywords_with_scores

In [92]:
title = "I wish I'd discovered this show earlier"

lemmatized_title = preprocess_lemmatize(title)

extract_keywords_tfidf(title, lemmatized_title)
kw = extract_keywords_tfidf_as_map(title, lemmatized_title)

lista = map_keywords(kw)

print(lista)

['earlier|0.559', 'discover|0.531', 'wish|0.485', 'show|0.413']


In [113]:
if (connection.is_connected()):
    cursor.close()
    connection.close()
    print("MySQL connection is closed")

MySQL connection is closed


# Extract keywords

In [94]:
import csv

with open('keywords_v2.csv', 'w', newline='') as keywords_file:
    
    writer = csv.writer(keywords_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)  
    
    sql_query = "SELECT id, title FROM actual_post"

    df_titles=pd.read_sql(sql_query, connection)
    
    most_keywords = 0
    
    for index, row in df_titles.iterrows():
        # print(row['id'], row['title'])
        
        title = row['title']

        lemmatized_title = preprocess_lemmatize(title)

        keywords = extract_keywords_tfidf_as_map(title, lemmatized_title)
        
        if len(keywords) > most_keywords:
            most_keywords = len(keywords)
        
        write_data = [row['id']]
        write_data.append(row['title'])
        write_data.append(len(keywords))
        write_data.extend(map_keywords(keywords))
        
        try: 
            writer.writerow(write_data)
        except:
            print("Can't write :", write_data)
        
    print("Most keywords :", most_keywords)

Can't write : ['a0NvBXQ', 'Michiroみちろ', 1, 'michiroみちろ|1.0']
Can't write : ['a2WEYQY', 'Hii❤', 1, 'hii|1.0']
Can't write : ['a7WvRDr', 'Kĕniæl, the Overseer', 2, 'overseer|0.707', 'kĕniæl|0.707']
Can't write : ['aBm9Adx', "ಠ_ಠ...When I read this news.I don't think they're UFOs...ಠ_ಠ", 5, 'ಠ_ಠ|0.795', 'ufos|0.367', 'read|0.294', 'news|0.29', 'think|0.251']
Can't write : ['aD4RNr9', 'Friends furever ♡', 2, 'furever|0.839', 'friend|0.543']
Can't write : ['aGdKm87', 'ステイホーム', 1, 'ステイホーム|1.0']
Can't write : ['aj9Qnr8', '¯\\_(ツ)_/¯ Well', 1, 'well|1.0']
Can't write : ['an4ePOL', 'Well ¯\\_(ツ)_/¯', 1, 'well|1.0']
Can't write : ['aNgm7Qv', 'Quarintine ️', 1, 'quarintine|1.0']
Can't write : ['aqn40Lp', 'The new suprime leader is kinda cute ❤', 5, 'suprime|0.55', 'kinda|0.494', 'cute|0.419', 'leader|0.416', 'new|0.325']
Can't write : ['aXgZ6dD', 'ᵒʰ ⁿᵒ', 2, 'ⁿᵒ|0.707', 'ᵒʰ|0.707']
Can't write : ['aYyj0qN', 'The best of friends ♡', 2, 'friend|0.717', 'best|0.698']
Most keywords : 28


In [77]:
df_titles['title']

0                                            Hat trick
1                                       What happened?
2                           Do you miss the Neinties ?
3                              A different perspective
4              One of the worst moments of pc building
                             ...                      
6033           Doomguy teaches Isabelle his techniques
6034                                   Clear lemon pie
6035    This entire thing was made with color pencils!
6036                       Quarantine hacks gone wrong
6037                     Dunder-Mifflin this is Brazil
Name: title, Length: 6038, dtype: object

# Finding keywords with highest sum of weigths

In [118]:
with open('keywords_v2.csv', 'r') as keywords_file:
    lines = list(csv.reader(keywords_file))
    
    counter_dict = {}
    weigths_dict = {}
    
    for line in lines:
        words_data = line[3:]
        
        for word_data in words_data:
            word, weigth = word_data.split('|')

            # print(word + ' - ' + weigth)
            
            if word in counter_dict:
                counter_dict[word] += 1
                weigths_dict[word] += float(weigth)
            else:
                counter_dict[word] = 1
                weigths_dict[word] = float(weigth)
        

In [135]:
counter_dict_sorted = {k: v for k, v in sorted(counter_dict.items(), key=lambda item: item[1], reverse=True)}

# print(counter_dict_sorted)

thresh = 20

print(str(thresh) + " most frequent keywords :")

for index, item in enumerate(counter_dict_sorted.items()):
    
    if index == thresh:
        break
    
    print(item)
    # print("'" + item[0] + "', ", end='')

20 most frequent keywords :
('get', 173)
('like', 157)
('go', 154)
('make', 146)
('one', 132)
('quarantine', 121)
('time', 115)
('good', 112)
('know', 107)
('day', 100)
('guy', 98)
('see', 86)
('look', 85)
('new', 77)
('people', 74)
('right', 73)
('old', 71)
('say', 70)
('year', 69)
('still', 65)


In [120]:
weigths_dict_sorted = {k: v for k, v in sorted(weigths_dict.items(), key=lambda item: item[1], reverse=True)}

for index, item in enumerate(weigths_dict_sorted.items()):
    
    if index == thresh:
        break
    
    print(item)

('like', 59.747000000000014)
('get', 59.47200000000001)
('go', 57.97099999999999)
('quarantine', 54.42300000000004)
('know', 52.456)
('one', 50.958000000000006)
('make', 48.033999999999985)
('good', 47.760000000000005)
('time', 46.27599999999999)
('day', 39.937)
('true', 36.94399999999999)
('guy', 36.74299999999999)
('see', 34.21000000000001)
('well', 31.456999999999994)
('look', 31.152000000000008)
('say', 30.077)
('right', 30.036999999999995)
('year', 28.385999999999996)
('new', 27.474999999999994)
('old', 26.935999999999993)


## Relative importance

In [129]:
relative_importance_dict = {}

for index, item in enumerate(counter_dict_sorted.items()):
    
    if item[1] > 10:
        relative_importance_dict[item[0]] = weigths_dict_sorted[item[0]] / item[1]
    

In [130]:
relative_importance_sorted = {k: v for k, v in sorted(relative_importance_dict.items(), key=lambda item: item[1], reverse=True)}

for index, item in enumerate(relative_importance_sorted.items()):
    
    if index == thresh:
        break
    
    print(item)

('hmmm', 0.9589523809523809)
('fact', 0.7760000000000001)
('true', 0.7243921568627448)
('okay', 0.7020000000000001)
('classic', 0.7019230769230768)
('wholesome', 0.7014705882352941)
('doggo', 0.6958461538461538)
('welcome', 0.6926153846153846)
('wtf', 0.6918947368421052)
('title', 0.68875)
('fine', 0.6886666666666665)
('stupid', 0.6856428571428571)
('smile', 0.6851538461538461)
('accurate', 0.6764375)
('legend', 0.6712727272727274)
('russia', 0.6705000000000002)
('lol', 0.6596875)
('worth', 0.6566923076923078)
('wait', 0.6534687500000002)
('bro', 0.6436153846153846)
