# Keywords extraction

In [50]:
import nltk

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sveta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Sveta\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [2]:
from rake_nltk import Rake

In [3]:
r = Rake()

r.extract_keywords_from_text("Motorcyclist goes for a ride")

r.get_ranked_phrases()

['motorcyclist goes', 'ride']

In [40]:
from rake_nltk import Metric, Rake

r = Rake(language='english')

# If you want to provide your own set of stop words and punctuations to
r = Rake()

# If you want to control the metric for ranking. Paper uses d(w)/f(w) as the
# metric. You can use this API with the following metrics:
# 1. d(w)/f(w) (Default metric) Ratio of degree of word to its frequency.
# 2. d(w) Degree of word only.
# 3. f(w) Frequency of word only.

r = Rake(ranking_metric=Metric.DEGREE_TO_FREQUENCY_RATIO)
r = Rake(ranking_metric=Metric.WORD_DEGREE)
r = Rake(ranking_metric=Metric.WORD_FREQUENCY)

# If you want to control the max or min words in a phrase, for it to be
# considered for ranking you can initialize a Rake instance as below:

r = Rake(min_length=1, max_length=2)

r.extract_keywords_from_text("I wish I'd discovered this show earlier")

r.get_ranked_phrases()

['show earlier', 'wish', 'discovered']

# Lemmatization

In [91]:
from nltk.stem import WordNetLemmatizer 
  
lemmatizer = WordNetLemmatizer() 
  
print("rocks :", lemmatizer.lemmatize("rocks")) 
print("corpora :", lemmatizer.lemmatize("corpora")) 
  
# a denotes adjective in "pos" 
print("better :", lemmatizer.lemmatize("better", pos ="a"))

print("Random word :", lemmatizer.lemmatize("being", pos = 'r'))

rocks : rock
corpora : corpus
better : good
Random word : being


In [101]:
def preprocess_lemmatize(text):
    
    text=text.lower()
    
    # remove special characters and digits
    text=re.sub("(\\d|\\W)+"," ",text)
    
    words = text.split()
    
    text = ""
    
    for word in words:
        
        for pos_tag in ['n', 'v', 'a', 'r']:
            
            lemmatized_word = lemmatizer.lemmatize(word, pos_tag)

            if lemmatized_word != word:
                # print(lemmatized_word)
                break
                
        text += lemmatized_word + " "
    
    return text

In [144]:
preprocess_lemmatize("I am an expert at master waiting.")

'i be an expert at master wait '

# TF-IDF

## Loading data

In [165]:
import mysql.connector

connection = mysql.connector.connect(host='localhost',
                                     database='9gag',
                                     user='root',
                                     password='root',)
if connection.is_connected():
    db_Info = connection.get_server_info()
    print("Connected to MySQL Server version ", db_Info)
    cursor = connection.cursor()
    cursor.execute("select database();")
    record = cursor.fetchone()
    print("You're connected to database: ", record)


Connected to MySQL Server version  8.0.16
You're connected to database:  ('9gag',)


In [2]:
import pandas as pd

sql_query = "SELECT id, title FROM actual_post"

df_idf=pd.read_sql(sql_query, connection)
 
# print schema
print("Schema:\n\n",df_idf.dtypes)
print("Number of questions,columns=",df_idf.shape)

Schema:

 id       object
title    object
dtype: object
Number of questions,columns= (6038, 2)


In [3]:
df_idf

Unnamed: 0,id,title
0,a0Nv0Pd,Hat trick
1,a0Nv1bq,What happened?
2,a0Nv1jn,Do you miss the Neinties ?
3,a0Nv3ed,A different perspective
4,a0Nv4Yv,One of the worst moments of pc building
...,...,...
6033,aZyqYGX,Doomguy teaches Isabelle his techniques
6034,aZyqYm9,Clear lemon pie
6035,aZyqYNn,This entire thing was made with color pencils!
6036,aZyqYVn,Quarantine hacks gone wrong


## Preprocessing

In [103]:
import re
def preprocess(text):
    
    text=text.lower()
    
    # remove special characters and digits
    text=re.sub("(\\d|\\W)+"," ",text)
    
    return text

In [104]:
df_idf['text'] = df_idf['title']
df_idf['text'] = df_idf['text'].apply(lambda x: preprocess_lemmatize(x))

df_idf['text'][20]

'gager detect '

## Create vocabulary and word count

In [128]:
from sklearn.feature_extraction.text import CountVectorizer
import re

def read_stopwords(stopwords_file='english_stopwords.txt'):
    
    with open(stopwords_file, 'r', encoding="utf-8") as f:
        stopwords = f.readlines()
        stop_set = set(m.strip() for m in stopwords)
        return frozenset(stop_set)

stopwords = read_stopwords()
    
docs=df_idf['text'].tolist()

cv=CountVectorizer(max_df=0.25,stop_words=stopwords)
# cv=CountVectorizer(max_df=0.25,stop_words='english') # use max_features=<limit> to limit vocabulary size
word_count_vector=cv.fit_transform(docs)

word_count_vector.shape

(6038, 5771)

In [129]:
list(cv.vocabulary_.keys())[:10]

['hat',
 'trick',
 'happen',
 'miss',
 'neinties',
 'different',
 'perspective',
 'one',
 'bad',
 'moment']

In [130]:
from sklearn.feature_extraction.text import TfidfTransformer
 
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(word_count_vector)

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [131]:
docs_test=df_idf['text'].tolist()

In [132]:
def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
 
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    
    if topn == -1:
        topn = len(sorted_items)
    
    sorted_items = sorted_items[:topn]
 
    score_vals = []
    feature_vals = []
    
    for idx, score in sorted_items:
        
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])
 
    results= {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]]=score_vals[idx]
    
    return results

In [133]:
feature_names=cv.get_feature_names()
 
doc=docs_test[20]
doc = "I wish I'd discovered this show earlier"
 
tf_idf_vector=tfidf_transformer.transform(cv.transform([doc]))
 
sorted_items=sort_coo(tf_idf_vector.tocoo())
 
keywords=extract_topn_from_vector(feature_names,sorted_items,-1)
 
print("\nTitle:")
print(doc)
print("\nKeywords list:")
for k in keywords:
    print(k,keywords[k])


Title:
I wish I'd discovered this show earlier

Keywords list:
wish 0.761
show 0.648


In [192]:
def extract_keywords_tfidf(original, lemmatized):
    
    tf_idf_vector=tfidf_transformer.transform(cv.transform([lemmatized]))
 
    sorted_items=sort_coo(tf_idf_vector.tocoo())
 
    keywords=extract_topn_from_vector(feature_names,sorted_items,-1)
    
    # print("\nTitle:")
    # print(original)
    # print("\nKeywords list:")
    # for k in keywords:
        # print(k,keywords[k])
        
    return list(keywords.keys())

In [193]:
title = "He strikes again"

lemmatized_title = preprocess_lemmatize(title)

extract_keywords_tfidf(title, lemmatized_title)

['strike']

In [113]:
if (connection.is_connected()):
    cursor.close()
    connection.close()
    print("MySQL connection is closed")

MySQL connection is closed


# Extract keywords

In [206]:
import csv

with open('keywords.csv', 'w', newline='') as keywords_file:
    
    writer = csv.writer(keywords_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)  
    
    sql_query = "SELECT id, title FROM actual_post"

    df_titles=pd.read_sql(sql_query, connection)
    
    most_keywords = 0
    
    for index, row in df_titles.iterrows():
        # print(row['id'], row['title'])
        
        title = row['title']

        lemmatized_title = preprocess_lemmatize(title)

        keywords = extract_keywords_tfidf(title, lemmatized_title)
        
        if len(keywords) > most_keywords:
            most_keywords = len(keywords)
        
        write_data = [row['id']]
        write_data.append(row['title'])
        write_data.append(len(keywords))
        write_data.extend(keywords)
        
        try: 
            writer.writerow(write_data)
        except:
            print("Can't write :", write_data)
        
    print("Most keywords :", most_keywords)

Can't write : ['a0NvBXQ', 'Michiroみちろ', 1, 'michiroみちろ']
Can't write : ['a2WEYQY', 'Hii❤', 1, 'hii']
Can't write : ['a7WvRDr', 'Kĕniæl, the Overseer', 2, 'overseer', 'kĕniæl']
Can't write : ['aBm9Adx', "ಠ_ಠ...When I read this news.I don't think they're UFOs...ಠ_ಠ", 5, 'ಠ_ಠ', 'ufo', 'read', 'news', 'think']
Can't write : ['aD4RNr9', 'Friends furever ♡', 2, 'furever', 'friend']
Can't write : ['aGdKm87', 'ステイホーム', 1, 'ステイホーム']
Can't write : ['aj9Qnr8', '¯\\_(ツ)_/¯ Well', 1, 'well']
Can't write : ['an4ePOL', 'Well ¯\\_(ツ)_/¯', 1, 'well']
Can't write : ['aNgm7Qv', 'Quarintine ️', 1, 'quarintine']
Can't write : ['aqn40Lp', 'The new suprime leader is kinda cute ❤', 5, 'suprime', 'kinda', 'cute', 'leader', 'new']
Can't write : ['aXgZ6dD', 'ᵒʰ ⁿᵒ', 2, 'ⁿᵒ', 'ᵒʰ']
Can't write : ['aYyj0qN', 'The best of friends ♡', 2, 'friend', 'best']
Most keywords : 29


In [197]:
df_titles['title']

0                                            Hat trick
1                                       What happened?
2                           Do you miss the Neinties ?
3                              A different perspective
4              One of the worst moments of pc building
                             ...                      
6033           Doomguy teaches Isabelle his techniques
6034                                   Clear lemon pie
6035    This entire thing was made with color pencils!
6036                       Quarantine hacks gone wrong
6037                     Dunder-Mifflin this is Brazil
Name: title, Length: 6038, dtype: object