In [None]:
import pandas as pd

# Load the BBC dataset; this frame is the corpus the IDF is later learned from.
df_idf = pd.read_csv("Output_BBC.csv")

# Summarise what was loaded: per-column dtypes, then the (rows, columns) shape.
print("Schema:\n\n", df_idf.dtypes)
print("Number of questions,columns=", df_idf.shape)


Schema:

 Link           object
Title          object
Description    object
dtype: object
Number of questions,columns= (1070, 3)


In [None]:
import re
def pre_process(text):
    """Lower-case ``text`` and reduce it to space-separated word tokens.

    Markup-like ``<...>`` spans are first swapped for a `` <> `` placeholder;
    then every run of digits and non-word characters (placeholder included)
    collapses into a single space.  Leading/trailing spaces are not stripped.
    """
    lowered = text.lower()
    # Replace anything that looks like an opening or closing tag.
    without_tags = re.sub("</?.*?>", " <> ", lowered)
    # Squash digits, punctuation and whitespace runs into one space each.
    return re.sub("(\\d|\\W)+", " ", without_tags)

# Build one text field per article from its title and description.  The
# explicit space separator matters: without it the last word of the title
# fuses with the first word of the description (e.g. "attack" + "pupils"
# turns into the bogus vocabulary entry "attackpupils").
df_idf['text'] = df_idf['Title'].astype(str) + " " + df_idf['Description'].astype(str)

# Normalise every document (lower-casing, markup/digit/punctuation removal).
df_idf['text'] = df_idf['text'].apply(pre_process)

# Peek at one cleaned document; .loc avoids chained indexing on the frame.
df_idf.loc[2, 'text']

' we talk to plants they tell us how happy they are vertical farming growing plants indoors on stacked levels in a highly controlled environment allows higher yields show more'

Hmm, it doesn't look very pretty with all the HTML in there, but that's the point. Even in such a mess, we can extract some great stuff out of this. While you could eliminate all code from the text, we will keep the code sections in this tutorial for the sake of simplicity.

## Creating the IDF

### CountVectorizer to create a vocabulary and generate word counts
The next step is to start the counting process. We can use the CountVectorizer to create a vocabulary from all the text in our `df_idf['text']` and generate counts for each row in `df_idf['text']`. The result of the last two lines is a sparse matrix representation of the counts: each column represents a word in the vocabulary, each row represents a document in our dataset, and the values are the word counts. Note that with this representation, the count of a word is 0 if the word did not appear in the corresponding document.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import re

def get_stop_words(stop_file_path):
    """Read a UTF-8 stop-word file (one entry per line) into a frozenset.

    Each line is stripped of surrounding whitespace before being stored, so
    duplicates collapse and a blank line contributes the empty string.
    """
    unique_words = set()
    with open(stop_file_path, 'r', encoding="utf-8") as handle:
        for raw_line in handle:
            unique_words.add(raw_line.strip())
    return frozenset(unique_words)

# Custom stop words, loaded from stopwords.txt.
stopwords=get_stop_words("stopwords.txt")

# One pre-processed string per document.
docs=df_idf['text'].tolist()

# scikit-learn >= 1.2 validates `stop_words` and only accepts 'english', a
# list or None, so the frozenset is converted to a (sorted, deterministic)
# list; older releases accept a list just as well.
# max_df=0.85 drops terms that occur in more than 85% of the documents.
cv=CountVectorizer(max_df=0.85,stop_words=sorted(stopwords))
word_count_vector=cv.fit_transform(docs)

In [None]:
# Shape of the sparse count matrix: (number of documents, vocabulary size).
word_count_vector.shape

(1070, 22988)

In [None]:
# Re-fit with the vocabulary capped at the 10,000 most frequent terms.
# `stop_words` is passed as a list because scikit-learn >= 1.2 rejects other
# collection types (such as a frozenset) during parameter validation.
cv=CountVectorizer(max_df=0.85,stop_words=sorted(stopwords),max_features=10000)
word_count_vector=cv.fit_transform(docs)
word_count_vector.shape

(20000, 10000)

In [None]:
# First ten vocabulary terms, in the mapping's own iteration order.
list(cv.vocabulary_)[:10]

['pupils',
 'data',
 'spread',
 'online',
 'hereford',
 'school',
 'cyber',
 'attackpupils',
 'attack',
 'published']

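We can also get the vocabulary, ordered by feature index, from the vectorizer's feature-name accessor: `get_feature_names_out()` in scikit-learn 1.0 and later (`get_feature_names()` in older releases, removed in 1.2).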

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) replaces it.  .tolist() converts the NumPy slice back
# into plain Python strings for display.
cv.get_feature_names_out()[2000:2015].tolist()



['bid',
 'bide',
 'biden',
 'bidenhow',
 'bidens',
 'bidensnihurivka',
 'bids',
 'bien',
 'bigger',
 'biggerstaff',
 'biggest',
 'biginagi',
 'bigots',
 'bigwigs',
 'bigyn']

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

# Configure the tf-idf transformer, then learn the IDF weights from the
# word-count matrix produced by the CountVectorizer.
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(word_count_vector)

TfidfTransformer()

Let's look at some of the IDF values:

In [None]:
# IDF weights learned by the fit on word_count_vector.
tfidf_transformer.idf_

array([7.28320089, 6.59005371, 7.28320089, ..., 7.28320089, 7.28320089,
       6.87773578])

In [None]:
# Load the documents to extract keywords from.
df_test=pd.read_csv("Output_BBC.csv")

# Join title and description with an explicit space so the last word of the
# title does not fuse with the first word of the description, then apply the
# same pre_process normalisation.
df_test['text'] = df_test['Title'].astype(str) + " " + df_test['Description'].astype(str)
df_test['text'] = df_test['text'].apply(pre_process)

# Parallel lists, all indexed by document position.
docs_test=df_test['text'].tolist()
docs_title=df_test['Title'].tolist()
docs_body=df_test['Description'].tolist()

In [None]:
def sort_coo(coo_matrix):
    """Return ``(column, value)`` pairs of ``coo_matrix``, highest value first.

    Pairs are built from the matrix's ``col`` and ``data`` attributes and
    ordered by value descending; equal values fall back to the column index,
    also descending.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Map the first ``topn`` entries of ``sorted_items`` to rounded scores.

    ``sorted_items`` is a sequence of ``(feature_index, score)`` pairs, already
    in the desired order.  Each index is looked up in ``feature_names`` and the
    score is rounded to three decimals.  The returned dict keeps the order of
    ``sorted_items``.
    """
    top_scores = {}
    for feature_idx, raw_score in sorted_items[:topn]:
        top_scores[feature_names[feature_idx]] = round(raw_score, 3)
    return top_scores

In [None]:
# The feature names only need to be fetched once.  get_feature_names() was
# removed in scikit-learn 1.2; get_feature_names_out() replaces it, and
# .tolist() keeps feature_names a plain list of Python strings.
feature_names=cv.get_feature_names_out().tolist()

# get the document that we want to extract keywords from
doc=docs_test[0]

# generate tf-idf for the given document
tf_idf_vector=tfidf_transformer.transform(cv.transform([doc]))

# sort the tf-idf vector by descending order of scores
sorted_items=sort_coo(tf_idf_vector.tocoo())

# extract only the top n; n here is 10
keywords=extract_topn_from_vector(feature_names,sorted_items,10)

# now print the results
print("\n=====Title=====")
print(docs_title[0])
print("\n=====Body=====")
print(docs_body[0])
print("\n===Keywords===")
for k in keywords:
    print(k,keywords[k])


=====Title=====
Pupils' data spread online in Hereford school cyber attack

=====Body=====
Pupils' data spread online in Hereford school cyber attack
Published
15 hours ago
Share
IMAGE SOURCE,
GETTY IMAGES
Image caption,
Bishop of Hereford's Bluecoat School said it was working with police
A cyber attack at a secondary school led to pupils' information being published online.
West Mercia Police has launched an investigation into the breach on 9 October at Bishop of Hereford's Bluecoat School, in Hampton Dene Road.
The school said it was taking the attack "extremely seriously" and was assisting the force with its inquiries.
"We have been open with our school community," it told BBC Hereford and Worcester.
IMAGE SOURCE,
GOOGLE
Image caption,
The school has said it is assisting police with their investigation into the breach
Supt Ed Williams said: "We're working with the school to establish the information that has been published to ensure any necessary safeguarding measures are put in pl



In [None]:
# put the common code into several methods
def get_keywords(idx, topn=10):
    """Return the top ``topn`` tf-idf keywords for ``docs_test[idx]``.

    Relies on the notebook-level ``cv``, ``tfidf_transformer``, ``docs_test``
    and ``feature_names`` objects.

    Parameters
    ----------
    idx : int
        Position of the document in ``docs_test``.
    topn : int, default 10
        Maximum number of keywords to return.

    Returns
    -------
    dict
        Keyword -> tf-idf score (rounded to three decimals), best first.
    """
    # generate tf-idf for the given document
    tf_idf_vector=tfidf_transformer.transform(cv.transform([docs_test[idx]]))

    # sort the tf-idf vector by descending order of scores
    sorted_items=sort_coo(tf_idf_vector.tocoo())

    # keep only the best `topn` entries
    return extract_topn_from_vector(feature_names,sorted_items,topn)

def print_results(idx,keywords):
    """Print the title, body and keyword scores for document ``idx``.

    Title and body come from the notebook-level ``docs_title`` and
    ``docs_body`` lists; ``keywords`` maps each keyword to its score and is
    printed one "keyword score" pair per line.
    """
    print("\n=====Title=====")
    print(docs_title[idx])

    print("\n=====Body=====")
    print(docs_body[idx])

    print("\n===Keywords===")
    for term in keywords:
        print(term, keywords[term])



In [None]:
# Extract and print the keywords for one example document (position 120).
idx=120
keywords=get_keywords(idx)
print_results(idx,keywords)


=====Title=====
The islands that want tourists as well as fish

=====Body=====
The islands that want tourists as well as fish
Published
6 August 2020
Share
IMAGE SOURCE,
ADRIENNE MURRAY
Image caption,
The Faroe Islands are home to just 52,000 people
By Adrienne Murray
Business reporter, Faroe Islands
Hundreds of miles from its nearest neighbour, the remote Faroe Islands are surrounded by the Atlantic Ocean. Fishing has always been a way of life, and fish accounts for 90% of all exported goods. But coronavirus is hitting efforts to increase tourism.
The drive to the village of Glyvrar is nothing less than dramatic.
The road from the airport winds past mountains and fjords, and passes through tunnels cut into hillsides and burrowed under the sea.
Glyvrar is home to the Faroe Islands' largest firm, Bakkafrost - which farms salmon.
At its state-of-art plant almost 60,000 tonnes of salmon is processed annually.
IMAGE SOURCE,
GETTY IMAGES
Image caption,
Fish is the country's biggest export


## Generate keywords for a batch of documents

In [None]:
# generate tf-idf for every document in docs_test in a single transform
tf_idf_vector=tfidf_transformer.transform(cv.transform(docs_test))

results=[]
for i in range(tf_idf_vector.shape[0]):

    # get vector for a single document
    curr_vector=tf_idf_vector[i]

    # sort the tf-idf vector by descending order of scores
    sorted_items=sort_coo(curr_vector.tocoo())

    # extract only the top n; n here is 10
    keywords=extract_topn_from_vector(feature_names,sorted_items,10)

    results.append(keywords)

# Pair each keyword dict with the document it was computed from: the vectors
# come from docs_test, so the 'doc' column must be docs_test as well (not the
# training list `docs`).
df=pd.DataFrame(zip(docs_test,results),columns=['doc','keywords'])
df.to_csv('output.csv')