# Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from tqdm.auto import tqdm
tqdm.pandas()
from IPython.display import clear_output

  from pandas import Panel


In [3]:
pd.options.display.max_colwidth = -1

In [4]:
import json

# Data

In [5]:
data = pd.read_csv('E:/Scrapped-Data/InshortsScraped.csv')
print(data.shape)

(41214, 8)


In [6]:
data.head()

Unnamed: 0,news,national,sports,world,politics,technology,entertainment,hatke
0,"7 IITs among India's top higher education institutes, IIT Madras tops",1,0,0,0,0,0,0
1,They forget she's 14: Devgn on trolls criticising daughter's dress,0,0,0,0,0,1,0
2,Haven't earned so much that I can wear new clothes every day: Janhvi,0,0,0,0,0,1,0
3,Fans in US sing Desi Girl for Priyanka outside Jonas Brothers concert,0,0,0,0,0,1,0
4,"Dhoni bhai was very angry, he hugged me after the match: Chahar",0,1,0,0,0,0,0


In [7]:
data.describe()

Unnamed: 0,national,sports,world,politics,technology,entertainment,hatke
count,41214.0,41214.0,41214.0,41214.0,41214.0,41214.0,41214.0
mean,0.233998,0.162663,0.156791,0.124715,0.225312,0.154753,0.007546
std,0.423376,0.369063,0.363608,0.330399,0.417793,0.361674,0.08654
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Data Preprocessing

## Cleaning

In [8]:
import nltk, re
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

def spell_correct(text):
    text = re.sub(r"can\'t", "can not", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\0s", "0", text)    
    return text

def remove_url(text):
    URL_REGEX = re.compile(r'''((http[s]?://)[^ <>'"{}|\\^`[\]]*)''')
    return URL_REGEX.sub(r' ', text)

remove_punc = lambda x : re.sub(r"\W", ' ', x)

remove_num = lambda x : re.sub(r"\d", ' ', x)

remove_extra_spaces = lambda x : re.sub(r"\s+", ' ', x)

lower_case = lambda x : x.lower()

with open('stopwords.txt') as f:
    sw = map(lambda x : x.strip(), f.readlines())
stop_words = set(nltk.corpus.stopwords.words('english'))|set(sw)
remove_stopwords = lambda x: ' '.join(word for word in x.split() if word not in stop_words)

ps = PorterStemmer()
ps_stem = lambda x: ' '.join(ps.stem(word) for word in x.split())

wnl = WordNetLemmatizer()
wnl_lemmatize = lambda x: ' '.join(wnl.lemmatize(word) for word in x.split())

def tag_pos(x):
    tag_list =  nltk.pos_tag(nltk.word_tokenize(x))
    pos = ""
    for t in tag_list:
        pos += t[0] +'(' + t[1] +')' + ' '
    return pos

def cleanText(x, rsw, stm, lem, tgps):
    x = str(x)
    x = remove_url(x)
    x = lower_case(x)
    x = spell_correct(x)
    x = remove_punc(x)
    x = remove_num(x)
    x = remove_extra_spaces(x)
    
    if rsw:
        x = remove_stopwords(x)
    if stm:
        x = ps_stem(x)
    if lem:
        x = wnl_lemmatize(x)
    if tgps:
        x = tag_pos(x)
    return x

In [9]:
data['news_c'] = data['news'].progress_apply(lambda x : cleanText(x, True, False, False, False))

HBox(children=(IntProgress(value=0, max=41214), HTML(value='')))




In [10]:
from collections import Counter

In [11]:
word_freqs = Counter((' '.join(data['news_c'].values)).split())

In [12]:
print(' 1st quantile', np.quantile(list(word_freqs.values()), 0.0010))
print('99th quantile', np.quantile(list(word_freqs.values()), 0.9990))

 1st quantile 1.0
99th quantile 702.6710000000166


In [13]:
len(word_freqs)

25334

## Association Rules

In [14]:
sorted(word_freqs.items(), key=lambda x: x[1])

[('suppressing', 1),
 ('rollins', 1),
 ('muguruza', 1),
 ('monterrey', 1),
 ('supersession', 1),
 ('hanush', 1),
 ('cripple', 1),
 ('windscreen', 1),
 ('apprentice', 1),
 ('deulofeu', 1),
 ('curling', 1),
 ('whipping', 1),
 ('hysteria', 1),
 ('peacekeeping', 1),
 ('daycare', 1),
 ('slovakia', 1),
 ('anthems', 1),
 ('kirstjen', 1),
 ('homeland', 1),
 ('eager', 1),
 ('statute', 1),
 ('hysterical', 1),
 ('slabs', 1),
 ('afely', 1),
 ('healthvault', 1),
 ('tories', 1),
 ('cashify', 1),
 ('undervaluing', 1),
 ('television', 1),
 ('aashka', 1),
 ('horlicks', 1),
 ('reconnects', 1),
 ('nuances', 1),
 ('deconstruct', 1),
 ('royale', 1),
 ('parfum', 1),
 ('septuplets', 1),
 ('serber', 1),
 ('malkiel', 1),
 ('trashtag', 1),
 ('loneliest', 1),
 ('mermaid', 1),
 ('chuddies', 1),
 ('overdue', 1),
 ('yogurt', 1),
 ('sauna', 1),
 ('aerodynamics', 1),
 ('peed', 1),
 ('rattlesnakes', 1),
 ('handbag', 1),
 ('windowless', 1),
 ('mimicking', 1),
 ('smuggler', 1),
 ('sheraton', 1),
 ('chlorophyll', 1),
 ('

In [15]:
labels = ['national', 'sports', 'world', 'politics', 'technology', 'entertainment', 'hatke']

corpus = []
for c in labels:
    corpus.append(' '.join(data[data[c] == 1]['news_c']))
    
corpus = pd.Series(corpus)

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import coo_matrix

def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
 
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    threshold = np.quantile(list(map(lambda x: x[1], sorted_items)), (100-topn)/100)

    score_vals = []
    feature_vals = []

    for idx, score in sorted_items:
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])
        
    results= {}
    
    
    for idx in range(len(feature_vals)):
        if score_vals[idx] < threshold:
            break
        results[feature_vals[idx]]=score_vals[idx]
    
    return results

def extractKeyWords(corpus, K, i):
    vectorizer = TfidfVectorizer(ngram_range=(1,2))
    vectorizer.fit(corpus)
    feature_names = vectorizer.get_feature_names()
 
    doc=corpus[i]
    tf_idf_vector = vectorizer.transform([doc])

    sorted_items=sort_coo(tf_idf_vector.tocoo())
    keywords=extract_topn_from_vector(feature_names,sorted_items,K)
    
    return(list(keywords.keys()))

In [17]:
feats = {}
for i, l in tqdm(enumerate(labels)):
    feats[l] = (extractKeyWords(corpus, 1, i))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [18]:
for k, v in feats.items():
    print(k, len(v))
    print(sorted(v))
    print('\n\n')

national 693
['aap', 'aayog', 'accident', 'accused', 'act', 'action', 'active cases', 'aes', 'ahead', 'aiims', 'air', 'air india', 'aircraft', 'airport', 'ajit', 'akhilesh', 'alert', 'allegedly', 'alliance', 'allowed', 'allows', 'ambulance', 'amendment', 'amethi', 'amid', 'amid lockdown', 'amit', 'amit shah', 'andhra', 'andhra cm', 'andhra pradesh', 'announces', 'anti', 'anti caa', 'april', 'army', 'army chief', 'arrest', 'arrested', 'article', 'arunachal', 'arvind kejriwal', 'asked', 'asks', 'assam', 'assembly', 'attack', 'attacked', 'away', 'ayodhya', 'azam', 'azam khan', 'baby', 'bagh', 'bail', 'ban', 'banerjee', 'bank', 'bans', 'beaten', 'bengal', 'bengaluru', 'biggest', 'bihar', 'birth', 'bjp', 'bjp chief', 'bjp leader', 'bjp mla', 'bjp mp', 'bjp workers', 'board', 'body', 'booked', 'booth', 'border', 'boy', 'bsf', 'bsp', 'budget', 'building', 'bus', 'caa', 'cab', 'cabinet', 'calling', 'calls', 'campaigning', 'candidate', 'candidates', 'car', 'carrying', 'case', 'cases', 'cases cr

['abd', 'afghanistan', 'africa', 'afridi', 'ahead', 'ajinkya', 'ajinkya rahane', 'akhtar', 'akram', 'ali', 'amid', 'amir', 'anderson', 'anushka', 'archer', 'arsenal', 'ashes', 'ashwin', 'asian', 'asks', 'atp', 'aus', 'australia', 'award', 'away', 'azharuddin', 'babar', 'babar azam', 'bairstow', 'ball', 'balls', 'ban', 'bangladesh', 'banned', 'barca', 'barcelona', 'bat', 'batsman', 'batsmen', 'batting', 'bcci', 'beat', 'beats', 'ben stokes', 'best', 'better', 'bhogle', 'bhuvneshwar', 'big', 'boult', 'bowl', 'bowler', 'bowlers', 'bowling', 'boxer', 'breaks', 'brian lara', 'bumrah', 'buttler', 'calls', 'captain', 'captaincy', 'career', 'catch', 'chahal', 'chahar', 'champ', 'championship', 'championships', 'chappell', 'chelsea', 'chhetri', 'chief', 'city', 'cl', 'club', 'coach', 'copa', 'coronavirus', 'covid', 'cricket', 'cricketer', 'cricketers', 'criticised', 'criticises', 'crore', 'csk', 'cup', 'cup match', 'cwg', 'daughter', 'david', 'david warner', 'day', 'day night', 'days', 'debut',

['afghan', 'afghanistan', 'africa', 'aid', 'air', 'airspace', 'amazon', 'american', 'americans', 'amid', 'amid coronavirus', 'amid covid', 'anti', 'apologises', 'apple', 'approves', 'arabia', 'army', 'arrested', 'asks', 'assange', 'attack', 'attacks', 'aus', 'australia', 'australian', 'away', 'baby', 'baghdadi', 'ban', 'bangladesh', 'bans', 'base', 'biden', 'biggest', 'billion', 'black', 'black man', 'blast', 'blasts', 'bn', 'bomb', 'border', 'boris', 'boris johnson', 'boy', 'brazil', 'brexit', 'britain', 'bushfires', 'calls', 'canada', 'car', 'carrying', 'case', 'cases', 'cases cross', 'cases exceed', 'ceasefire', 'ceo', 'charged', 'chief', 'children', 'china', 'chinese', 'citizens', 'city', 'claim', 'claims', 'climate', 'commander', 'coronavirus', 'coronavirus cases', 'coronavirus death', 'coronavirus deaths', 'coronavirus outbreak', 'coronavirus pandemic', 'coronavirus trump', 'countries', 'country', 'court', 'covid', 'covid cases', 'covid deaths', 'covid pandemic', 'crash', 'crashe

['aaditya', 'aap', 'aap mla', 'abdullah', 'act', 'adhir', 'ajit', 'ajit pawar', 'akhilesh', 'alliance', 'amethi', 'amid', 'amit', 'amit shah', 'andhra', 'andhra cm', 'anti', 'arrest', 'article', 'arvind kejriwal', 'asks', 'assam', 'assembly', 'assembly polls', 'attack', 'azam', 'azam khan', 'bagh', 'bail', 'ban', 'banerjee', 'bengal', 'bihar', 'bjp', 'bjp chief', 'bjp govt', 'bjp leader', 'bjp mla', 'bjp mp', 'bjp worker', 'bjp workers', 'booked', 'booth', 'bsp', 'bypolls', 'caa', 'cabinet', 'calls', 'campaign', 'campaigning', 'candidate', 'candidates', 'case', 'cbi', 'centre', 'chandrababu', 'chhattisgarh', 'chidambaram', 'chief', 'chowkidar', 'citizenship', 'claim', 'cm', 'cong', 'cong chief', 'cong leader', 'cong mla', 'congress', 'congress leader', 'contest', 'country', 'court', 'covid', 'cpi', 'cr', 'crore', 'day', 'days', 'decision', 'defeat', 'delhi', 'delhi bjp', 'delhi cm', 'delhi polls', 'demands', 'democracy', 'deol', 'detained', 'digvijaya', 'dikshit', 'ec', 'ec notice', 'e

['aarogya', 'aarogya setu', 'access', 'account', 'accounts', 'accused', 'acquire', 'acquires', 'acquisition', 'add', 'adds', 'admits', 'ads', 'ago', 'agritech', 'agritech startup', 'ai', 'ai startup', 'aims', 'air', 'airbnb', 'airtel', 'alexa', 'alibaba', 'alleged', 'alphabet', 'amazon', 'amazon flipkart', 'amid', 'amid coronavirus', 'amid covid', 'analytics', 'ancient', 'android', 'angel', 'announces', 'anti', 'apologises', 'app', 'app store', 'apple', 'apple ceo', 'apple google', 'appoints', 'apps', 'ar', 'artificial', 'asks', 'asteroid', 'astronaut', 'astronauts', 'astronomers', 'australia', 'auto', 'autopilot', 'away', 'backed', 'bacteria', 'ban', 'bank', 'bans', 'bansal', 'based', 'battery', 'bengaluru', 'bezos', 'big', 'bigbasket', 'billion', 'billionaire', 'binny', 'binny bansal', 'bitcoin', 'black', 'black hole', 'bn', 'board', 'body', 'brain', 'brand', 'breach', 'bug', 'build', 'business', 'buy', 'buys', 'byju', 'bytedance', 'cait', 'california', 'calls', 'camera', 'cancer', '

['aamir', 'aand', 'aankh', 'accused', 'acid', 'acid attack', 'acting', 'action', 'actor', 'actors', 'actress', 'actresses', 'aditya', 'adnan', 'age', 'aged', 'ajay', 'akshay', 'akshay kumar', 'ali', 'ali khan', 'alia', 'amid', 'amid coronavirus', 'amid covid', 'amid lockdown', 'amitabh', 'amitabh bachchan', 'ananya', 'angelina', 'angrezi', 'anil', 'anniversary', 'anu', 'anupam', 'anurag', 'anushka', 'apologises', 'arjun', 'asked', 'asks', 'attack', 'attend', 'audience', 'avengers', 'award', 'awards', 'away', 'ay', 'ayushmann', 'baby', 'bachchan', 'bad', 'bala', 'batman', 'best', 'bharat', 'bhumi', 'big', 'bigg', 'bigg boss', 'biopic', 'bjp', 'body', 'bollywood', 'bond', 'boney', 'boss', 'box office', 'boy', 'boyfriend', 'brad', 'brother', 'caa', 'called', 'calls', 'cancer', 'cannes', 'cannes film', 'car', 'career', 'carpet', 'case', 'cast', 'casting', 'celebs', 'change', 'character', 'chhapaak', 'child', 'chopra', 'chris', 'cinema', 'claims', 'come', 'comedian', 'comedy', 'complaint', 

In [19]:
from collections import Counter

In [20]:
kws = []
for v in feats.values():
    kws.extend(v)

In [21]:
redundant_kws = []

for k,v in Counter(kws).items():
    if v > 4:
        redundant_kws.append(k)

print(len(redundant_kws), redundant_kws)        

63 ['pm', 'delhi', 'india', 'govt', 'coronavirus', 'man', 'police', 'covid', 'old', 'woman', 'yr', 'pak', 'case', 'yr old', 'people', 'new', 'day', 'chief', 'ex', 'indian', 'lakh', 'lockdown', 'amid', 'year', 'video', 'dies', 'says', 'reports', 'home', 'days', 'report', 'st', 'gets', 'son', 'family', 'time', 'year old', 'wife', 'crore', 'women', 'test', 'asks', 'wo', 'shares', 'yrs', 'th', 'media', 'like', 'world', 'years', 'calls', 'away', 'post', 'ban', 'viral', 'khan', 'months', 'work', 'tests', 'make', 'pic', 'used', 'pics']


In [22]:
for rkw in redundant_kws:
    for k in feats.keys():
        try:
            feats[k].remove(rkw)
        except ValueError:
            pass

In [23]:
for k, v in feats.items():
    print(k, len(v))

national 630
sports 414
world 379
politics 316
technology 620
entertainment 412
hatke 18


In [24]:
DIR = "E:/Models/News-Taging-Association-Rules/"
json.dump(feats, open(DIR+'association-rules.json', 'w'))