# Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from tqdm.auto import tqdm
# Hook tqdm into pandas so that `progress_apply` (used later when cleaning the
# headlines) is available and renders a progress bar.
tqdm.pandas()
from IPython.display import clear_output

  from pandas import Panel


In [3]:
# Show full cell contents instead of truncating long headlines.
# `None` is the supported "no limit" value since pandas 1.0; the old `-1`
# sentinel has been deprecated since then and is rejected by later releases.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    # pandas < 1.0 only accepts integers here and uses -1 to mean "no limit".
    pd.set_option('display.max_colwidth', -1)

In [4]:
import json

# Data

In [5]:
# Read the scraped headline CSV into a DataFrame and report its (rows, columns).
CSV_PATH = 'E:/Scrapped-Data/InshortsScraped.csv'
data = pd.read_csv(CSV_PATH)
print(data.shape)

(34539, 8)


In [6]:
# Preview the first rows: a `news` headline column followed by one column per label.
data.head()

Unnamed: 0,news,national,sports,world,politics,technology,entertainment,hatke
0,"7 IITs among India's top higher education institutes, IIT Madras tops",1,0,0,0,0,0,0
1,They forget she's 14: Devgn on trolls criticising daughter's dress,0,0,0,0,0,1,0
2,Haven't earned so much that I can wear new clothes every day: Janhvi,0,0,0,0,0,1,0
3,Fans in US sing Desi Girl for Priyanka outside Jonas Brothers concert,0,0,0,0,0,1,0
4,"Dhoni bhai was very angry, he hugged me after the match: Chahar",0,1,0,0,0,0,0


In [7]:
# Summary statistics of the numeric (label) columns.  The labels appear to be
# 0/1 indicators, in which case each column's mean is the share of rows tagged with it.
data.describe()

Unnamed: 0,national,sports,world,politics,technology,entertainment,hatke
count,34539.0,34539.0,34539.0,34539.0,34539.0,34539.0,34539.0
mean,0.235068,0.170763,0.145227,0.141434,0.22311,0.146472,0.007933
std,0.424047,0.376308,0.352335,0.348474,0.416338,0.353584,0.088715
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Data Preprocessing

## Cleaning

In [8]:
import nltk, re
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Ordered (pattern, replacement) rules.  Order matters: the specific forms
# ("can't", "won't", "what's") must run before the generic "n't" / "'s" rules.
_CONTRACTION_RULES = (
    (r"can't", "can not"),
    (r"won't", "will not"),   # otherwise the generic n't rule leaves the junk token "wo"
    (r"what's", "what is "),
    (r"'s", " "),
    (r"'ve", " have "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"'re", " are "),
    (r"'d", " would "),
    (r"'ll", " will "),
    # Decade / plural-number forms such as "1990s" -> "1990".  The previous
    # pattern r"\0s" matched a NUL character followed by "s" and so never fired;
    # the trailing \b keeps strings like "20seconds" intact.
    (r"0s\b", "0"),
)


def spell_correct(text):
    """Expand common English contractions in ``text``.

    Despite the name, no spell checking is done: the function applies a fixed,
    ordered list of regex substitutions ("can't" -> "can not", "'ve" ->
    " have ", ...).  The patterns are lower-case, so the input is expected to
    be lower-cased already.  Replacements may leave doubled or trailing spaces.

    Parameters
    ----------
    text : str
        Lower-cased text to normalise.

    Returns
    -------
    str
        ``text`` with the contractions expanded.
    """
    for pattern, replacement in _CONTRACTION_RULES:
        text = re.sub(pattern, replacement, text)
    return text

# Compiled once instead of on every call.  A URL is "http://" or "https://"
# (any letter case) followed by characters up to the first whitespace or one
# of the delimiters < > ' " { } | \ ^ ` [ ].
_URL_REGEX = re.compile(r'''https?://[^\s<>'"{}|\\^`\[\]]*''', re.IGNORECASE)


def remove_url(text):
    """Replace every http(s) URL in ``text`` with a single space.

    The match stops at any whitespace character (not only a literal space), so
    a URL followed by a tab or newline no longer swallows the text after it.
    The scheme is matched case-insensitively, so upper-case URLs are also
    removed when the text has not been lower-cased yet.

    Parameters
    ----------
    text : str
        Raw text that may contain URLs.

    Returns
    -------
    str
        ``text`` with each URL replaced by ``' '``.
    """
    return _URL_REGEX.sub(' ', text)

def remove_punc(x):
    """Return ``x`` with every non-word character replaced by a space."""
    return re.sub(r"\W", ' ', x)


def remove_num(x):
    """Return ``x`` with every digit replaced by a space."""
    return re.sub(r"\d", ' ', x)


def remove_extra_spaces(x):
    """Return ``x`` with each run of whitespace collapsed to one space."""
    return re.sub(r"\s+", ' ', x)


def lower_case(x):
    """Return ``x`` converted to lower case."""
    return x.lower()

# Stop words = NLTK's English list plus the extra entries listed one per line
# in stopwords.txt (surrounding whitespace stripped).
with open('stopwords.txt') as f:
    sw = {line.strip() for line in f.readlines()}
stop_words = set(nltk.corpus.stopwords.words('english')) | sw


def remove_stopwords(x):
    """Drop every whitespace-separated token of ``x`` that is in ``stop_words``.

    The surviving tokens are re-joined with single spaces.
    """
    kept = [word for word in x.split() if word not in stop_words]
    return ' '.join(kept)

ps = PorterStemmer()


def ps_stem(x):
    """Porter-stem each whitespace-separated token of ``x``; re-join with single spaces."""
    return ' '.join(map(ps.stem, x.split()))


wnl = WordNetLemmatizer()


def wnl_lemmatize(x):
    """WordNet-lemmatize each whitespace-separated token of ``x``; re-join with single spaces."""
    return ' '.join(map(wnl.lemmatize, x.split()))

def tag_pos(x):
    """Annotate every token of ``x`` with its part-of-speech tag.

    The text is tokenized with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag``.  Each token is rendered as ``token(TAG)`` followed by a
    space, so a non-empty result ends with a trailing space.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(x))
    return ''.join('{}({}) '.format(pair[0], pair[1]) for pair in tagged_tokens)

def cleanText(x, rsw, stm, lem, tgps):
    """Run the text-cleaning pipeline on ``x`` and return the cleaned string.

    Always applied, in this order: ``str()`` conversion, URL removal,
    lower-casing, contraction expansion, punctuation removal, digit removal
    and whitespace collapsing.  The optional stages then run in the order
    listed below.

    Parameters
    ----------
    x : object
        Value to clean; it is converted with ``str(x)`` first.
    rsw : bool
        Remove stop words.
    stm : bool
        Apply Porter stemming.
    lem : bool
        Apply WordNet lemmatization.
    tgps : bool
        Append part-of-speech tags to each token.
    """
    text = str(x)

    # Mandatory stages.
    mandatory_steps = (
        remove_url,
        lower_case,
        spell_correct,
        remove_punc,
        remove_num,
        remove_extra_spaces,
    )
    for step in mandatory_steps:
        text = step(text)

    # Optional stages, each switched on by its flag.
    optional_steps = (
        (rsw, remove_stopwords),
        (stm, ps_stem),
        (lem, wnl_lemmatize),
        (tgps, tag_pos),
    )
    for enabled, step in optional_steps:
        if enabled:
            text = step(text)

    return text

In [9]:
# Clean every headline with stop-word removal only (no stemming, lemmatization or POS tags).
data['news_c'] = data['news'].progress_apply(
    lambda headline: cleanText(headline, rsw=True, stm=False, lem=False, tgps=False)
)

HBox(children=(IntProgress(value=0, max=34539), HTML(value='')))




## Association Rules

In [10]:
labels = ['national', 'sports', 'world', 'politics', 'technology', 'entertainment', 'hatke']

# One "document" per label: all cleaned headlines tagged with that label,
# joined by spaces, in the same order as `labels`.
corpus = pd.Series(
    [' '.join(data.loc[data[label] == 1, 'news_c']) for label in labels]
)

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import coo_matrix

def sort_coo(coo_matrix):
    """Return the ``(column, value)`` pairs of a COO matrix, highest value first.

    Pairs are built from the ``col`` and ``data`` attributes of ``coo_matrix``.
    Ties on the value are broken by the larger column index.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs
 
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Map the first ``topn`` features of ``sorted_items`` to their rounded scores.

    Parameters
    ----------
    feature_names : sequence
        Feature names indexable by the integer indices found in ``sorted_items``.
    sorted_items : sequence of (index, score)
        Pairs already ordered by the caller; only the first ``topn`` are used.
    topn : int, default 10
        Number of leading pairs to keep.

    Returns
    -------
    dict
        ``{feature_name: round(score, 3)}`` in the order of ``sorted_items``.
    """
    return {
        feature_names[idx]: round(score, 3)
        for idx, score in sorted_items[:topn]
    }

def extractKeyWords(corpus, K, i):
    """Return the ``K`` highest-scoring TF-IDF terms of document ``i``.

    A unigram + bigram ``TfidfVectorizer`` is fitted on the whole ``corpus``.
    Document ``corpus[i]`` is then transformed and its terms are ranked by
    TF-IDF score using ``sort_coo`` and ``extract_topn_from_vector``.

    Parameters
    ----------
    corpus : indexable collection of str
        All documents; also used to fit the vectorizer.
    K : int
        Maximum number of keywords to return.
    i : int
        Key of the document to extract keywords for (looked up as ``corpus[i]``).

    Returns
    -------
    list of str
        Up to ``K`` terms, best first.
    """
    vectorizer = TfidfVectorizer(ngram_range=(1, 2))
    vectorizer.fit(corpus)

    # get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out(); fall back to the old name on older releases.
    # tolist() yields plain Python strings, so the keywords stay JSON-friendly.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out().tolist()
    else:
        feature_names = vectorizer.get_feature_names()

    tf_idf_vector = vectorizer.transform([corpus[i]])

    sorted_items = sort_coo(tf_idf_vector.tocoo())
    keywords = extract_topn_from_vector(feature_names, sorted_items, K)

    return list(keywords)

In [12]:
feats = {}
# Wrap the list itself rather than the enumerate() generator: a generator has
# no length, so tqdm could not show a total or a percentage.
for i, l in enumerate(tqdm(labels)):
    # Top 150 TF-IDF terms of the i-th label document.
    feats[l] = extractKeyWords(corpus, 150, i)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [13]:
# Show each label's keywords in alphabetical order, followed by how many there are.
for label in feats:
    keywords = feats[label]
    print(label, sorted(keywords), len(keywords))

national ['aap', 'accused', 'act', 'air', 'allegedly', 'amid', 'amit', 'amit shah', 'andhra', 'anti', 'army', 'arrested', 'article', 'asks', 'assam', 'assembly', 'attack', 'ayodhya', 'bagh', 'bihar', 'bjp', 'bjp mla', 'booked', 'caa', 'car', 'case', 'cases', 'cbi', 'centre', 'chhattisgarh', 'chidambaram', 'chief', 'citizenship', 'cm', 'cong', 'congress', 'coronavirus', 'court', 'cr', 'crore', 'cyclone', 'day', 'days', 'dead', 'death', 'delhi', 'dies', 'ec', 'ex', 'family', 'free', 'gandhi', 'gets', 'girl', 'govt', 'gujarat', 'haryana', 'hc', 'held', 'home', 'iaf', 'india', 'indian', 'injured', 'jnu', 'karnataka', 'kashmir', 'kejriwal', 'kerala', 'killed', 'lakh', 'leader', 'lok', 'lok sabha', 'maha', 'maharashtra', 'mamata', 'man', 'mayawati', 'metro', 'minister', 'mla', 'mlas', 'modi', 'mp', 'mumbai', 'murder', 'new', 'notice', 'nrc', 'odisha', 'old', 'owaisi', 'pak', 'pakistan', 'party', 'pawar', 'people', 'plea', 'pm', 'pm modi', 'police', 'poll', 'polls', 'pradesh', 'priyanka', 'pr

entertainment ['aamir', 'actor', 'actors', 'actress', 'aditya', 'akshay', 'akshay kumar', 'ali', 'alia', 'amid', 'ananya', 'anushka', 'arjun', 'award', 'awards', 'away', 'ayushmann', 'best', 'bhumi', 'big', 'bigg', 'bigg boss', 'biopic', 'bollywood', 'boss', 'cancer', 'cannes', 'case', 'cast', 'chhapaak', 'cinema', 'coronavirus', 'covid', 'cr', 'criticised', 'dad', 'daughter', 'day', 'deepika', 'deol', 'director', 'ex', 'family', 'fan', 'fans', 'father', 'feel', 'film', 'film festival', 'films', 'gets', 'got', 'hrithik', 'husband', 'india', 'indian', 'janhvi', 'johar', 'kabir', 'kabir singh', 'kajol', 'kangana', 'kapoor', 'karan', 'karan johar', 'kareena', 'kartik', 'katrina', 'khan', 'know', 'kriti', 'kumar', 'life', 'like', 'look', 'love', 'make', 'man', 'marriage', 'marvel', 'metoo', 'modi', 'mom', 'mumbai', 'new', 'nick', 'old', 'oscar', 'passes', 'passes away', 'people', 'pic', 'pics', 'play', 'pm', 'police', 'post', 'priyanka', 'producer', 'ranbir', 'rangoli', 'ranveer', 'rape', 

In [14]:
from collections import Counter

In [15]:
# Flatten the per-label keyword lists into one list (duplicates kept) so that
# occurrences across labels can be counted.
kws = [keyword for keywords in feats.values() for keyword in keywords]

In [16]:
# A keyword that appears in the lists of more than 4 labels is treated as redundant.
keyword_counts = Counter(kws)
redundant_kws = [keyword for keyword, count in keyword_counts.items() if count > 4]

for keyword in redundant_kws:
    print(keyword, keyword_counts[keyword])

print(len(redundant_kws))

india 6
man 7
police 5
old 6
yr 5
yr old 5
case 5
ex 6
people 6
coronavirus 5
indian 5
day 6
year 6
video 6
says 7
new 7
gets 5
st 6
time 6
shares 6
20


In [17]:
# Remove the redundant keywords from every label's list, in place.
for label in feats:
    label_keywords = feats[label]
    for rkw in redundant_kws:
        if rkw in label_keywords:
            label_keywords.remove(rkw)

In [18]:
# Number of keywords left per label after pruning.
for label in feats:
    print(label, len(feats[label]))

national 130
sports 134
world 131
politics 139
technology 135
entertainment 130
hatke 135


In [19]:
DIR = "E:/Models/News-Taging-Association-Rules/"
# Write through a context manager so the file is flushed and closed as soon as
# the dump finishes, instead of leaving an open handle to the garbage collector.
with open(DIR+'association-rules.json', 'w') as f:
    json.dump(feats, f)