# Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from tqdm.auto import tqdm

In [3]:
from wordcloud import WordCloud
from PIL import Image

In [4]:
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from IPython.display import clear_output

# Data

In [5]:
data = pd.read_csv('E:/Scrapped-Data/InshortsScraped.csv')
print(data.shape)

(13573, 8)


In [6]:
data.head()

Unnamed: 0,news,national,sports,world,politics,technology,entertainment,hatke
0,7 IITs among India's top higher education inst...,1,0,0,0,0,0,0
1,They forget she's 14: Devgn on trolls criticis...,0,0,0,0,0,1,0
2,Haven't earned so much that I can wear new clo...,0,0,0,0,0,1,0
3,Fans in US sing Desi Girl for Priyanka outside...,0,0,0,0,0,1,0
4,"Dhoni bhai was very angry, he hugged me after ...",0,1,0,0,0,0,0


In [7]:
data.describe()

Unnamed: 0,national,sports,world,politics,technology,entertainment,hatke
count,13573.0,13573.0,13573.0,13573.0,13573.0,13573.0,13573.0
mean,0.288588,0.197672,0.119207,0.199882,0.171296,0.120091,0.004936
std,0.453122,0.398258,0.324044,0.399926,0.376781,0.32508,0.070088
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Data Preprocessing

## Cleaning

In [8]:
import nltk, re
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import langdetect

def spell_correct(text):
    text = re.sub(r"can\'t", "can not", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\0s", "0", text)    
    return text

def remove_url(text):
    URL_REGEX = re.compile(r'''((http[s]?://)[^ <>'"{}|\\^`[\]]*)''')
    return URL_REGEX.sub(r' ', text)

remove_punc = lambda x : re.sub(r"\W", ' ', x)

remove_num = lambda x : re.sub(r"\d", ' ', x)

remove_extra_spaces = lambda x : re.sub(r"\s+", ' ', x)

lower_case = lambda x : x.lower()

with open('stopwords.txt') as f:
    sw = map(lambda x : x.strip(), f.readlines())
stop_words = set(nltk.corpus.stopwords.words('english'))|set(sw)
remove_stopwords = lambda x: ' '.join(word for word in x.split() if word not in stop_words)

ps = PorterStemmer()
ps_stem = lambda x: ' '.join(ps.stem(word) for word in x.split())

wnl = WordNetLemmatizer()
wnl_lemmatize = lambda x: ' '.join(wnl.lemmatize(word) for word in x.split())

def tag_pos(x):
    tag_list =  nltk.pos_tag(nltk.word_tokenize(x))
    pos = ""
    for t in tag_list:
        pos += t[0] +'(' + t[1] +')' + ' '
    return pos

def cleanText(x, rsw, stm, lem, tgps):
    x = str(x)
    x = remove_url(x)
    x = lower_case(x)
    x = spell_correct(x)
    x = remove_punc(x)
    x = remove_num(x)
    x = remove_extra_spaces(x)
    
    if rsw:
        x = remove_stopwords(x)
    if stm:
        x = ps_stem(x)
    if lem:
        x = wnl_lemmatize(x)
    if tgps:
        x = tag_pos(x)
    return x

In [9]:
data['news_c'] = data['news'].apply(lambda x : cleanText(x, True, False, False, False))

## Feature Generation

In [17]:
labels = ['national', 'sports', 'world', 'politics', 'technology', 'entertainment', 'hatke']
pred_labels = ['national_', 'sports_', 'world_', 'politics_', 'technology_', 'entertainment_', 'hatke_']

corpus = []
for c in labels:
    corpus.append(' '.join(data[data[c] == 1]['news_c']))
    
corpus = pd.Series(corpus)

In [18]:
# Functions
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.sparse import coo_matrix

#Function for sorting tf_idf in descending order
def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
 
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """get the feature names and tf-idf score of top n items"""
    
    #use only topn items from vector
    sorted_items = sorted_items[:topn]
 
    score_vals = []
    feature_vals = []
    
    # word index and corresponding tf-idf score
    for idx, score in sorted_items:
        
        #keep track of feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])
 
    #create a tuples of feature,score
    #results = zip(feature_vals,score_vals)
    results= {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]]=score_vals[idx]
    
    return results

def extractKeyWords(corpus, K, i):
    # creating a vector of words
    cv=CountVectorizer(ngram_range=(1,2))
    X=cv.fit_transform(corpus)
    
    tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
    tfidf_transformer.fit(X)

    # get feature names
    feature_names=cv.get_feature_names()

    # fetch document for which keywords needs to be extracted
    doc=corpus[i]

    #generate tf-idf for the given document
    tf_idf_vector=tfidf_transformer.transform(cv.transform([doc]))
    
    #sort the tf-idf vectors by descending order of scores

    sorted_items=sort_coo(tf_idf_vector.tocoo())

    keywords=extract_topn_from_vector(feature_names,sorted_items,K)

    # now print the results
    return(list(keywords.keys()))

In [19]:
def topn(x,n):
     return pd.Series(' '.join(data[data[x] == 1]['news_c']).split()).value_counts()[:n].index

In [20]:
feats = []
for i in range(7):
    feats.append(extractKeyWords(corpus, 100, i))
for f in feats:
    print(f, len(f))

['pm', 'bjp', 'modi', 'cm', 'pm modi', 'india', 'delhi', 'govt', 'man', 'old', 'rahul', 'congress', 'killed', 'police', 'sc', 'yr', 'cong', 'mp', 'woman', 'ec', 'yr old', 'mla', 'mumbai', 'gandhi', 'bihar', 'leader', 'mamata', 'case', 'arrested', 'chief', 'sabha', 'held', 'odisha', 'minister', 'lakh', 'pak', 'dead', 'ex', 'death', 'polls', 'people', 'andhra', 'cji', 'indian', 'kerala', 'kejriwal', 'lok sabha', 'lok', 'wb', 'vote', 'girl', 'poll', 'pradesh', 'shah', 'priyanka', 'son', 'rape', 'mayawati', 'taka', 'tmc', 'year', 'wife', 'video', 'report', 'pragya', 'fani', 'karnataka', 'yogi', 'sadhvi', 'injured', 'dies', 'cr', 'army', 'seats', 'chhattisgarh', 'day', 'bjp mla', 'gujarat', 'remark', 'mlas', 'cyclone', 'party', 'family', 'maharashtra', 'encounter', 'new', 'aap', 'class', 'singh', 'punjab', 'unnao', 'women', 'murder', 'gets', 'rajasthan', 'students', 'hc', 'sadhvi pragya', 'cbi', 'polling'] 100
['cup', 'india', 'world cup', 'world', 'wc', 'kohli', 'dhoni', 'match', 'ipl', 'w

In [14]:
# feats = [item for sublist in feats for item in sublist]

In [21]:
len(feats)

7

In [22]:
# from nltk.corpus import wordnet

# def synonyms(x):
#     synonyms_list = [x]
#     for syn in wordnet.synsets(x):
#         for s in syn.lemmas():
#             synonyms_list.append(s.name())
#     return set(synonyms_list)

def bincount(x, f):
    for _ in x.split():
        if _ in f:
            return 1
    return 0

In [23]:
i = 0
for f in feats:
    print(f'{i+1} / {len(feats)}')
    data[pred_labels[i]] = data['news_c'].apply(lambda x : bincount(x, f))
    i += 1
    clear_output(wait=True)

6 / 7


In [24]:
data

Unnamed: 0,news,national,sports,world,politics,technology,entertainment,hatke,news_c,national_,sports_,world_,politics_,technology_,entertainment_,hatke_
0,7 IITs among India's top higher education inst...,1,0,0,0,0,0,0,iits india higher education institutes iit mad...,1,1,1,1,1,1,0
1,They forget she's 14: Devgn on trolls criticis...,0,0,0,0,0,1,0,forget devgn trolls criticising daughter dress,0,0,0,0,0,1,0
2,Haven't earned so much that I can wear new clo...,0,0,0,0,0,1,0,earned wear new clothes day janhvi,1,1,1,0,1,0,0
3,Fans in US sing Desi Girl for Priyanka outside...,0,0,0,0,0,1,0,fans sing desi girl priyanka outside jonas bro...,1,1,0,1,0,1,0
4,"Dhoni bhai was very angry, he hugged me after ...",0,1,0,0,0,0,0,dhoni bhai angry hugged match chahar,0,1,0,0,0,0,0
5,Akram criticises Pak players for eating biryan...,0,1,0,0,0,0,0,akram criticises pak players eating biryani ah...,1,1,1,0,1,0,1
6,"Indian Navy officer risks life, saves man from...",1,0,0,0,0,0,0,indian navy officer risks life saves man drown...,1,1,1,0,1,1,1
7,"Prisoners sue US jail over mice, maggots falli...",0,0,1,0,0,0,0,prisoners sue jail mice maggots falling dining...,0,0,0,0,0,0,0
8,Female Indian doctor missing in UK since April...,0,0,1,0,0,0,0,female indian doctor missing uk april probe un...,1,1,1,0,1,0,1
9,"Scandal-hit Korean Air Chairman dies at 70, we...",0,0,0,0,0,0,0,scandal hit korean air chairman dies weeks boa...,1,0,1,0,0,0,0


In [28]:
from sklearn.metrics import confusion_matrix, classification_report

In [29]:
for i in range(7):
    print(labels[i])
    print(confusion_matrix(data[pred_labels[i]], data[labels[i]]))
    print(classification_report(data[pred_labels[i]], data[labels[i]]))

national
[[4945  585]
 [4711 3332]]
              precision    recall  f1-score   support

           0       0.51      0.89      0.65      5530
           1       0.85      0.41      0.56      8043

   micro avg       0.61      0.61      0.61     13573
   macro avg       0.68      0.65      0.60     13573
weighted avg       0.71      0.61      0.60     13573

sports
[[7974  400]
 [2916 2283]]
              precision    recall  f1-score   support

           0       0.73      0.95      0.83      8374
           1       0.85      0.44      0.58      5199

   micro avg       0.76      0.76      0.76     13573
   macro avg       0.79      0.70      0.70     13573
weighted avg       0.78      0.76      0.73     13573

world
[[6476  181]
 [5479 1437]]
              precision    recall  f1-score   support

           0       0.54      0.97      0.70      6657
           1       0.89      0.21      0.34      6916

   micro avg       0.58      0.58      0.58     13573
   macro avg       0.71  

In [34]:
def tag(tweet):
    tweet = cleanText(tweet, True, False, False, False)
    tags = []
    for i in range(7):
        if bincount(tweet, feats[i]):
            tags.append(labels[i])
    return tags

In [35]:
tweets = pd.read_csv('E:/TOCT/TOCT.csv')

In [38]:
tqdm.pandas()
tweets['Tags'] = tweets['News'].progress_apply(lambda x: tag(x))

HBox(children=(IntProgress(value=0, max=31934), HTML(value='')))

In [39]:
tweets['Tags'].value_counts()

[]                                                                       9832
[sports]                                                                 3124
[entertainment]                                                          1665
[national, sports, world, politics, technology, entertainment]           1427
[national, sports, world, technology]                                    1055
[sports, world]                                                           859
[politics, entertainment]                                                 788
[national, politics]                                                      732
[national]                                                                722
[national, world, politics, entertainment, hatke]                         692
[sports, entertainment]                                                   650
[sports, world, technology]                                               647
[national, world, politics, entertainment]                      