# Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from tqdm.auto import tqdm
tqdm.pandas()
from IPython.display import clear_output

In [3]:
pd.options.display.max_colwidth = -1

# Data

In [4]:
data = pd.read_csv('E:/Scrapped-Data/InshortsScraped.csv')
print(data.shape)

(13573, 8)


In [5]:
data.head()

Unnamed: 0,news,national,sports,world,politics,technology,entertainment,hatke
0,"7 IITs among India's top higher education institutes, IIT Madras tops",1,0,0,0,0,0,0
1,They forget she's 14: Devgn on trolls criticising daughter's dress,0,0,0,0,0,1,0
2,Haven't earned so much that I can wear new clothes every day: Janhvi,0,0,0,0,0,1,0
3,Fans in US sing Desi Girl for Priyanka outside Jonas Brothers concert,0,0,0,0,0,1,0
4,"Dhoni bhai was very angry, he hugged me after the match: Chahar",0,1,0,0,0,0,0


In [6]:
data.describe()

Unnamed: 0,national,sports,world,politics,technology,entertainment,hatke
count,13573.0,13573.0,13573.0,13573.0,13573.0,13573.0,13573.0
mean,0.288588,0.197672,0.119207,0.199882,0.171296,0.120091,0.004936
std,0.453122,0.398258,0.324044,0.399926,0.376781,0.32508,0.070088
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Data Preprocessing

## Cleaning

In [7]:
import nltk, re
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import langdetect

def spell_correct(text):
    text = re.sub(r"can\'t", "can not", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\0s", "0", text)    
    return text

def remove_url(text):
    URL_REGEX = re.compile(r'''((http[s]?://)[^ <>'"{}|\\^`[\]]*)''')
    return URL_REGEX.sub(r' ', text)

remove_punc = lambda x : re.sub(r"\W", ' ', x)

remove_num = lambda x : re.sub(r"\d", ' ', x)

remove_extra_spaces = lambda x : re.sub(r"\s+", ' ', x)

lower_case = lambda x : x.lower()

with open('stopwords.txt') as f:
    sw = map(lambda x : x.strip(), f.readlines())
stop_words = set(nltk.corpus.stopwords.words('english'))|set(sw)
remove_stopwords = lambda x: ' '.join(word for word in x.split() if word not in stop_words)

ps = PorterStemmer()
ps_stem = lambda x: ' '.join(ps.stem(word) for word in x.split())

wnl = WordNetLemmatizer()
wnl_lemmatize = lambda x: ' '.join(wnl.lemmatize(word) for word in x.split())

def tag_pos(x):
    tag_list =  nltk.pos_tag(nltk.word_tokenize(x))
    pos = ""
    for t in tag_list:
        pos += t[0] +'(' + t[1] +')' + ' '
    return pos

def cleanText(x, rsw, stm, lem, tgps):
    x = str(x)
    x = remove_url(x)
    x = lower_case(x)
    x = spell_correct(x)
    x = remove_punc(x)
    x = remove_num(x)
    x = remove_extra_spaces(x)
    
    if rsw:
        x = remove_stopwords(x)
    if stm:
        x = ps_stem(x)
    if lem:
        x = wnl_lemmatize(x)
    if tgps:
        x = tag_pos(x)
    return x

In [8]:
data['news_c'] = data['news'].apply(lambda x : cleanText(x, True, False, False, False))

## Feature Generation

In [9]:
labels = ['national', 'sports', 'world', 'politics', 'technology', 'entertainment', 'hatke']

corpus = []
for c in labels:
    corpus.append(' '.join(data[data[c] == 1]['news_c']))
    
corpus = pd.Series(corpus)

In [10]:
# Functions
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.sparse import coo_matrix

#Function for sorting tf_idf in descending order
def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
 
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """get the feature names and tf-idf score of top n items"""
    
    #use only topn items from vector
    sorted_items = sorted_items[:topn]
 
    score_vals = []
    feature_vals = []
    
    # word index and corresponding tf-idf score
    for idx, score in sorted_items:
        
        #keep track of feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])
 
    #create a tuples of feature,score
    #results = zip(feature_vals,score_vals)
    results= {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]]=score_vals[idx]
    
    return results

def extractKeyWords(corpus, K, i):
    # creating a vector of words
    cv=CountVectorizer(ngram_range=(1,2))
    X=cv.fit_transform(corpus)
    
    tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
    tfidf_transformer.fit(X)

    # get feature names
    feature_names=cv.get_feature_names()

    # fetch document for which keywords needs to be extracted
    doc=corpus[i]

    #generate tf-idf for the given document
    tf_idf_vector=tfidf_transformer.transform(cv.transform([doc]))
    
    #sort the tf-idf vectors by descending order of scores

    sorted_items=sort_coo(tf_idf_vector.tocoo())

    keywords=extract_topn_from_vector(feature_names,sorted_items,K)

    # now print the results
    return(list(keywords.keys()))

In [11]:
def topn(x,n):
     return pd.Series(' '.join(data[data[x] == 1]['news_c']).split()).value_counts()[:n].index

In [12]:
feats = []
for i in range(7):
    feats.append(extractKeyWords(corpus, 100, i))
for f in feats:
    print(sorted(f), len(f))

['aap', 'andhra', 'army', 'arrested', 'bihar', 'bjp', 'bjp mla', 'case', 'cbi', 'chhattisgarh', 'chief', 'cji', 'class', 'cm', 'cong', 'congress', 'cr', 'cyclone', 'day', 'dead', 'death', 'delhi', 'dies', 'ec', 'encounter', 'ex', 'family', 'fani', 'gandhi', 'gets', 'girl', 'govt', 'gujarat', 'hc', 'held', 'india', 'indian', 'injured', 'karnataka', 'kejriwal', 'kerala', 'killed', 'lakh', 'leader', 'lok', 'lok sabha', 'maharashtra', 'mamata', 'man', 'mayawati', 'minister', 'mla', 'mlas', 'modi', 'mp', 'mumbai', 'murder', 'new', 'odisha', 'old', 'pak', 'party', 'people', 'pm', 'pm modi', 'police', 'poll', 'polling', 'polls', 'pradesh', 'pragya', 'priyanka', 'punjab', 'rahul', 'rajasthan', 'rape', 'remark', 'report', 'sabha', 'sadhvi', 'sadhvi pragya', 'sc', 'seats', 'shah', 'singh', 'son', 'students', 'taka', 'tmc', 'unnao', 'video', 'vote', 'wb', 'wife', 'woman', 'women', 'year', 'yogi', 'yr', 'yr old'] 100
['afghanistan', 'afridi', 'archer', 'australia', 'ball', 'barcelona', 'bat', 'bat

# Tagging

In [13]:
pred_labels = ['national_', 'sports_', 'world_', 'politics_', 'technology_', 'entertainment_', 'hatke_']

In [14]:
len(feats)

7

In [15]:
def bincount(x, f):
    for _ in x.split():
        if _ in f:
            return 1
    return 0

In [16]:
i = 0
for f in feats:
    print(f'{i+1} / {len(feats)}')
    data[pred_labels[i]] = data['news_c'].apply(lambda x : bincount(x, f))
    i += 1
    clear_output(wait=True)

7 / 7


In [17]:
data

Unnamed: 0,news,national,sports,world,politics,technology,entertainment,hatke,news_c,national_,sports_,world_,politics_,technology_,entertainment_,hatke_
0,"7 IITs among India's top higher education institutes, IIT Madras tops",1,0,0,0,0,0,0,iits india higher education institutes iit madras tops,1,1,1,1,1,1,0
1,They forget she's 14: Devgn on trolls criticising daughter's dress,0,0,0,0,0,1,0,forget devgn trolls criticising daughter dress,0,0,0,0,0,1,0
2,Haven't earned so much that I can wear new clothes every day: Janhvi,0,0,0,0,0,1,0,earned wear new clothes day janhvi,1,1,1,0,1,0,0
3,Fans in US sing Desi Girl for Priyanka outside Jonas Brothers concert,0,0,0,0,0,1,0,fans sing desi girl priyanka outside jonas brothers concert,1,1,0,1,0,1,0
4,"Dhoni bhai was very angry, he hugged me after the match: Chahar",0,1,0,0,0,0,0,dhoni bhai angry hugged match chahar,0,1,0,0,0,0,0
5,Akram criticises Pak players for eating biryani ahead of World Cup,0,1,0,0,0,0,0,akram criticises pak players eating biryani ahead world cup,1,1,1,0,1,0,1
6,"Indian Navy officer risks life, saves man from drowning at Kerala beach",1,0,0,0,0,0,0,indian navy officer risks life saves man drowning kerala beach,1,1,1,0,1,1,1
7,"Prisoners sue US jail over mice, maggots falling on dining tables",0,0,1,0,0,0,0,prisoners sue jail mice maggots falling dining tables,0,0,0,0,0,0,0
8,Female Indian doctor missing in UK since April 3; probe underway,0,0,1,0,0,0,0,female indian doctor missing uk april probe underway,1,1,1,0,1,0,1
9,"Scandal-hit Korean Air Chairman dies at 70, weeks after board ouster",0,0,0,0,0,0,0,scandal hit korean air chairman dies weeks board ouster,1,0,1,0,0,0,0


# Results

In [18]:
from sklearn.metrics import confusion_matrix, classification_report

In [19]:
for i in range(7):
    print(labels[i])
    print(confusion_matrix(data[pred_labels[i]], data[labels[i]]))
    print(classification_report(data[pred_labels[i]], data[labels[i]]))

national
[[4945  585]
 [4711 3332]]
              precision    recall  f1-score   support

           0       0.51      0.89      0.65      5530
           1       0.85      0.41      0.56      8043

   micro avg       0.61      0.61      0.61     13573
   macro avg       0.68      0.65      0.60     13573
weighted avg       0.71      0.61      0.60     13573

sports
[[7974  400]
 [2916 2283]]
              precision    recall  f1-score   support

           0       0.73      0.95      0.83      8374
           1       0.85      0.44      0.58      5199

   micro avg       0.76      0.76      0.76     13573
   macro avg       0.79      0.70      0.70     13573
weighted avg       0.78      0.76      0.73     13573

world
[[6476  181]
 [5479 1437]]
              precision    recall  f1-score   support

           0       0.54      0.97      0.70      6657
           1       0.89      0.21      0.34      6916

   micro avg       0.58      0.58      0.58     13573
   macro avg       0.71  

# Tagging Tweets

In [20]:
tweets = pd.read_csv('E:/TOCT/TOCT.csv')

In [21]:
i = 0
for f in feats:
    print(f'{i+1} / {len(feats)}')
    tweets[pred_labels[i]] = tweets['News'].progress_apply(lambda x : bincount(cleanText(x, True, False, False, False), f))
    i += 1
    clear_output(wait=True)

7 / 7


HBox(children=(IntProgress(value=0, max=32007), HTML(value='')))




In [22]:
tweets[['News']+pred_labels]

Unnamed: 0,News,national_,sports_,world_,politics_,technology_,entertainment_,hatke_
0,"Haha black and brown kids statistically graduate with Bachelor’s degrees less then white kids, everyone knows that… https://t.co/9CvIcj8cxC",0,0,0,0,0,0,1
1,"The NCAA Management Committee canceled five games Tuesday because of bad weather, and at the same time, announced a… https://t.co/4AZb04Ge60",0,1,1,0,0,0,0
2,I hope in my lifetime I get to toast to the downfall of the totally corrupt bullshit organization that is the @NCAA https://t.co/89BQ7qBdVn,0,0,0,0,0,0,0
3,I never wished death even for my enemies.... May you live and feel the pain of every kashmiri that died horrendously #ShusmaSwaraj,1,0,1,0,0,0,0
4,Our fo former external affairs minister Late Shri #shusmaswaraj had slept in deep. May #God God send him to heaven.… https://t.co/TiIfhzD5Ri,1,0,1,1,0,0,0
5,Best people die early\r\nShe was one of the finest leader finest Foreign minister\r\nMam RIP \r\nWe pray for you ❤️… https://t.co/f2Nqtv61JM,1,1,1,1,0,1,1
6,"Okay first of all, how dare Simon send them back there like that only to break one of their hearts!? I really thoug… https://t.co/33KzmHuFyE",0,0,0,1,0,1,0
7,Omg I’m so tired of people advancing on AGT (or any competition show) just cuz they’re military. I love service mem… https://t.co/K5tDgwAMsV,1,0,1,1,0,1,1
8,Crying like a freaking baby at #AGT last two boys both should have gone through over the dancer!!,0,0,0,1,0,1,0
9,I'm so proud that I'm the part of these fandom family 😍🔥\r\nLove you guys soooo much ❤️@MisElizaJane \r\n@WildpipM… https://t.co/pXp5fsbm46,1,0,0,0,0,0,0


# Summing Up Tags

In [23]:
for i in tqdm(range(0, len(tweets)-1, 3)):
    for c in pred_labels:
        if tweets[c][i] + tweets[c][i+1] + tweets[c][i+2] >=2:
            tweets.at[i, c], tweets.at[i+1, c], tweets.at[i+2, c] = 1, 1, 1
        else:
            tweets.at[i, c], tweets.at[i+1, c], tweets.at[i+2, c]  = 0, 0, 0

HBox(children=(IntProgress(value=0, max=10669), HTML(value='')))




In [24]:
tweets[['News']+pred_labels]

Unnamed: 0,News,national_,sports_,world_,politics_,technology_,entertainment_,hatke_
0,"Haha black and brown kids statistically graduate with Bachelor’s degrees less then white kids, everyone knows that… https://t.co/9CvIcj8cxC",0,0,0,0,0,0,0
1,"The NCAA Management Committee canceled five games Tuesday because of bad weather, and at the same time, announced a… https://t.co/4AZb04Ge60",0,0,0,0,0,0,0
2,I hope in my lifetime I get to toast to the downfall of the totally corrupt bullshit organization that is the @NCAA https://t.co/89BQ7qBdVn,0,0,0,0,0,0,0
3,I never wished death even for my enemies.... May you live and feel the pain of every kashmiri that died horrendously #ShusmaSwaraj,1,0,1,1,0,0,0
4,Our fo former external affairs minister Late Shri #shusmaswaraj had slept in deep. May #God God send him to heaven.… https://t.co/TiIfhzD5Ri,1,0,1,1,0,0,0
5,Best people die early\r\nShe was one of the finest leader finest Foreign minister\r\nMam RIP \r\nWe pray for you ❤️… https://t.co/f2Nqtv61JM,1,0,1,1,0,0,0
6,"Okay first of all, how dare Simon send them back there like that only to break one of their hearts!? I really thoug… https://t.co/33KzmHuFyE",0,0,0,1,0,1,0
7,Omg I’m so tired of people advancing on AGT (or any competition show) just cuz they’re military. I love service mem… https://t.co/K5tDgwAMsV,0,0,0,1,0,1,0
8,Crying like a freaking baby at #AGT last two boys both should have gone through over the dancer!!,0,0,0,1,0,1,0
9,I'm so proud that I'm the part of these fandom family 😍🔥\r\nLove you guys soooo much ❤️@MisElizaJane \r\n@WildpipM… https://t.co/pXp5fsbm46,1,0,0,0,0,1,0
