# Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from wordcloud import WordCloud
from PIL import Image

In [3]:
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from IPython.display import clear_output

# Data

In [4]:
data = pd.read_csv('InshortsScraped.csv')
print(data.shape)

(9230, 8)


In [5]:
data.head()

Unnamed: 0,news,national,sports,world,politics,technology,entertainment,hatke
0,7 IITs among India's top higher education inst...,1,0,0,0,0,0,0
1,They forget she's 14: Devgn on trolls criticis...,0,0,0,0,0,1,0
2,Haven't earned so much that I can wear new clo...,0,0,0,0,0,1,0
3,Fans in US sing Desi Girl for Priyanka outside...,0,0,0,0,0,1,0
4,"Dhoni bhai was very angry, he hugged me after ...",0,1,0,0,0,0,0


In [6]:
data.describe()

Unnamed: 0,national,sports,world,politics,technology,entertainment,hatke
count,9230.0,9230.0,9230.0,9230.0,9230.0,9230.0,9230.0
mean,0.295991,0.17779,0.117551,0.219285,0.174648,0.131961,0.006392
std,0.456512,0.382356,0.322094,0.413784,0.379686,0.338467,0.0797
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Data Preprocessing

## Cleaning

In [7]:
import nltk, re
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import langdetect

def spell_correct(text):
    text = re.sub(r"can\'t", "can not", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\0s", "0", text)    
    return text

def remove_url(text):
    URL_REGEX = re.compile(r'''((http[s]?://)[^ <>'"{}|\\^`[\]]*)''')
    return URL_REGEX.sub(r' ', text)

remove_punc = lambda x : re.sub(r"\W", ' ', x)

remove_num = lambda x : re.sub(r"\d", ' ', x)

remove_extra_spaces = lambda x : re.sub(r"\s+", ' ', x)

lower_case = lambda x : x.lower()

with open('stopwords.txt') as f:
    sw = map(lambda x : x.strip(), f.readlines())
stop_words = set(nltk.corpus.stopwords.words('english'))|set(sw)
remove_stopwords = lambda x: ' '.join(word for word in x.split() if word not in stop_words)

ps = PorterStemmer()
ps_stem = lambda x: ' '.join(ps.stem(word) for word in x.split())

wnl = WordNetLemmatizer()
wnl_lemmatize = lambda x: ' '.join(wnl.lemmatize(word) for word in x.split())

def tag_pos(x):
    tag_list =  nltk.pos_tag(nltk.word_tokenize(x))
    pos = ""
    for t in tag_list:
        pos += t[0] +'(' + t[1] +')' + ' '
    return pos

def cleanText(x, rsw, stm, lem, tgps):
    x = str(x)
    x = remove_url(x)
    x = lower_case(x)
    x = spell_correct(x)
    x = remove_punc(x)
    x = remove_num(x)
    x = remove_extra_spaces(x)
    
    if rsw:
        x = remove_stopwords(x)
    if stm:
        x = ps_stem(x)
    if lem:
        x = wnl_lemmatize(x)
    if tgps:
        x = tag_pos(x)
    return x

In [8]:
data['news_c'] = data['news'].apply(lambda x : cleanText(x, True, False, False, False))

## Feature Generation

In [10]:
labels = ['national', 'sports', 'world', 'politics', 'technology', 'entertainment', 'hatke']

def top100(x):
     return pd.Series(' '.join(data[data[x] == 1]['news_c']).split()).value_counts()[:100].index

feats = set()
for c in labels:
    feats = feats|set(top100(c))
feats = list(feats)
n = len(feats)
print(feats, len(feats))

['pro', 'phones', 'sidhu', 'feel', 'wife', 'inside', 'win', 'smuggler', 'like', 'daughter', 'challenge', 'deepika', 'amit', 'rally', 'using', 'rescued', 'cricketer', 'musk', 'batsman', 'send', 'aap', 'remake', 'woman', 'serial', 'launch', 'st', 'life', 'old', 'taapsee', 'ai', 'madrid', 'biopic', 'bsp', 'crore', 'nz', 'l', 'team', 'dame', 'raises', 'yrs', 'nd', 'based', 'coach', 'criminal', 'n', 'know', 'post', 'ayushmann', 'cm', 'french', 'naidu', 'sadhvi', 'bn', 'csk', 'whatsapp', 'image', 'open', 'car', 'singh', 'nuclear', 'test', 'cji', 'big', 'leak', 'eye', 'sit', 'rival', 'attack', 'tablet', 'arrested', 'breast', 'wedding', 'icc', 'yogurt', 'khan', 'sauna', 'lanka', 'record', 'strong', 'pragya', 'football', 'flipkart', 'ball', 'runs', 'working', 'class', 'odi', 'starrer', 'title', 'baby', 'anatomy', 'held', 'pregnant', 'hit', 'plans', 'yogi', 'makes', 'azhar', 'jail', 'penises', 'time', 'nadal', 'maker', 'people', 'mumbai', 'terror', 'video', 'yuvraj', 'row', 'chicken', 'reddit', 

In [12]:
from nltk.corpus import wordnet

def synonyms(x):
    synonyms_list = [x]
    for syn in wordnet.synsets(x):
        for s in syn.lemmas():
            synonyms_list.append(s.name())
    return set(synonyms_list)

def bincount(x, f):
    for _ in x.split():
        if _ in synonyms(f):
            return 1
    return 0

In [13]:
i = 0
for f in feats:
    i += 1
    print(f'{i:{3}} / {n}')
    data[f] = data['news_c'].apply(lambda x : bincount(x, f))
    clear_output(wait=True)

485 / 485


In [14]:
X = data[feats]
Y = data[labels]

# Machine Learning

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.naive_bayes import BernoulliNB

In [16]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)

In [17]:
for col in labels:
    print(col)
    model = BernoulliNB()
    model.fit(X_train, Y_train[col])
    y_pred = model.predict(X_test)
    print(confusion_matrix(Y_test[col], y_pred))
    print(classification_report(Y_test[col], y_pred))

national
[[1583  387]
 [ 251  548]]
              precision    recall  f1-score   support

           0       0.86      0.80      0.83      1970
           1       0.59      0.69      0.63       799

   micro avg       0.77      0.77      0.77      2769
   macro avg       0.72      0.74      0.73      2769
weighted avg       0.78      0.77      0.77      2769

sports
[[2147  107]
 [ 127  388]]
              precision    recall  f1-score   support

           0       0.94      0.95      0.95      2254
           1       0.78      0.75      0.77       515

   micro avg       0.92      0.92      0.92      2769
   macro avg       0.86      0.85      0.86      2769
weighted avg       0.91      0.92      0.91      2769

world
[[2593    0]
 [   0  176]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2593
           1       1.00      1.00      1.00       176

   micro avg       1.00      1.00      1.00      2769
   macro avg       1.00  

In [18]:
clf = [BernoulliNB()]*7
for i in range(7):
    clf[i].fit(X, Y[labels[i]])

# Predictions

In [21]:
def classify(q):
    q = cleanText(str(q), True, False, False, False)
    feat_q = []
    for f in feats:
        feat_q.append(bincount(q, f))
    print(feat_q)
    class_tags= []
    for i in range(7):
        if clf[i].predict([feat_q])[0]:
            class_tags.append(labels[i])
    return class_tags

In [22]:
classify('I thought India will score about 400 runs today. Boring start #BeThe12thMan #INDvsAFG @CocaCola_Ind')

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

[]

In [24]:
clf[0].feature_log_prob_ 

array([[-5.79181516, -5.17277595, -5.5686716 , -4.81995457, -4.49904685,
        -5.8281828 , -4.01203188, -8.02540738, -4.26420726, -4.25648522,
        -5.8281828 , -5.98852545, -5.25281865, -5.4104476 , -4.74199303,
        -5.94596584, -5.8281828 , -5.27387206, -6.07949723, -3.66869855,
        -4.90451196, -6.48496234, -4.12680739, -5.17277595, -5.48643351,
        -3.8875777 , -4.79328633, -3.69467404, -6.55907031, -5.15372775,
        -6.07949723, -5.5686716 , -5.72282228, -3.90908391, -5.75672384,
        -4.98088494, -4.21136478, -5.72282228, -3.85102011, -3.18648346,
        -4.78021424, -5.29537827, -4.33652792, -7.04457812, -5.19219403,
        -4.51884948, -3.64338074, -6.17958069, -3.78648159, -5.21199666,
        -5.8281828 , -5.29537827, -5.46045802, -5.59765914, -5.59765914,
        -5.15372775, -4.28773776, -4.80653155, -4.49904685, -6.07949723,
        -4.24121774, -5.75672384, -4.28773776, -6.63911302, -5.0809684 ,
        -4.75457181, -4.06142463, -4.04884585, -8.4