In [None]:
!pip install -U scikit-learn==0.24.1
!pip install gensim==4.0.1

In [8]:
import pandas as pd
import sqlite3

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
def create_connection(db_file):
    """ create a database connection to the SQLite database
        specified by the db_file
    :param db_file: database file
    :return: Connection object or None
    """
    conn = None
    try:
        conn = sqlite3.connect(db_file)
    except Error as e:
        print(e)

    return conn

In [10]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
nltk.download('wordnet')

### Lemmatization by noun (further preprocessing)
def process_n(text):
    txt = []
    word_tokens = word_tokenize(text) 
    for w in word_tokens:
        txt.append(lemmatizer.lemmatize(w, pos ='n'))
    text_p = " ".join(txt)
    return text_p

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [11]:
### this code extract article text with its label from sqlite table and store in the list Articles and Class
rows = []
try:
    conn = create_connection('/content/drive/MyDrive/ArticlesDatabaseM.db')
    curr = conn.cursor()
    query = """SELECT article,label FROM articles_tb EXCEPT SELECT article,label FROM articles_tb WHERE label = "Chennai" OR label = "Karnataka" OR label = "Kerala" OR label = "Tamil Nadu" OR label = "Hyderabad" OR label = "Mumbai" OR label = "Bengaluru"  """
    curr.execute(query)
    rows = curr.fetchall()

    conn.commit()
    print("Python Variables successfully selected from articles_link_tb " )

    curr.close()

except sqlite3.Error as error:
    print("Failed to select Python variable from sqlite table", error)
finally:
      conn.close()
      print("The SQLite connection is closed")

Articles = []
Class = []
for row in rows:
    Articles.append(process_n(row[0]))
    Class.append(row[1])

Python Variables successfully selected from articles_link_tb 
The SQLite connection is closed


In [12]:
### creating pandas dataframe from the extracted data
data = {'Article':Articles,
		'Class':Class}

# Create DataFrame
df = pd.DataFrame(data)

# Print the output.
print(df)

                                                 Article          Class
0                                                                 Books
1                                                               Cricket
2                                                             Education
3                                                             Elections
4                                                         Entertainment
...                                                  ...            ...
48958  zumba bring back gate hell balance equation li...        Society
48959  çelebi aviation hold grind handle licence hyde...       Business
48960  école intuit lab open application école intuit...      Education
48961  ït plan shoot say agasthya karthikeyan photogr...   Life & Style
48962  škoda auto india unveil refresh superb range n...       Business

[48963 rows x 2 columns]


In [13]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

label_encoder = preprocessing.LabelEncoder() 
data['Category_Class']= label_encoder.fit_transform(data['Class'])

In [14]:
Thehindu = pd.DataFrame()
Thehindu['Article'] = data['Article']
Thehindu['Class'] = data['Class']
Thehindu['Category_Class'] = data['Category_Class']
print(Thehindu['Class'].value_counts())
Thehindu.sample(5, random_state=0)


Life & Style     4641
Business         4434
Sci-Tech         3772
Entertainment    3723
Other Sports     3664
Elections        3593
Society          3591
Tennis           2915
Education        2743
International    2726
National         2520
Football         2491
Cricket          2437
Books            1944
Health           1905
Environment      1864
Name: Class, dtype: int64


Unnamed: 0,Article,Class,Category_Class
18336,high voltage entertainment former national cha...,Tennis,15
36003,russian track federation president step aside ...,Other Sports,12
9336,dance prince lady good catch eye horse exercis...,Other Sports,12
11325,duet hindustani classical music gimmick case e...,Entertainment,5
20536,indian tourist plan vacation southeast asia fa...,Life & Style,10


In [15]:
class_num = []
for x in range(16):
  class_num.append(x)

class_list = list(label_encoder.inverse_transform(class_num))
print(class_list)


['Books', 'Business', 'Cricket', 'Education', 'Elections', 'Entertainment', 'Environment', 'Football', 'Health', 'International', 'Life & Style', 'National', 'Other Sports', 'Sci-Tech', 'Society', 'Tennis']


In [16]:
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
import gensim
from gensim.models.doc2vec import TaggedDocument
import re


  from pandas import Panel


In [17]:
def label_sentences(corpus, label_type):
    """
    Gensim's Doc2Vec implementation requires each document/paragraph to have a label associated with it.
    We do this by using the TaggedDocument method. The format will be "TRAIN_i" or "TEST_i" where "i" is
    a dummy index of the post.
    """
    labeled = []

    for i, v in enumerate(corpus):
        label = label_type + '_' + str(i)
        labeled.append(TaggedDocument(v.split(), [label]))
    
    return labeled


X_train, X_test, y_train, y_test = train_test_split(Thehindu.Article, Thehindu.Category_Class, random_state=80, test_size=0.25)

X_train = label_sentences(X_train, 'Train')
X_test = label_sentences(X_test, 'Test')
all_data = X_train + X_test

In [24]:
model_dbow = Doc2Vec(dm=0, vector_size=300, negative = 5 , min_count= 2 , alpha=0.065, min_alpha=0.065, workers = 16)
model_dbow.build_vocab([x for x in tqdm(all_data)])

for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(all_data)]), total_examples=len(all_data), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|██████████| 48963/48963 [00:00<00:00, 2383040.99it/s]
100%|██████████| 48963/48963 [00:00<00:00, 2523788.36it/s]
100%|██████████| 48963/48963 [00:00<00:00, 2172791.21it/s]
100%|██████████| 48963/48963 [00:00<00:00, 2324587.77it/s]
100%|██████████| 48963/48963 [00:00<00:00, 2462359.50it/s]
100%|██████████| 48963/48963 [00:00<00:00, 2772028.17it/s]
100%|██████████| 48963/48963 [00:00<00:00, 2382626.28it/s]
100%|██████████| 48963/48963 [00:00<00:00, 2364847.33it/s]
100%|██████████| 48963/48963 [00:00<00:00, 2315910.81it/s]
100%|██████████| 48963/48963 [00:00<00:00, 2277311.87it/s]
100%|██████████| 48963/48963 [00:00<00:00, 2440645.88it/s]
100%|██████████| 48963/48963 [00:00<00:00, 2586925.99it/s]
100%|██████████| 48963/48963 [00:00<00:00, 2468189.49it/s]
100%|██████████| 48963/48963 [00:00<00:00, 2615690.48it/s]
100%|██████████| 48963/48963 [00:00<00:00, 2407372.28it/s]
100%|██████████| 48963/48963 [00:00<00:00, 2487924.24it/s]
100%|██████████| 48963/48963 [00:00<00:00, 2364629.49it/

In [20]:
import numpy as np
def get_vectors(model, corpus_size, vectors_size, vectors_type):
    """
    Get vectors from trained doc2vec model
    :param doc2vec_model: Trained Doc2Vec model
    :param corpus_size: Size of the data
    :param vectors_size: Size of the embedding vectors
    :param vectors_type: Training or Testing vectors
    :return: list of vectors
    """
    vectors = np.zeros((corpus_size, vectors_size))
    for i in range(0, corpus_size):
        prefix = vectors_type + '_' + str(i)
        vectors[i] = model.dv[prefix]
    return vectors
    
train_vectors_dbow = get_vectors(model_dbow, len(X_train), 300, 'Train')
test_vectors_dbow = get_vectors(model_dbow, len(X_test), 300, 'Test')

In [23]:
from sklearn.neural_network import MLPClassifier

clf_neural = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(250,150,50), random_state=1 , verbose = True, activation = 'relu')

clf_neural = clf_neural.fit(train_vectors_dbow, y_train)
model_prediction_neural = clf_neural.predict(test_vectors_dbow)
model_train_pred_neural = clf_neural.predict(train_vectors_dbow)

Iteration 1, loss = 1.04341484
Iteration 2, loss = 0.52079026
Iteration 3, loss = 0.45382889
Iteration 4, loss = 0.40325121
Iteration 5, loss = 0.35249930
Iteration 6, loss = 0.30145523
Iteration 7, loss = 0.25089317
Iteration 8, loss = 0.19919706
Iteration 9, loss = 0.15421977
Iteration 10, loss = 0.11180061
Iteration 11, loss = 0.07367363
Iteration 12, loss = 0.04811834
Iteration 13, loss = 0.03262227
Iteration 14, loss = 0.01884664
Iteration 15, loss = 0.01153761
Iteration 16, loss = 0.00738977
Iteration 17, loss = 0.00639589
Iteration 18, loss = 0.00540695
Iteration 19, loss = 0.00545559
Iteration 20, loss = 0.00436054
Iteration 21, loss = 0.00407826
Iteration 22, loss = 0.00354813
Iteration 23, loss = 0.00347560
Iteration 24, loss = 0.12849201
Iteration 25, loss = 0.09492353
Iteration 26, loss = 0.03165399
Iteration 27, loss = 0.00948648
Iteration 28, loss = 0.00393600
Iteration 29, loss = 0.00234713
Iteration 30, loss = 0.00162625
Iteration 31, loss = 0.00134054
Iteration 32, los

In [25]:
print('accuracy on training set %s' % accuracy_score(model_train_pred_neural, y_train))
print('accuracy on test set %s' % accuracy_score(model_prediction_neural, y_test))


accuracy on training set 0.9998093785741518
accuracy on test set 0.813495629442039


In [46]:
print(classification_report(y_test, model_prediction_neural, target_names= class_list))

               precision    recall  f1-score   support

        Books       0.76      0.79      0.77       490
     Business       0.87      0.88      0.88      1109
      Cricket       0.95      0.96      0.95       610
    Education       0.79      0.79      0.79       671
    Elections       0.90      0.94      0.92       919
Entertainment       0.79      0.78      0.78       919
  Environment       0.71      0.72      0.71       492
     Football       0.96      0.96      0.96       614
       Health       0.67      0.70      0.69       460
International       0.82      0.80      0.81       711
 Life & Style       0.70      0.73      0.71      1129
     National       0.69      0.65      0.67       624
 Other Sports       0.94      0.95      0.94       912
     Sci-Tech       0.77      0.76      0.76       992
      Society       0.67      0.61      0.64       905
       Tennis       0.97      0.98      0.98       684

     accuracy                           0.81     12241
    macr

In [18]:
from gensim.test.utils import get_tmpfile

fname = get_tmpfile("/content/drive/MyDrive/doc2vec_model2")

In [None]:
model_dbow.save(fname)

In [19]:
model_dbow = Doc2Vec.load(fname)

In [27]:
from joblib import dump, load
dump(clf_neural, '/content/drive/MyDrive/clf_neural_new2.joblib') 

In [28]:
clf2 = load('/content/drive/MyDrive/clf_neural_new2.joblib') 
model_prediction_neural = clf2.predict(test_vectors_dbow)
model_train_pred_neural = clf2.predict(train_vectors_dbow)
print('accuracy on training set %s' % accuracy_score(model_train_pred_neural, y_train))
print('accuracy on test set %s' % accuracy_score(model_prediction_neural, y_test))


accuracy on training set 0.9998093785741518
accuracy on test set 0.813495629442039


In [40]:
text = 'indias export rise billion june account healthy growth shipments sectors engineer gems jewellery petroleum products accord preliminary data commerce ministry import rise billion period data show export sectors engineer gems jewellery petroleum products record healthy growth rat export grow billion first week month billion second week month accord data export april may fiscal year jump billion billion period last year'

In [41]:
word_tokens = word_tokenize(text) 
vector = model_dbow.infer_vector(word_tokens)
vector = vector.reshape(1, -1)
model_prediction = clf_neural.predict(vector)
class_list[model_prediction[0]]

'Business'