In [None]:
!pip install -U scikit-learn==0.24.1

In [2]:
import pandas as pd
import sqlite3

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
def create_connection(db_file):
    """ create a database connection to the SQLite database
        specified by the db_file
    :param db_file: database file
    :return: Connection object, or None when sqlite3 could not open the database
    """
    try:
        return sqlite3.connect(db_file)
    except sqlite3.Error as e:
        # Catch sqlite3.Error explicitly: a bare ``Error`` is not defined in this
        # notebook, so a failed connect used to surface as a NameError instead of
        # being reported and turned into the documented None return value.
        print(e)
        return None

In [5]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Resources needed by word_tokenize ('punkt') and by the WordNet lemmatizer ('wordnet').
nltk.download('punkt')
nltk.download('wordnet')

# Single lemmatizer instance shared by process_n.
lemmatizer = WordNetLemmatizer()

### Lemmatization by noun (further preprocessing)
def process_n(text):
    """Tokenize ``text``, lemmatize every token as a noun, and return the
    lemmas joined back together with single spaces.
    """
    noun_lemmas = (
        lemmatizer.lemmatize(token, pos='n') for token in word_tokenize(text)
    )
    return " ".join(noun_lemmas)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [6]:
### Read every (article, label) pair from the SQLite table, leaving out the regional/city labels.
rows = []
conn = None  # bound before the try so the finally clause never touches an unbound/None connection
try:
    conn = create_connection('/content/drive/MyDrive/ArticlesDatabaseM.db')
    if conn is None:
        # create_connection is documented to return None when the database cannot be opened;
        # route that case through the same error handler instead of failing on conn.cursor().
        raise sqlite3.Error("could not open the database connection")
    curr = conn.cursor()
    # EXCEPT keeps the distinct (article, label) rows of the first SELECT that are not produced
    # by the second one. String literals use single quotes: SQLite only accepts double-quoted
    # strings as a legacy fallback and otherwise reads them as identifiers.
    query = """SELECT article, label FROM articles_tb
               EXCEPT
               SELECT article, label FROM articles_tb
               WHERE label IN ('Chennai', 'Karnataka', 'Kerala', 'Tamil Nadu',
                               'Hyderabad', 'Mumbai', 'Bengaluru')"""
    curr.execute(query)
    rows = curr.fetchall()

    # No commit here: the cell only reads, so there is nothing to persist.
    # NOTE(review): the message names articles_link_tb although the query reads articles_tb;
    # the text is kept unchanged so it still matches the recorded output.
    print("Python Variables successfully selected from articles_link_tb " )

    curr.close()

except sqlite3.Error as error:
    print("Failed to select Python variable from sqlite table", error)
finally:
    if conn is not None:
        conn.close()
        print("The SQLite connection is closed")

# Each row is an (article, label) pair: lemmatize the text, keep the label as-is.
Articles = [process_n(article) for article, _ in rows]
Class = [label for _, label in rows]

Python Variables successfully selected from articles_link_tb 
The SQLite connection is closed


In [7]:
### Build a pandas DataFrame from the extracted articles and labels.
data = dict(Article=Articles, Class=Class)

# One column per key of the dict.
df = pd.DataFrame(data)

# Plain-text preview of the frame.
print(df)

                                                 Article          Class
0                                                                 Books
1                                                               Cricket
2                                                             Education
3                                                             Elections
4                                                         Entertainment
...                                                  ...            ...
48958  zumba bring back gate hell balance equation li...        Society
48959  çelebi aviation hold grind handle licence hyde...       Business
48960  école intuit lab open application école intuit...      Education
48961  ït plan shoot say agasthya karthikeyan photogr...   Life & Style
48962  škoda auto india unveil refresh superb range n...       Business

[48963 rows x 2 columns]


In [8]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Learn the category names, then store their integer ids next to the other columns in `data`.
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(data['Class'])
data['Category_Class'] = label_encoder.transform(data['Class'])

In [9]:
# Modelling frame: lemmatized text, category name, and encoded category id.
Thehindu = pd.DataFrame({
    'Article': data['Article'],
    'Class': data['Class'],
    'Category_Class': data['Category_Class'],
})

# Articles per category, followed by a reproducible random sample of rows.
print(Thehindu['Class'].value_counts())
Thehindu.sample(5, random_state=0)

Life & Style     4641
Business         4434
Sci-Tech         3772
Entertainment    3723
Other Sports     3664
Elections        3593
Society          3591
Tennis           2915
Education        2743
International    2726
National         2520
Football         2491
Cricket          2437
Books            1944
Health           1905
Environment      1864
Name: Class, dtype: int64


Unnamed: 0,Article,Class,Category_Class
18336,high voltage entertainment former national cha...,Tennis,15
36003,russian track federation president step aside ...,Other Sports,12
9336,dance prince lady good catch eye horse exercis...,Other Sports,12
11325,duet hindustani classical music gimmick case e...,Entertainment,5
20536,indian tourist plan vacation southeast asia fa...,Life & Style,10


In [10]:
# Take the number of classes from the fitted encoder rather than hard-coding 16,
# so the list stays correct if the set of labels in the database changes.
class_num = list(range(len(label_encoder.classes_)))

# class_list[i] is the category name encoded as integer i.
class_list = list(label_encoder.inverse_transform(class_num))
print(class_list)

['Books', 'Business', 'Cricket', 'Education', 'Elections', 'Entertainment', 'Environment', 'Football', 'Health', 'International', 'Life & Style', 'National', 'Other Sports', 'Sci-Tech', 'Society', 'Tennis']


In [11]:
import pickle
from sklearn.feature_extraction.text import CountVectorizer


# Bag-of-words counts: learn the vocabulary and count terms in one pass.
# Despite its name, X_train_counts covers every article in Thehindu, not only the training rows.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(Thehindu['Article'])


In [12]:
from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the raw term counts with TF-IDF.
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): X_train_tfidf comes from a vectorizer/transformer fitted on all rows before
# this split, so held-out rows influenced the vocabulary and IDF weights. Consider splitting
# the raw text first and fitting on the training part only.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_tfidf,
    Thehindu['Category_Class'],
    test_size=0.25,
    random_state=42,
)

In [16]:
# Dimensions of the training feature matrix.
X_train.shape

(36722, 214370)

In [17]:
# Dimensions of the training label vector; its length should match the row count of X_train.
y_train.shape

(36722,)

In [19]:
from sklearn.neural_network import MLPClassifier

# Two hidden layers (100 and 50 units) with ReLU; verbose=True prints the loss per iteration.
clf_neural = MLPClassifier(
    hidden_layer_sizes=(100, 50),
    activation='relu',
    alpha=1e-5,
    random_state=1,
    verbose=True,
)

# fit() trains in place and returns the same estimator, so no rebinding is needed.
clf_neural.fit(X_train, y_train)

# Predictions on both partitions so training and held-out accuracy can be compared.
model_train_pred_neural = clf_neural.predict(X_train)
model_prediction_neural = clf_neural.predict(X_test)

Iteration 1, loss = 1.36770474
Iteration 2, loss = 0.31438625
Iteration 3, loss = 0.12060038
Iteration 4, loss = 0.04392108
Iteration 5, loss = 0.01901581
Iteration 6, loss = 0.01103989
Iteration 7, loss = 0.00733460
Iteration 8, loss = 0.00585118
Iteration 9, loss = 0.00475521
Iteration 10, loss = 0.00397098
Iteration 11, loss = 0.00374809
Iteration 12, loss = 0.00340828
Iteration 13, loss = 0.00313277
Iteration 14, loss = 0.00280444
Iteration 15, loss = 0.00305886
Iteration 16, loss = 0.00278191
Iteration 17, loss = 0.00266400
Iteration 18, loss = 0.00266689
Iteration 19, loss = 0.00233027
Iteration 20, loss = 0.00236825
Iteration 21, loss = 0.00248455
Iteration 22, loss = 0.00214052
Iteration 23, loss = 0.00249962
Iteration 24, loss = 0.00241671
Iteration 25, loss = 0.00319207
Iteration 26, loss = 0.00207648
Iteration 27, loss = 0.00215778
Iteration 28, loss = 0.00244980
Iteration 29, loss = 0.00209449
Iteration 30, loss = 0.00240606
Iteration 31, loss = 0.00368124
Iteration 32, los

In [20]:
# accuracy_score takes (y_true, y_pred): pass the ground-truth labels first, matching the
# argument order used for classification_report and confusion_matrix in this notebook.
print('accuracy on training set %s' % accuracy_score(y_train, model_train_pred_neural))
print('accuracy on test set %s' % accuracy_score(y_test, model_prediction_neural))

accuracy on training set 0.9996732204128316
accuracy on test set 0.8382485091087329


In [21]:
# Per-class precision / recall / F1 on the held-out rows, labelled with the category names.
print(
    classification_report(
        y_test,
        model_prediction_neural,
        target_names=class_list,
    )
)

               precision    recall  f1-score   support

        Books       0.87      0.71      0.78       544
     Business       0.90      0.91      0.90      1078
      Cricket       0.98      0.98      0.98       633
    Education       0.81      0.90      0.85       686
    Elections       0.91      0.94      0.93       920
Entertainment       0.81      0.86      0.83       977
  Environment       0.71      0.77      0.74       473
     Football       0.99      0.99      0.99       636
       Health       0.73      0.66      0.69       475
International       0.81      0.85      0.83       642
 Life & Style       0.78      0.67      0.72      1164
     National       0.73      0.67      0.70       618
 Other Sports       0.97      0.97      0.97       911
     Sci-Tech       0.84      0.75      0.79       938
      Society       0.60      0.73      0.66       857
       Tennis       0.99      0.98      0.98       689

     accuracy                           0.84     12241
    macr

In [26]:
import pickle

# Persist the trained classifier. The context manager closes the file deterministically,
# so the pickle is fully flushed to Drive instead of relying on garbage collection.
with open("/content/drive/MyDrive/tfidf_clf_model.pkl", "wb") as model_file:
    pickle.dump(clf_neural, model_file)

In [24]:
#SAVE WORD VECTOR
# Only the learned vocabulary mapping is stored, not the fitted vectorizer object.
# `with` closes the file handle that the bare open() call used to leak.
with open("/content/drive/MyDrive/count_vector.pkl", "wb") as vocab_file:
    pickle.dump(count_vect.vocabulary_, vocab_file)

In [25]:
#SAVE TF-IDF
# Store the fitted TF-IDF transformer; `with` closes the file handle that the bare open() leaked.
with open("/content/drive/MyDrive/tfidf.pkl", "wb") as tfidf_file:
    pickle.dump(tfidf_transformer, tfidf_file)

In [27]:
from sklearn.metrics import confusion_matrix  

# Rows are the true classes and columns the predicted classes, both in encoded-id order.
confusion_mat = confusion_matrix(
    y_true=y_test,
    y_pred=model_prediction_neural,
)
print(confusion_mat)

[[386   3   5  11   0  33   4   0   1   5   7   4   1   7  77   0]
 [  0 984   0   6   1   1   6   0   3  23   6  32   0  15   1   0]
 [  1   1 622   0   0   1   0   1   0   2   0   3   0   0   1   1]
 [  5   3   0 620   1   5   1   0   5   1  13   2   0  11  19   0]
 [  0   2   0   0 862   0   1   0   1   4   0  41   0   1   8   0]
 [ 12   2   1   4   3 837   7   0   1   6  29   2   0   4  67   2]
 [  1   5   0   2   0   5 366   0   4   7  17   7   0  24  35   0]
 [  0   0   0   0   0   0   0 628   0   5   0   1   1   0   1   0]
 [  1   5   0   9   1   3   8   0 313  14  49  13   1  41  17   0]
 [  2  14   0   2   1   5  11   0   6 547   3  33   0  11   6   1]
 [ 10  11   2  27   1  62  35   3  39   3 784   4  10   9 163   1]
 [  3  43   1  12  62   4  11   0  15  30   0 416   0   8  13   0]
 [  0   0   2   1   2   0   0   4   0   3   4   4 884   0   3   4]
 [  4  25   1  43   4  15  52   1  34  16  18   5   3 707  10   0]
 [ 19   1   2  28   5  62  12   0   7   6  74   4   3   6 628 

In [29]:
# Sample article text; the load-model cell wraps it in a list and classifies it.
text = 'indias export rise billion june account healthy growth shipments sectors engineer gems jewellery petroleum products accord preliminary data commerce ministry import rise billion period data show export sectors engineer gems jewellery petroleum products record healthy growth rat export grow billion first week month billion second week month accord data export april may fiscal year jump billion billion period last year'

In [30]:
#LOAD MODEL
# NOTE: pickle.load can execute arbitrary code, so only load files written by this notebook.
# Each file is opened in a `with` block so its handle is closed as soon as loading finishes.
with open("/content/drive/MyDrive/count_vector.pkl", "rb") as vocab_file:
    # Rebuild the vectorizer from the saved vocabulary mapping.
    loaded_vec = CountVectorizer(vocabulary=pickle.load(vocab_file))
with open("/content/drive/MyDrive/tfidf.pkl", "rb") as tfidf_file:
    loaded_tfidf = pickle.load(tfidf_file)
with open("/content/drive/MyDrive/tfidf_clf_model.pkl", "rb") as model_file:
    loaded_model = pickle.load(model_file)

# The transformers expect an iterable of documents, so wrap the single text in a list.
docs_new = [text]

# Same pipeline as in training: counts -> TF-IDF -> classifier.
X_new_counts = loaded_vec.transform(docs_new)
X_new_tfidf = loaded_tfidf.transform(X_new_counts)
predicted = loaded_model.predict(X_new_tfidf)

# Translate the predicted integer id back to its category name.
print(class_list[predicted[0]])

Business
