<a href="https://colab.research.google.com/github/SrishtiPatil/supervised-methods-for-text-categorization/blob/main/TopicClassificationAlgo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,f1_score
import nltk
# Download the NLTK 'stopwords' corpus, then keep the English stop words as a
# set; it is passed to the TfidfVectorizer steps of the pipelines further down.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Absolute locations of the three Excel workbooks (train / validation / test).
train_path = '/topics1.xlsx'
validation_path = '/valid_tagged.xlsx'
test_path = '/test_tagged.xlsx'

# Parse 'Sheet1' of each workbook. Using ExcelFile as a context manager closes
# the underlying file handle as soon as the sheet has been read, instead of
# leaving three open handles behind for the lifetime of the kernel.
with pd.ExcelFile(train_path) as xls:
    df_train = xls.parse('Sheet1')
with pd.ExcelFile(validation_path) as xls:
    df_val = xls.parse('Sheet1')
with pd.ExcelFile(test_path) as xls:
    df_test = xls.parse('Sheet1')

In [None]:
# Preview the first rows of the training frame (speaker, message, tags).
df_train.head()

Unnamed: 0,speaker,message,tags
0,0,Are you a fan of Google or Microsoft?,"'PERSON','ORG','NORP'"
1,1,Both are excellent technology they are helpful...,"'ORG',"
2,0,"I'm not a huge fan of Google, but I use it a ...","'GPE','ORG'"
3,1,Google provides online related services and pr...,"'PERSON','LOC','NORP'"
4,0,"Yeah, their services are good. I'm just not a ...","'ORG',"


In [None]:
import re
# Ordered (pattern, replacement) pairs applied by clean_text. Order matters:
# the specific "won't" / "can't" rules must run before the generic "n't" rule,
# otherwise "won't" degrades to "wo not".
_CONTRACTION_RULES = (
  (r"\'s", " "),
  (r"\'ve", " have "),
  (r"won't", "will not "),
  (r"can't", "can not "),
  (r"n't", " not "),
  (r"\'d", " would "),
  (r"\'ll", " will "),
  (r"i'm", "i am "),
  (r"\'re", " are "),
)

def clean_text(text):
  """Lower-case a message and expand common English contractions.

  Args:
    text: the raw message as a ``str``.

  Returns:
    The lower-cased text with contractions expanded and leading/trailing
    spaces removed. Interior whitespace is not collapsed, so an expansion
    may leave a double space behind.
  """
  # Typographic apostrophes (U+2019) would otherwise slip past every rule.
  text = text.lower().replace("\u2019", "'")
  for pattern, replacement in _CONTRACTION_RULES:
    text = re.sub(pattern, replacement, text)
  return text.strip(' ')

def _normalise_message(raw_message):
  """Coerce a cell of the 'message' column to str and run it through clean_text."""
  return clean_text(str(raw_message))

# Clean the message column of every split, in the same order as before
# (train, test, validation).
for split_frame in (df_train, df_test, df_val):
  split_frame['message'] = split_frame['message'].map(_normalise_message)

In [None]:
import ast

def _parse_tags(raw_tags):
  """Parse one cell of the 'tags' column into a tuple of label strings.

  The cells hold comma-separated quoted labels such as "'GPE','ORG'" or
  "'ORG',". ast.literal_eval accepts exactly such literals and, unlike
  eval, cannot execute arbitrary code coming from the spreadsheet.
  """
  parsed = ast.literal_eval(raw_tags)
  # A lone quoted label without a trailing comma parses to a bare str; wrap it
  # so that consumers never end up iterating over its characters.
  return (parsed,) if isinstance(parsed, str) else tuple(parsed)

# Build one independent list per split. The previous `([], ) * 3` unpacking
# bound all three names to the SAME list object, so every split ended up
# holding the train + validation + test tags concatenated.
y_train = [_parse_tags(raw) for raw in df_train['tags']]
y_val = [_parse_tags(raw) for raw in df_val['tags']]
y_test = [_parse_tags(raw) for raw in df_test['tags']]



In [None]:
# Inspect the first 20 parsed tag tuples held in y_test.
y_test[0:20]

[('PERSON', 'ORG', 'NORP'),
 ('ORG',),
 ('GPE', 'ORG'),
 ('PERSON', 'LOC', 'NORP'),
 ('ORG',),
 ('GPE', 'PERSON', 'ORG', 'NORP'),
 ('PERSON', 'ORG', 'NORP'),
 ('GPE', 'PERSON', 'ORG', 'NORP'),
 ('PERSON', 'ORG', 'DATE', 'NORP'),
 ('QUANTITY', 'PERSON', 'NORP', 'MONEY', 'CARDINAL'),
 ('DATE',),
 ('ORG', 'PERSON'),
 ('PERSON', 'ORG', 'LOC', 'NORP'),
 ('GPE', 'ORG', 'PERSON'),
 ('ORG', 'NORP'),
 ('ORG', 'PERSON'),
 ('ORG', 'DATE', 'PERSON'),
 ('ORG',),
 ('GPE', 'ORG', 'DATE', 'PERSON'),
 ('PERSON',)]

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
# Learn the label vocabulary once, from the training split, and reuse that
# fitted binarizer for the other splits. Calling fit_transform on each split
# re-fits the vocabulary every time, so column positions (and even the number
# of columns) could differ between splits, and `cols` would describe only the
# split that happened to be fitted last.
mlb = MultiLabelBinarizer()
mlb_train = mlb.fit_transform(y_train)
mlb_test = mlb.transform(y_test)
mlb_val = mlb.transform(y_val)
# Label names in the column order shared by all three indicator matrices.
cols = mlb.classes_

In [None]:
# Show the first ten rows of the binarized training label matrix.
print(mlb_train[0:10,:])

[[0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 1 0 1 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 1 0 1 0 1 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0]
 [1 0 0 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0]]


In [None]:
# Print the label names taken from mlb.classes_ (one per indicator column).
print(cols)

['CARDINAL' 'DATE' 'EVENT' 'FAC' 'GPE' 'LANGUAGE' 'LAW' 'LOC' 'MONEY'
 'NORP' 'ORDINAL' 'ORG' 'PERCENT' 'PERSON' 'PRODUCT' 'QUANTITY' 'TIME'
 'WORK_OF_ART']


In [None]:
def _attach_label_columns(frame, label_matrix):
    """Return `frame` with one indicator column per label appended.

    The indicator matrix is joined on the row index, and the resulting columns
    are named 'speaker', 'message', 'tags' followed by the label names in
    `cols`. Taking the names from the fitted binarizer, rather than from a
    hand-typed list, keeps them in step with the matrix column order.
    """
    labelled = frame.join(pd.DataFrame(label_matrix))
    labelled.columns = ['speaker', 'message', 'tags'] + [str(label) for label in cols]
    return labelled

df_train = _attach_label_columns(df_train, mlb_train)
df_test = _attach_label_columns(df_test, mlb_test)
df_val = _attach_label_columns(df_val, mlb_val)

In [None]:
# The 18 indicator columns used as targets, listed once and shared by all splits.
label_columns = [
    'CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE',
    'LAW', 'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG',
    'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART',
]

# Features are the cleaned message text; targets are the per-label indicator columns.
X_train, y_train = df_train['message'], df_train[label_columns]
X_test, y_test = df_test['message'], df_test[label_columns]
X_val, y_val = df_val['message'], df_val[label_columns]

In [None]:
# Empty frame that the model cells below fill with one column of per-tag
# validation accuracies each.
accuracy_mat = pd.DataFrame()

In [None]:
# One-vs-rest Multinomial Naive Bayes on TF-IDF features.
# scikit-learn documents `stop_words` as 'english', a list, or None, so the
# NLTK stop-word set is converted to a list (sorted, to keep it deterministic).
temp_val = []
NB_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=sorted(stop_words))),
                ('clf', OneVsRestClassifier(MultinomialNB(
                    fit_prior=True, class_prior=None))),
            ])
for c in cols:
    print('Training for tag {}'.format(c))
    # Refit the whole pipeline on the binary indicator column of this tag.
    NB_pipeline.fit(X_train, y_train[c])
    # Score on the validation split; the test split is not touched here.
    prediction = NB_pipeline.predict(X_val)
    temp_val.append(accuracy_score(y_val[c], prediction))

# One validation accuracy per tag, in the order of `cols`.
accuracy_mat['Naive_Bayes_val'] = temp_val

Training for tag CARDINAL
Training for tag DATE
Training for tag EVENT
Training for tag FAC
Training for tag GPE
Training for tag LANGUAGE
Training for tag LAW
Training for tag LOC
Training for tag MONEY
Training for tag NORP
Training for tag ORDINAL
Training for tag ORG
Training for tag PERCENT
Training for tag PERSON
Training for tag PRODUCT
Training for tag QUANTITY
Training for tag TIME
Training for tag WORK_OF_ART


In [None]:
from sklearn.svm import LinearSVC
# One-vs-rest linear SVM on TF-IDF features.
# scikit-learn documents `stop_words` as 'english', a list, or None, so the
# NLTK stop-word set is converted to a list (sorted, to keep it deterministic).
temp_val = []
SVC_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=sorted(stop_words))),
                # Fixed seed so that repeated runs produce the same model.
                ('clf', OneVsRestClassifier(LinearSVC(random_state=42), n_jobs=1)),
            ])
for c in cols:
    print('Training for tag {}'.format(c))
    # Refit the whole pipeline on the binary indicator column of this tag.
    SVC_pipeline.fit(X_train, y_train[c])
    # Score on the validation split; the test split is not touched here.
    prediction = SVC_pipeline.predict(X_val)
    temp_val.append(accuracy_score(y_val[c], prediction))

# One validation accuracy per tag, in the order of `cols`.
accuracy_mat['LinearSVC val'] = temp_val

Training for tag CARDINAL
Training for tag DATE
Training for tag EVENT
Training for tag FAC
Training for tag GPE
Training for tag LANGUAGE
Training for tag LAW
Training for tag LOC
Training for tag MONEY
Training for tag NORP
Training for tag ORDINAL
Training for tag ORG
Training for tag PERCENT
Training for tag PERSON
Training for tag PRODUCT
Training for tag QUANTITY
Training for tag TIME
Training for tag WORK_OF_ART


In [None]:
from sklearn.linear_model import LogisticRegression
# One-vs-rest logistic regression on TF-IDF features.
# scikit-learn documents `stop_words` as 'english', a list, or None, so the
# NLTK stop-word set is converted to a list (sorted, to keep it deterministic).
temp_val = []
LogReg_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=sorted(stop_words))),
                # 'sag' is a stochastic solver; seed it so runs are repeatable.
                ('clf', OneVsRestClassifier(
                    LogisticRegression(solver='sag', random_state=42), n_jobs=1)),
            ])

for c in cols:
    print('Training for tag {}'.format(c))
    # Refit the whole pipeline on the binary indicator column of this tag.
    LogReg_pipeline.fit(X_train, y_train[c])
    # Score on the validation split; the test split is not touched here.
    prediction = LogReg_pipeline.predict(X_val)
    temp_val.append(accuracy_score(y_val[c], prediction))

# One validation accuracy per tag, in the order of `cols`.
accuracy_mat['LogReg val'] = temp_val

Training for tag CARDINAL
Training for tag DATE
Training for tag EVENT
Training for tag FAC
Training for tag GPE
Training for tag LANGUAGE
Training for tag LAW
Training for tag LOC
Training for tag MONEY
Training for tag NORP
Training for tag ORDINAL
Training for tag ORG
Training for tag PERCENT
Training for tag PERSON
Training for tag PRODUCT
Training for tag QUANTITY
Training for tag TIME
Training for tag WORK_OF_ART


In [None]:
# Label every accuracy row with its tag. The accuracy lists were built by
# iterating over `cols`, which was taken from mlb.classes_, so the order matches.
accuracy_mat['tags'] = mlb.classes_

In [None]:
# Display the per-tag validation accuracies of the three models.
accuracy_mat

Unnamed: 0,Naive_Bayes_val,LinearSVC val,LogReg val,tags
0,0.760808,0.707131,0.713552,CARDINAL
1,0.690951,0.633935,0.635391,DATE
2,0.964815,0.956853,0.960534,EVENT
3,0.962589,0.88126,0.899837,FAC
4,0.504837,0.509888,0.510487,GPE
5,0.992295,0.991525,0.991867,LANGUAGE
6,0.981423,0.960106,0.960962,LAW
7,0.871758,0.747111,0.759695,LOC
8,0.978769,0.971235,0.974146,MONEY
9,0.717233,0.63462,0.644979,NORP


In [None]:
# Reorder the columns so that the tag name comes first, followed by the
# validation accuracy of each model.
report_columns = ['tags', 'Naive_Bayes_val', 'LinearSVC val', 'LogReg val']
acc_mat = accuracy_mat[report_columns]
acc_mat

Unnamed: 0,tags,Naive_Bayes_val,LinearSVC val,LogReg val
0,CARDINAL,0.760808,0.707131,0.713552
1,DATE,0.690951,0.633935,0.635391
2,EVENT,0.964815,0.956853,0.960534
3,FAC,0.962589,0.88126,0.899837
4,GPE,0.504837,0.509888,0.510487
5,LANGUAGE,0.992295,0.991525,0.991867
6,LAW,0.981423,0.960106,0.960962
7,LOC,0.871758,0.747111,0.759695
8,MONEY,0.978769,0.971235,0.974146
9,NORP,0.717233,0.63462,0.644979
