In [1]:
import re
import string

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from tensorflow import keras

In [191]:
from sklearn.metrics import precision_score,recall_score,confusion_matrix

In [2]:
# Load the article dataset from a CSV file given by a relative path.
bbc_data = pd.read_csv("bbc.csv")
# Preview the first rows to check which columns were read.
bbc_data.head()

Unnamed: 0,Article,Class
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [3]:
def clean_str(string):
    """
    Normalise a raw text into lower-case, space-separated word tokens.

    The substitutions run in a fixed order: English contraction suffixes
    ('s, 've, n't, 're, 'd, 'll) are dropped, commas, parentheses, question
    marks and leftover apostrophes are removed, '!' is padded with spaces,
    every other character outside the allowed set becomes a space, digit-led
    tokens are deleted, and runs of whitespace are collapsed.

    Note: the parameter name shadows the imported ``string`` module inside
    this function.

    Returns the cleaned text, stripped of surrounding whitespace and lower-cased.
    """
    # (pattern, replacement) pairs, applied strictly in this order.
    substitutions = (
        (r"\'s", ""),
        (r"\'ve", ""),
        (r"n\'t", ""),
        (r"\'re", ""),
        (r"\'d", ""),
        (r"\'ll", ""),
        (r",", ""),
        (r"!", " ! "),
        (r"\(", ""),
        (r"\)", ""),
        (r"\?", ""),
        (r"'", ""),
        (r"[^A-Za-z0-9(),!?\'\`]", " "),
        (r"[0-9]\w+|[0-9]", ""),
        (r"\s{2,}", " "),
    )
    text = string
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip().lower()

In [4]:
# Replace every article with its cleaned form (see clean_str).
# The column is assigned in one step rather than row by row with
# `bbc_data["Article"][i] = ...`: that chained indexing raises
# SettingWithCopyWarning, is not guaranteed to write back to the frame
# (under pandas Copy-on-Write it never does), and assumes the index is
# exactly 0..n-1.
bbc_data["Article"] = bbc_data["Article"].apply(clean_str)

In [5]:
# Count how many articles belong to each class.
bbc_data["Class"].value_counts()

business         510
politics         417
tech             401
entertainment    386
sport            198
Name: Class, dtype: int64

In [168]:
# Map each article class name to an integer id (0-4).
encode = dict(zip(
    ["business", "politics", "tech", "entertainment", "sport"],
    range(5),
))

# Look up the id of every row's class; a class missing from `encode`
# raises KeyError.
bbc_data["Class_id"] = bbc_data["Class"].apply(lambda label: encode[label])

# Plain NumPy array of the ids, used as the target vector further down.
article_labels = bbc_data["Class_id"].values
bbc_data.head()

Unnamed: 0,Article,Class,Class_id
0,ad sales boost time warner profit quarterly pr...,business,0
1,dollar gains on greenspan speech the dollar ha...,business,0
2,yukos unit buyer faces loan claim the owners o...,business,0
3,high fuel prices hit ba profits british airway...,business,0
4,pernod takeover talk lifts domecq shares in uk...,business,0


In [166]:
# Turn the Article column into TF-IDF features.
# min_df=10 ignores terms that occur in fewer than 10 articles;
# sublinear_tf=True scales term frequency as 1 + log(tf).
vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=10)
# fit_transform returns a sparse matrix; .toarray() makes it a dense array.
article_vec = vectorizer.fit_transform(bbc_data['Article']).toarray()
# Rows are articles, columns are vocabulary terms.
print(article_vec.shape)

(1912, 5130)


In [186]:
# Shuffle the articles together with their class labels, then cut a
# 70% / 10% / remainder (~20%) train / validation / test split.

# Fixed seed so that Restart & Run All reproduces the same split; the
# original unseeded shuffle produced different splits on every run.
SPLIT_SEED = 42

rows = article_vec.shape[0]

# A single permutation indexes both arrays, so each feature row stays
# paired with its label.
indices = np.random.default_rng(SPLIT_SEED).permutation(rows)
articles = article_vec[indices]
class_articles = article_labels[indices]

num_training_samples = int(0.7 * rows)
num_validation_samples = int(0.1 * rows)

# First row of the test partition.
mark = (num_training_samples+num_validation_samples)

X_train = articles[:num_training_samples]
y_train = class_articles[:num_training_samples]

X_val = articles[num_training_samples:mark]
y_val = class_articles[num_training_samples:mark]

X_test = articles[mark:]
y_test = class_articles[mark:]

# The three partitions must cover every article exactly once.
assert len(X_train) + len(X_val) + len(X_test) == rows
assert len(y_train) + len(y_val) + len(y_test) == rows

In [187]:
# Report the feature and label dimensions of each partition.
for split_name, features, labels in (
    ("train", X_train, y_train),
    ("val", X_val, y_val),
    ("test", X_test, y_test),
):
    print('Shape of X_' + split_name + ':', features.shape)
    print('Shape of y_' + split_name + ':', labels.shape)

Shape of X_train: (1338, 5130)
Shape of y_train: (1338,)
Shape of X_val: (191, 5130)
Shape of y_val: (191,)
Shape of X_test: (383, 5130)
Shape of y_test: (383,)


In [159]:
# Hyper-parameter grid explored below: every optimizer is combined with every
# hidden-layer activation and every output-layer activation (3 x 2 x 2 models).
optimizers = ['SGD','Adam','Adamax']
# Activation used by the hidden layers.
activation_func = ['tanh','relu']
# Activation used by the output layer.
op_actv_func = ['sigmoid','softmax']

In [188]:
def _build_classifier(optimizer, hidden_activation, output_activation):
    """Return a compiled network: two 20-unit hidden layers and a 5-unit output layer.

    The input width is taken from the number of columns of ``X_train``.
    """
    layers = [
        keras.layers.Dense(20, activation=hidden_activation, input_shape=(X_train.shape[1],)),
        keras.layers.Dense(20, activation=hidden_activation),
        keras.layers.Dense(5, activation=output_activation),
    ]
    network = keras.models.Sequential(layers)
    network.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return network


# One compiled model per (optimizer, hidden activation, output activation)
# combination, appended in nested-loop order.
models = []
for opt in optimizers:
    for a_f in activation_func:
        for o_a_f in op_actv_func:
            model = _build_classifier(opt, a_f, o_a_f)
            models.append(model)

In [189]:
# Train every model of the grid on the training split and report accuracy,
# macro precision and macro recall on the test split.
#
# Models are paired with their configuration by a running index, which only
# works if `models` was built by the same nested loops; fail early if the
# list does not match the grid (e.g. stale state from an earlier run).
assert len(models) == len(optimizers) * len(activation_func) * len(op_actv_func), \
    "`models` does not match the hyper-parameter grid; re-run the cell that builds it"

i = 0
for opt in optimizers:
    print("For Optimization Function:",opt)
    for a_f in activation_func:
        print("\n\tActivation Function at hidden layers:",a_f)
        for o_a_f in op_actv_func:
            print("\n\t\tActivation Function at output layer:",o_a_f)
            model = models[i]
            model.fit(X_train, y_train, epochs=50, validation_data=(X_val, y_val), verbose = 0)
            y_pred = model.predict(X_test)
            # Predicted class = column with the highest score in each row.
            # np.argmax picks the first maximum, like the per-row
            # np.where(...)[0][0] lookup it replaces, but is vectorised and
            # does not raise IndexError when a row contains NaN.
            Y_pred = np.argmax(y_pred, axis=1)
            print("\t\tTest Accuracy",100*accuracy_score(y_test, Y_pred))
            # NOTE(review): the warnings in the recorded output presumably
            # come from classes that a model never predicts, which leaves
            # their precision undefined -- confirm.
            print("\t\tTest precision",100*precision_score(y_test, Y_pred,average='macro'))
            print("\t\tTest recall",100*recall_score(y_test, Y_pred,average='macro'))
            i+=1

For Optimization Function: SGD

	Activation Function at hidden layers: tanh

		Activation Function at output layer: sigmoid
		Test Accuracy 84.59530026109661
		Test precision 68.54377950655939
		Test recall 74.7400725025257

		Activation Function at output layer: softmax


  _warn_prf(average, modifier, msg_start, len(result))


		Test Accuracy 86.16187989556136
		Test precision 68.94607768623516
		Test recall 76.30260890235931

	Activation Function at hidden layers: relu

		Activation Function at output layer: sigmoid


  _warn_prf(average, modifier, msg_start, len(result))


		Test Accuracy 81.98433420365535
		Test precision 65.22271538610254
		Test recall 71.96034544322275

		Activation Function at output layer: softmax


  _warn_prf(average, modifier, msg_start, len(result))


		Test Accuracy 79.89556135770235
		Test precision 65.58476150941902
		Test recall 71.20684174543413
For Optimization Function: Adam

	Activation Function at hidden layers: tanh

		Activation Function at output layer: sigmoid


  _warn_prf(average, modifier, msg_start, len(result))


		Test Accuracy 96.60574412532638
		Test precision 96.8944959452206
		Test recall 96.72596811946386

		Activation Function at output layer: softmax
		Test Accuracy 97.12793733681463
		Test precision 97.34497965750549
		Test recall 97.35942225704687

	Activation Function at hidden layers: relu

		Activation Function at output layer: sigmoid
		Test Accuracy 96.8668407310705
		Test precision 97.31382113821138
		Test recall 97.0627086420234

		Activation Function at output layer: softmax
		Test Accuracy 96.8668407310705
		Test precision 97.22515371721417
		Test recall 97.16282944627308
For Optimization Function: Adamax

	Activation Function at hidden layers: tanh

		Activation Function at output layer: sigmoid
		Test Accuracy 96.8668407310705
		Test precision 97.30301032740056
		Test recall 97.07773211620182

		Activation Function at output layer: softmax
		Test Accuracy 97.12793733681463
		Test precision 97.42455266707071
		Test recall 97.32937530869008

	Activation Function at hidden lay

In [208]:
# K-fold cross-validation of the relu / softmax / adam network on the full
# TF-IDF matrix. For every fold a fresh model is trained for 5 epochs, then
# the per-class hit rate (diagonal of the confusion matrix divided by the
# row total) is printed; the mean fold accuracy is printed at the end.

# Number of folds; also used for the final label so they cannot drift apart.
N_SPLITS = 3

kfold = KFold(n_splits=N_SPLITS,shuffle=True,random_state=1)
Accuracy = []

# NOTE: this loop rebinds X_train / X_test, which earlier cells use for the
# hold-out split; re-run the split cell before re-running those cells.
for train,test in kfold.split(article_vec):
    X_train,X_test,Y_train,Y_test = article_vec[train],article_vec[test],article_labels[train],article_labels[test]
    model = keras.models.Sequential([
              keras.layers.Dense(20, activation='relu', input_shape=(X_train.shape[1],)),
              keras.layers.Dense(20, activation='relu'),
              keras.layers.Dense(5, activation='softmax'),
            ])
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X_train, Y_train, epochs=5, verbose = 0)
    y_pred = model.predict(X_test)
    # Predicted class = column with the highest score in each row
    # (vectorised; replaces the per-row np.where lookup, which raised
    # IndexError on rows containing NaN).
    Y_pred = np.argmax(y_pred, axis=1)
    Accuracy.append(accuracy_score(Y_test, Y_pred))
    # Rows/columns follow the ids defined in `encode`, so cm[k, k] counts
    # the correctly classified articles of class k.
    cm = confusion_matrix(Y_test, Y_pred, labels = list(encode.values()))
    print("\nClass-wise accuracy:\n")
    for cls in encode:
        key = encode[cls]
        print("\t",cls)
        print("\t\t",100*cm[key,key]/sum(cm[key]),"\n")

# Mean over however many folds were run, not a hard-coded 3.
print("Overall {}-Fold Accuracy:".format(N_SPLITS),100*np.mean(Accuracy))


Class-wise accuracy:

	 business
		 97.53086419753086 

	 politics
		 98.62068965517241 

	 tech
		 98.4375 

	 entertainment
		 96.26865671641791 

	 sport
		 98.55072463768116 


Class-wise accuracy:

	 business
		 95.65217391304348 

	 politics
		 97.12230215827338 

	 tech
		 97.72727272727273 

	 entertainment
		 95.04132231404958 

	 sport
		 100.0 


Class-wise accuracy:

	 business
		 95.73170731707317 

	 politics
		 96.2406015037594 

	 tech
		 95.74468085106383 

	 entertainment
		 98.47328244274809 

	 sport
		 100.0 

Overall 3-Fold Accuracy: 97.12307397036462
