This is a supervised multi-class text classification problem, in order to train the model to predict the correct text for each category they belong to, I used an SVM classifier as my model, and word frequency as the basic feature selection method accompanied by chi-square statistic and mutual information.

## PREPARATION

---

import all the libraries that we are going to use, including as usual numpy (vector manipulation), nltk (text preprocessing), scikit-learn (machine learning), os (pathname manipulations), random(shuffle dataset), operator (used for sort features).

In [2]:
import os
import nltk
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
import random
from nltk.corpus import stopwords
import operator
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_classif

## A)OBTAIN DATA

---

Dataset file (BBC) contains 6 files included README.TXT, the other 5 each for one catagory, for the first step I take all the text with their catagory index (from 0 to 4) orderly.

In [3]:
path = os.getcwd() + "\\bbc"
list_catagory_files = os.listdir(path) # get bbc file
print(list_catagory_files)
num_catagory = len(list_catagory_files)

def text_label_list(cata_path):
    label = 0
    all_text_list = []
    all_label_list = []
    catagory_file_num = []
    
    for catagory in list_catagory_files:
        catagory_file = path + "\\" + catagory
        print(catagory_file)
        if catagory_file.endswith('.TXT'):
            continue
        else:
        
            file_num = len(os.listdir(catagory_file))
            catagory_file_num.append(file_num)
            print(file_num)
            print(catagory)
            for i in range(1,file_num + 1):
                all_label_list.append(label)
                j = "%03d" % i
                with open("\\".join([catagory_file, str(j) + ".txt"])) as fp:
                    lines = fp.read()
                    article = []
                    text_ = ""
                    text_ += lines
                    article.append(text_)    
                        
                        
                    all_text_list.append(article)
        label += 1
    return all_text_list, all_label_list, catagory_file_num

all_text_list_, all_label_list_, catagory_file_num = text_label_list(list_catagory_files)
print(all_text_list_[1313]) # the first element is the exact 001.txt

['business', 'entertainment', 'politics', 'README.TXT', 'sport', 'tech']
C:\Users\c2008016\Desktop\Programs\coursework\bbc\business
510
business
C:\Users\c2008016\Desktop\Programs\coursework\bbc\entertainment
386
entertainment
C:\Users\c2008016\Desktop\Programs\coursework\bbc\politics
417
politics
C:\Users\c2008016\Desktop\Programs\coursework\bbc\README.TXT
C:\Users\c2008016\Desktop\Programs\coursework\bbc\sport
511
sport
C:\Users\c2008016\Desktop\Programs\coursework\bbc\tech
401
tech
['Claxton hunting first major medal\n\nBritish hurdler Sarah Claxton is confident she can win her first major medal at next month\'s European Indoor Championships in Madrid.\n\nThe 25-year-old has already smashed the British record over 60m hurdles twice this season, setting a new mark of 7.96 seconds to win the AAAs title. "I am quite confident," said Claxton. "But I take each race as it comes. "As long as I keep up my training but not do too much I think there is a chance of a medal." Claxton has won th

## B)TEXT PREPROCESSING

---

Split each word from each text and regulate them to lemma form.

In [4]:
lemmatizer = nltk.stem.WordNetLemmatizer()
# take all the lemmatized words from each text as a new list
def get_list_tokens(text_list):
    paragraph = ",".join(text_list)  # turn list into string
    sentence_split=nltk.tokenize.sent_tokenize(paragraph) # get each sentence
    
    list_tokens=[]
#     get each word from each sentence
    for sentence in sentence_split:  
        list_tokens_sentence=nltk.tokenize.word_tokenize(sentence)
        for token in list_tokens_sentence:
            list_tokens.append(lemmatizer.lemmatize(token).lower())
    return list_tokens

token = get_list_tokens(all_text_list_[0])
print(token)

['ad', 'sale', 'boost', 'time', 'warner', 'profit', 'quarterly', 'profit', 'at', 'us', 'medium', 'giant', 'timewarner', 'jumped', '76', '%', 'to', '$', '1.13bn', '(', 'â£600m', ')', 'for', 'the', 'three', 'month', 'to', 'december', ',', 'from', '$', '639m', 'year-earlier', '.', 'the', 'firm', ',', 'which', 'is', 'now', 'one', 'of', 'the', 'biggest', 'investor', 'in', 'google', ',', 'benefited', 'from', 'sale', 'of', 'high-speed', 'internet', 'connection', 'and', 'higher', 'advert', 'sale', '.', 'timewarner', 'said', 'fourth', 'quarter', 'sale', 'rose', '2', '%', 'to', '$', '11.1bn', 'from', '$', '10.9bn', '.', 'its', 'profit', 'were', 'buoyed', 'by', 'one-off', 'gain', 'which', 'offset', 'a', 'profit', 'dip', 'at', 'warner', 'bros', ',', 'and', 'le', 'user', 'for', 'aol', '.', 'time', 'warner', 'said', 'on', 'friday', 'that', 'it', 'now', 'owns', '8', '%', 'of', 'search-engine', 'google', '.', 'but', 'it', 'own', 'internet', 'business', ',', 'aol', ',', 'had', 'ha', 'mixed', 'fortune',

## C) TRAIN, DEVELOPMENT AND TEST SPLITS

---

Split all text into three different set: training set 80%, development set 10% and test set 10%.

In [6]:
# get full dataset in order to split our dataset into training and test
# intergrate text and lable in a (text,lable) form
dataset_full=[] 
for text, label in zip(all_text_list_, all_label_list_):
    dataset_full.append((text,label))
        

def get_train_test_split(dataset_full,ratio):
    pre_train_set=[]
    pre_test_set=[]
    
    size_dataset_full = len(dataset_full)
    test_size = int(round(size_dataset_full * ratio))
#     retrieve proportion of random datasets
    list_test_indices=random.sample(range(size_dataset_full), test_size)
    
    for i,text in enumerate(dataset_full):
        if i in list_test_indices:
            pre_test_set.append(text)
        else:pre_train_set.append(text)
        
    return pre_train_set,pre_test_set
# get training set, test set
train_set, test_set = get_train_test_split(dataset_full,0.2)
# print ("Size training set: "+str(len(train_set)))
# print ("Size test set: "+str(len(test_set)))

# get test set, development set
new_test_set, new_dev_set = get_train_test_split(test_set,0.5)

new_train_set=train_set
# shuffle each set to random sets
random.shuffle(new_train_set)
random.shuffle(new_dev_set)
random.shuffle(new_test_set)
print ("Size training set: "+str(len(new_train_set)))
print ("Size test set: "+str(len(new_test_set)))
print ("Size development set: "+str(len(new_dev_set)))


Size training set: 1780
Size test set: 223
Size development set: 222


## D)FEATURE ENGINEERING

---

Select feature by three different ways: word frequency, Hash trick and Bigram

### D.1)WORD FREQUENCY

---

Gater all the usefull words from all the text and make a vocabulary frequency dictionary, arrange the most used word to the least used word. So that every text will get a feature vector based on that vocabulary frequency dictionary.

In [7]:
# get the stopwords list from nltk and add some more
stopwords = nltk.corpus.stopwords.words('english')
new_stopwords = ['/','(',')','{','}','@','|',';','\n','#','+','_','.',',','``',"''",':','-']
stopwords.extend(new_stopwords)



def get_vocabulary(training_set, num_features): 
    dict_word_frequency={}
    for instance in training_set:
        sentence_tokens=get_list_tokens(instance[0])
        for word in sentence_tokens:
            if word in stopwords: continue
            if word not in dict_word_frequency: dict_word_frequency[word]=1
            else: dict_word_frequency[word]+=1
    sorted_list = sorted(dict_word_frequency.items(), key=operator.itemgetter(1), reverse=True)[:num_features]
    vocabulary=[]
    for word,frequency in sorted_list:
        vocabulary.append(word)
    return vocabulary   #word frequency dictionary

dic = get_vocabulary(dataset_full,1000)
# show the top 15 
print(dic[:20])

["'s", 'said', 'wa', 'ha', 'mr', 'year', 'would', 'also', 'people', '%', 'new', 'one', 'us', 'could', 'game', 'last', 'time', 'first', 'say', "n't"]


based on that vocabulary frequency dictionary, each text would able to get a feature vector

In [11]:
def get_vector_text(list_vocab,text_list):
#     based on the vocabulary frequency dictionary we get the vector size
    vector_text=np.zeros(len(list_vocab))
#     turn text(list) into string
    paragraph = ",".join(text_list)
    list_tokens_string=get_list_tokens(paragraph)
    for i, word in enumerate(list_vocab):
        if word in list_tokens_string:
            vector_text[i]=list_tokens_string.count(word)
    return vector_text


### D.2)HASH TRICK

---

Finally, my last method used for feature extraction is the Hash trick, It is an outstanding method, because of its very low scalable to large datasets as there is no need to store a vocabulary dictionary in memory, by using HashingVectorizer from sklearn.feature_extraction.text , I changed the default feature number up to 500 to match the data size, generate then transform then we have the feature vector.

In [9]:

def get_hash_vec(training_set):
    
    hashing_vectorizer = HashingVectorizer(n_features = 500)
    text_all_vec = hashing_vectorizer.transform(training_set).toarray()
    
    return text_all_vec

### D.3)BIGRAM

---

The principle is to generate a dictionary with one and only sequence of two adjacent elements and words as the key, and their index as value. So that the feature vector would be very long. 

In [None]:

def get_bigram_vocab(bigram_form_list):
# get the corpus form text and generate a bigram dictionary
    
    bigram_vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 2))
    text_all_vec = bigram_vectorizer.fit_transform(bigram_form_list)
    
    return bigram_vectorizer

def get_bigram_vec(bigram_form_list, vectorizer):
#     get the dictionary and the corpus form text to generate the whole vector
        vec = vectorizer.transform(bigram_form_list).toarray()
        vec = np.asarray(vec)
        return vec
    
    

def get_bigram_form(training_set):
#     get the training set tuple and make it compatible for CountVectorizer
    one_list = []
    label = []
    for instance in training_set:
        article = ''.join(instance[0])
        one_list.append(article)
        label.append(instance[1])
    
    return one_list, label

### D.4)CHI-SQUARE STATISTIC

---

Used for feature selection method to decrease the dimension.The principle is to test whether the observed frequency is significantly different from the expected frequency, If the chi-square value is larger, the degree of deviation between the two is larger; otherwise, the deviation between the two is smaller; if the two values are completely equal, the chi-square value is 0, indicating that the theoretical value is completely consistent.

In [10]:
# get training eigenvectors and return a smaller and better eigenvectors after chi-square process
def chi_square_select(X_train,Y_train,num_features):
    fs_sentanalysis = SelectKBest(chi2, k=num_features).fit(X_train, Y_train)
    X_train_new = fs_sentanalysis.transform(X_train)
    return fs_sentanalysis, X_train_new

## E)SVM TRAINING MODEL

---

Do the dimensionality reduction process before training the SVM classifier.

In [12]:
# combine three types of the dimensionality reduction method, feature number needed
def train_svm_classifier(training_set, vocabulary, chi_num): # Function for training our svm classifier
    X_train=[]
    Y_train=[]
    for instance in training_set:
        vector_instance=get_vector_text(vocabulary,instance[0])
        X_train.append(vector_instance)
        Y_train.append(instance[1])
        
    fs_sentanalysis, X_train_chi_vector = chi_square_select(X_train,Y_train,chi_num)
    
    
#   Finally, train the SVM classifier 
    svm_clf=sklearn.svm.SVC(kernel="linear",gamma='auto')
#     train the last processed eigenvectors 
    svm_clf.fit(X_train_chi_vector,Y_train)
    return svm_clf, fs_sentanalysis


vocabulary = get_vocabulary(dataset_full, 10000)
svm_clf, fs_sentanalysis = train_svm_classifier(new_train_set, vocabulary,5000)

test our model in the test set.

In [14]:
X_test=[]
Y_test=[]
for instance in new_test_set:   
    vector_instance=get_vector_text(vocabulary,instance[0])
    X_test.append(vector_instance)
    Y_test.append(instance[1])
X_test=np.asarray(X_test)
Y_test_gold=np.asarray(Y_test)


from sklearn.metrics import classification_report

Y_text_predictions = svm_clf.predict(fs_sentanalysis.transform(X_test))
print(Y_text_predictions)
print(classification_report(Y_test_gold, Y_text_predictions))


[4 0 0 0 4 1 2 4 3 2 0 0 1 3 1 1 0 1 0 2 3 3 2 0 2 3 4 3 0 0 3 3 1 3 3 4 1
 3 0 3 0 3 3 0 0 1 4 0 1 4 0 1 0 3 3 0 0 1 1 3 3 4 0 2 2 0 4 3 2 4 1 4 0 4
 0 0 1 4 3 4 0 3 2 4 2 3 2 2 3 0 3 4 0 1 3 1 0 4 0 2 3 2 3 0 4 0 2 2 2 1 3
 0 0 1 0 1 3 0 2 2 3 1 1 0 3 0 4 2 0 0 1 2 1 2 1 2 0 4 1 1 0 4 1 3 0 2 4 2
 3 4 3 0 2 3 1 0 1 3 0 3 4 1 4 4 2 0 0 4 0 0 3 2 4 3 1 1 0 0 1 4 4 3 3 4 3
 3 2 0 1 1 4 3 2 1 1 0 0 3 2 2 1 3 0 4 3 3 3 3 2 1 3 0 2 0 4 0 3 2 2 0 0 1
 2]
              precision    recall  f1-score   support

           0       0.73      0.81      0.77        54
           1       0.57      0.61      0.59        38
           2       0.81      0.73      0.77        41
           3       0.77      0.78      0.78        51
           4       0.76      0.67      0.71        39

    accuracy                           0.73       223
   macro avg       0.73      0.72      0.72       223
weighted avg       0.73      0.73      0.73       223



In [15]:
from sklearn.metrics import precision_score,recall_score,f1_score,accuracy_score

precision=precision_score(Y_test_gold, Y_text_predictions, average='macro')
recall=recall_score(Y_test_gold, Y_text_predictions, average='macro')
f1=f1_score(Y_test_gold, Y_text_predictions, average='macro')
accuracy=accuracy_score(Y_test_gold, Y_text_predictions)

print ("Precision: "+str(round(precision,3)))
print ("Recall: "+str(round(recall,3)))
print ("F1-Score: "+str(round(f1,3)))
print ("Accuracy: "+str(round(accuracy,3)))

Precision: 0.731
Recall: 0.721
F1-Score: 0.724
Accuracy: 0.731


Use development set to tune our model.

In [None]:
Y_dev=[]
for instance in new_dev_set:
    Y_dev.append(instance[1])

Y_dev_gold=np.asarray(Y_dev)

# train our three models with the different number of features, and test each of them in the dev set

list_num_features=[10000,12500,15000,17500]
best_accuracy_dev=0.0
for num_features in list_num_features:
  # First, we get the vocabulary from the training set and train our svm classifier
    vocabulary = get_vocabulary(dataset_full, num_features)
    svm_clf, fs_sentanalysis, mfs_sentanalysis = train_svm_classifier(
        new_train_set, vocabulary, round(num_features/3))


  # Then, we transform our dev set into vectors and make the prediction on this set
    X_dev=[]
    for instance in new_dev_set:
        vector_instance=get_vector_text(vocabulary,instance[0])
        X_dev.append(vector_instance)
    X_dev=np.asarray(X_dev)
    
    Y_dev_predictions = svm_clf.predict(mfs_sentanalysis.transform(fs_sentanalysis.transform(X_dev)))
  # Finally, we get the accuracy results of the classifier
    accuracy_dev=accuracy_score(Y_dev_gold, Y_dev_predictions)
    print ("Accuracy with "+str(num_features)+": "+str(round(accuracy_dev,3)))
    if accuracy_dev>=best_accuracy_dev:
        best_accuracy_dev=accuracy_dev
        best_num_features=num_features
        best_vocabulary=vocabulary
        best_svm_clf=svm_clf
print ("\n Best accuracy overall in the dev set is "+str(round(best_accuracy_dev,3))+" with "+str(best_num_features)+" features.")

## E)SVM TRAINING MODEL

---

Do the dimensionality reduction process before training the SVM classifier and combine the feature extraction.

In [None]:
def train_svm_classifier_combine(training_set, vocabulary, chi_num): # Function for training our svm classifier
    X_train=[]
    Y_train=[]
    for instance in training_set:
#         bigram function unfortunately broke down, still can't fix it
        vector_fre = get_vector_text(vocabulary,instance[0])# get frequency vector
        vector_hash = get_hash_vec(instance[0])# get hash vector
        vec_mul = append_list(vector_hash, vector_fre)# join vectors together
        vec_mul = minmax_scale(vec_mul)# minmax scale the vector
        
        X_train.append(vec_mul)
        
        Y_train.append(instance[1])
    
    X_train = np.asarray(X_train)
    Y_train = np.asarray(Y_train)
    # turn 3Dim into 2Dim matrix
    nsamples, nx, ny = X_train.shape
    d2_X_train = X_train.reshape((nsamples,nx*ny))
    
    fs_sentanalysis, X_train_chi_vector = chi_square_select(d2_X_train,Y_train,chi_num)# chi-square 
    
    
  # Finally, we train the SVM classifier 
    svm_clf=sklearn.svm.SVC(kernel="linear",gamma='auto')
    svm_clf.fit(X_train_chi_vector,Y_train)
    return svm_clf, fs_sentanalysis




vocabulary = get_vocabulary(new_train_set, 1000)
svm_clf, fs_sentanalysis = train_svm_classifier_combine(new_train_set, vocabulary,500)