<a href="https://colab.research.google.com/github/SrLacia/CMT316-Coursework-1/blob/main/CMT316_Coursework_1_Part_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [101]:
# Imports
import numpy as np
import nltk
import sklearn
import operator
import requests
import os
import random
import re

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

The first step is to read the data

In [102]:
# Read Data
from google.colab import drive
drive.mount('/content/drive/')

# Tech Articles
path = '/content/drive/My Drive/ML_Projects/datasets_coursework1/bbc/tech'
files = os.listdir(path)
techArticles = []
for file in files:
  techArticles.append(open(path + '/' + file).read())

# Sport Articles
path = '/content/drive/My Drive/ML_Projects/datasets_coursework1/bbc/sport'
files = os.listdir(path)
sportArticles = []
for file in files:
  sportArticles.append(open(path + '/' + file).read())

# Politics Articles
path = '/content/drive/My Drive/ML_Projects/datasets_coursework1/bbc/politics'
files = os.listdir(path)
politicsArticles = []
for file in files:
  politicsArticles.append(open(path + '/' + file).read())

# Entertainment Articles
path = '/content/drive/My Drive/ML_Projects/datasets_coursework1/bbc/entertainment'
files = os.listdir(path)
entertainmentArticles = []
for file in files:
  entertainmentArticles.append(open(path + '/' + file).read())

# Business Articles
path = '/content/drive/My Drive/ML_Projects/datasets_coursework1/bbc/business'
files = os.listdir(path)
businessArticles = []
for file in files:
  businessArticles.append(open(path + '/' + file).read())


Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


Now the data needs to go through pre-processing

In [103]:
lemmatizer = nltk.stem.WordNetLemmatizer()

# First, we get the stopwords list from nltk
stopwords=nltk.corpus.stopwords.words('english')
# We can add more words to the stopword list, like punctuation marks
extraStopwords = [".",",","''","-","'","'s","n't","``","%","(",")","said","also","get","says","many","mr","mrs","miss",":",";","'m"]
stopwords.extend(extraStopwords)

def getPrelimTokens(articles):
  tempTokens = []
  for article in articles:
    tempTokens.append(nltk.tokenize.word_tokenize(article))

  for article in tempTokens:
    lemma = []
    for token in article:
      lemma.append(lemmatizer.lemmatize(token))
    article = lemma
  tempTokens = [[token.lower() for token in article] for article in tempTokens]

  tempFreq={}
  for article in tempTokens:
    for token in article:
      if token in stopwords:
        continue
      elif token in tempFreq:
        tempFreq[token] += 1
      else:
        tempFreq[token] = 1

  return sorted(tempFreq.items(), key=operator.itemgetter(1), reverse=True)[:1000]

# Tech
sortedTechFreq = getPrelimTokens(techArticles)

# Sport Articles
sortedSportFreq = getPrelimTokens(sportArticles)

# Politics Articles
sortedPoliticsFreq = getPrelimTokens(politicsArticles)

# Entertainment Articles
sortedEntertainmentFreq = getPrelimTokens(entertainmentArticles)

# Business Articles
sortedBusinessFreq = getPrelimTokens(businessArticles)

# Overall
overallFrequencies = {}
tempList = sortedTechFreq + sortedBusinessFreq + sortedSportFreq + sortedEntertainmentFreq + sortedPoliticsFreq

for word,frequency in tempList:
  if word in overallFrequencies:
    overallFrequencies[word] += int(frequency)
  else:
    overallFrequencies[word] = int(frequency)

sortedOverallFreq = sorted(overallFrequencies.items(), key=operator.itemgetter(1), reverse=True)

# Assign vocabulary using the overal frequency list
vocab=[]
for word,frequency in sortedOverallFreq:
  vocab.append(word)

print(len(vocab))

2674


Using the vocabulary that has been formed by combining the 1000 most common tokens from each of the five catagories we can now conver the articles to vectors

We are using three different features:


1. Absolute Frequency of the words present in the defined vocabulary
2. Average Sentence Length
3. Proportion of words inside quotes



In [104]:
# Function adapted from the exercise '1_Introduction_DataPreprocessing_NLTK_Numpy'
def get_list_tokens(string):
  tokens = nltk.tokenize.word_tokenize(string)
  for token in tokens:
    token = lemmatizer.lemmatize(token)
  tokens = [token.lower() for token in tokens]
  return tokens

# Function adapted from the exercise '2_FeatureEngineeringSelection_Sklearn'
def get_vector_text(list_vocab,string):
  # Absolute frequency
  vector_text=np.zeros(len(list_vocab)+2)
  list_tokens_string=get_list_tokens(string)
  for i, word in enumerate(list_vocab):
    if word in list_tokens_string:
      vector_text[i]=list_tokens_string.count(word)

  # Average Sentence Length
  avgSenLen = len(list_tokens_string) / len(nltk.tokenize.sent_tokenize(string))
  vector_text[-2] = avgSenLen

  # Relative Number of Words inside Quotes
  quotes = re.findall("\"(.*?)\"|\'(.*?)\'", string)
  wordsInQuotes = 0
  if not(quotes is None):
    for quote in quotes:
      wordsInQuotes += len(nltk.tokenize.word_tokenize(quote[0]))
  vector_text[-1] = wordsInQuotes / len(list_tokens_string)

  return vector_text

After converting the articles to vectors they are going to be kept seperate so that test data can be extracted evenly across the five classifications
In addition files will be labeled as follows:


1. Tech
2. Sport
3. Politics
4. Entertainment
5. Business



In [105]:
xTrain = []
xTest = []
xDev= []
yTrain = []
yTest = []
yDev = []

def processData(articles, label):
  xTemp = []
  for article in articles:
    vector = get_vector_text(vocab,article)
    xTemp.append(vector)

  # Shuffle and remove 10% for the test data, and 10% for the development data
  random.shuffle(xTemp)
  xTest.extend(xTemp[:len(xTemp)//10])
  yTest.extend([label] * len(xTemp[:len(xTemp)//10]))
  xDev.extend(xTemp[len(xTemp)//10:len(xTemp)//5])
  yDev.extend([label] * len(xTemp[len(xTemp)//10:len(xTemp)//5]))
  xTrain.extend(xTemp[len(xTemp)//5:])
  yTrain.extend([label] * len(xTemp[len(xTemp)//5:]))

# Tech
processData(techArticles, 1)

# Sport
processData(sportArticles, 2)

# Politics
processData(politicsArticles, 3)

# Entertainment
processData(entertainmentArticles, 4)

# Business
processData(businessArticles, 5)

print(len(xTest))
print(len(xDev))
print(len(xTrain))

221
223
1780


Feature Selection and Evaluation of Number of Features and Regularization parameter




In [106]:
xTrainArray=np.asarray(xTrain)
yTrainArray=np.asarray(yTrain)

# Feature Selection
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest

from sklearn.metrics import accuracy_score

# Evaluation of Number of Features and Regularization parameter
bestAcc = 0.0
bestParam = 0
bestNumFeatures = 0
for num in range (1,5):
  featureSelection=SelectKBest(chi2, k=int(num*250)).fit(xTrainArray, yTrainArray)
  xTrainNew = featureSelection.transform(xTrain)
  print("Number of Features: " + str(int(num*250)))
  for param in range(1,21):
    svmClass=sklearn.svm.SVC(kernel="rbf",gamma='scale',C=(param/10)) # Initialize the SVM model
    svmClass.fit(xTrainNew,yTrainArray) # Train the SVM model
    tempPredictions = svmClass.predict(featureSelection.transform(xDev))
    print(str(param/10) + "\t:\t" + str(accuracy_score(yDev, tempPredictions)))
    if accuracy_score(yDev, tempPredictions) > bestAcc:
      bestNumFeatures = int(num*250)
      bestAcc = accuracy_score(yDev, tempPredictions)
      bestParam = param/10

print("Best number of features: " + str(bestNumFeatures))
print("Best Value for C Regularization Parameter: " + str(bestParam))
print("Best Accuracy Score: " + str(bestAcc))

Number of Features: 250
0.1	:	0.8475336322869955
0.2	:	0.9013452914798207
0.3	:	0.9192825112107623
0.4	:	0.9282511210762332
0.5	:	0.9327354260089686
0.6	:	0.9417040358744395
0.7	:	0.9506726457399103
0.8	:	0.9506726457399103
0.9	:	0.9506726457399103
1.0	:	0.9506726457399103
1.1	:	0.9506726457399103
1.2	:	0.9506726457399103
1.3	:	0.9551569506726457
1.4	:	0.9551569506726457
1.5	:	0.9551569506726457
1.6	:	0.9551569506726457
1.7	:	0.9551569506726457
1.8	:	0.9551569506726457
1.9	:	0.9596412556053812
2.0	:	0.9596412556053812
Number of Features: 500
0.1	:	0.8565022421524664
0.2	:	0.9192825112107623
0.3	:	0.9327354260089686
0.4	:	0.9551569506726457
0.5	:	0.9551569506726457
0.6	:	0.9506726457399103
0.7	:	0.9506726457399103
0.8	:	0.9551569506726457
0.9	:	0.9551569506726457
1.0	:	0.9551569506726457
1.1	:	0.9551569506726457
1.2	:	0.9596412556053812
1.3	:	0.9596412556053812
1.4	:	0.9596412556053812
1.5	:	0.9596412556053812
1.6	:	0.9551569506726457
1.7	:	0.9551569506726457
1.8	:	0.9551569506726457
1.

Train Final Model

In [107]:
# Uncomment these lines to use the parameter outlined in the report
bestNumFeatures = 1000
bestParam = 1.3

# Train model using optimal parameters
featureSelection=SelectKBest(chi2, k=bestNumFeatures).fit(xTrainArray, yTrainArray)
xTrainNew = featureSelection.transform(xTrain)
svmClass=sklearn.svm.SVC(kernel="rbf",gamma='scale',C=bestParam) # Initialize the SVM model
svmClass.fit(xTrainNew,yTrainArray) # Train the SVM model

Final Results

In [108]:
# Testing Classification
yPredictions = svmClass.predict(featureSelection.transform(xTest))

print(accuracy_score(yTest, yPredictions))

from sklearn.metrics import classification_report
print("\n", classification_report(yTest, yPredictions))

from sklearn.metrics import confusion_matrix
print(confusion_matrix(yTest, yPredictions))

0.9683257918552036

               precision    recall  f1-score   support

           1       0.93      1.00      0.96        40
           2       1.00      1.00      1.00        51
           3       0.90      0.93      0.92        41
           4       1.00      1.00      1.00        38
           5       1.00      0.92      0.96        51

    accuracy                           0.97       221
   macro avg       0.97      0.97      0.97       221
weighted avg       0.97      0.97      0.97       221

[[40  0  0  0  0]
 [ 0 51  0  0  0]
 [ 3  0 38  0  0]
 [ 0  0  0 38  0]
 [ 0  0  4  0 47]]
