Importing the necessary libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.naive_bayes import MultinomialNB #this is for the model type
from sklearn import metrics
from nltk.corpus import stopwords
import re
import string
from nltk.stem import WordNetLemmatizer

In [2]:
# Fetch the NLTK resources this notebook relies on: the stop-word corpus
# (read via stopwords.words('english')) and the WordNet data that backs
# WordNetLemmatizer.
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

Reading the scraped dataset and preparing it for modelling

In [3]:
# Target dtype for every column used downstream: text columns as str and the
# label column as int (a negative category marks an unlabelled row).
convert_dict = dict(title=str, summary=str, category=int)

# Read the scraped articles and cast the columns in a single chained expression.
dataset = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/CRUx/Task 1/random articles cleaned.csv"
).astype(convert_dict)

dataset.head()

Unnamed: 0,title,summary,category
0,Neutral Bay ferry wharf,Neutral Bay ferry wharf is located on Neutral ...,0
1,Mike Hamlin,Mike Hamlin 19352017 was an American labor act...,0
2,The Dead Girl,The Dead Girl is a 2006 American drama thrille...,0
3,List of Important Cultural Properties of Japan...,This list is of Japanese structures dating fro...,0
4,Charles F. Carpentier,Charles Francis Carpentier September 19 1896 ...,0


In [4]:
# Class balance of the full dataset; rows with a negative category are the
# ones later treated as unlabelled.
dataset['category'].value_counts()

-1    1433
 0      33
 1      32
Name: category, dtype: int64

In [5]:
# Split the data with explicit boolean masks instead of unpacking a groupby.
# The groupby version depended on the implicit False/True ordering of the
# groups and raised ValueError whenever one of the two groups was empty.
is_unlabelled = dataset['category'] < 0
labelled_set = dataset[~is_unlabelled]    # category >= 0: rows with a known class
unlabelled_set = dataset[is_unlabelled]   # category < 0: rows without a class

In [6]:
# Sanity check: the labelled subset should only contain non-negative categories.
labelled_set['category'].value_counts()

0    33
1    32
Name: category, dtype: int64

In [7]:
# Sanity check: the unlabelled subset should only contain negative categories.
unlabelled_set['category'].value_counts()

-1    1433
Name: category, dtype: int64

Splitting the labelled data into train and test sets. These will be used to train and evaluate a Naive Bayes classifier using the standard supervised learning approach.

In [8]:
# Split the labelled summaries/categories with a fixed seed, converting each of
# the four returned pieces to a NumPy array so they can be indexed positionally.
X_train, X_test, y_train, y_test = (
    np.array(part)
    for part in train_test_split(
        labelled_set['summary'],
        labelled_set['category'],
        random_state=2,
    )
)

In [9]:
# Text-normalisation helpers shared by the pre-processing cells:
# a WordNet lemmatizer and the English stop-word list.
lemmatizer = WordNetLemmatizer()
stopword_list = stopwords.words('english')

In [10]:
def clean_summary(text, stop_words, word_lemmatizer):
  """Normalise one summary for vectorisation.

  Every non-letter character is replaced by a space, the text is lower-cased
  and split on whitespace, stop words are dropped, and the remaining words are
  lemmatized and joined back together with single spaces.

  Args:
    text: raw summary string.
    stop_words: collection of words to drop (a set gives O(1) lookups).
    word_lemmatizer: object exposing ``lemmatize(word)``.

  Returns:
    The cleaned summary as a single space-separated string.
  """
  words = re.sub(r'[^a-zA-Z]', ' ', text).lower().split()
  return ' '.join(word_lemmatizer.lemmatize(word) for word in words if word not in stop_words)


# Build the stop-word set once; the previous code rebuilt it for every word.
stopword_set = set(stopword_list)

#pre-processing the summary
X_train_non = [clean_summary(summary, stopword_set, lemmatizer) for summary in X_train]
X_test_non = [clean_summary(summary, stopword_set, lemmatizer) for summary in X_test]

In [11]:
# Spot-check one pre-processed test summary.
X_test_non[7]

'dolichopus tenuipes specie longlegged fly family dolichopodidae reference'

In [12]:
# TF-IDF features for the supervised baseline: the vocabulary is learned from
# the training summaries only and then re-used to encode the test summaries.
tf_idf = TfidfVectorizer()
tfidf_train = tf_idf.fit_transform(X_train_non)
tfidf_test = tf_idf.transform(X_test_non)
print(tfidf_train.shape, tfidf_test.shape, sep='\n')

(48, 1162)
(17, 1162)


In [13]:
# Supervised baseline: multinomial Naive Bayes trained on the TF-IDF features
# (fit() returns the estimator itself, so construction and training are chained).
naive_bayes_classifier = MultinomialNB().fit(tfidf_train, y_train)

# Predict the held-out split and print per-class precision/recall/F1.
y_pred = naive_bayes_classifier.predict(tfidf_test)
supervised_report = metrics.classification_report(
    y_test,
    y_pred,
    target_names=['Non STEM', 'STEM'],
)
print(supervised_report)

              precision    recall  f1-score   support

    Non STEM       1.00      0.80      0.89        10
        STEM       0.78      1.00      0.88         7

    accuracy                           0.88        17
   macro avg       0.89      0.90      0.88        17
weighted avg       0.91      0.88      0.88        17



Now we will train a self-training model with Naive Bayes as the base estimator. To evaluate the model, a separate dataset is loaded for testing purposes.

In [14]:
# Separate evaluation set for the self-training model; its 'summary' and
# 'category' columns are read further down.
# NOTE(review): unlike `dataset`, this frame is not cast with astype(), so the
# pre-processing below presumably relies on every summary already being a
# string - confirm there are no missing values.
test_dataset = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/CRUx/Task 1/testdata.csv")

In [15]:
x_train_SL, x_test_SL = [], []

# Build the stop-word set once instead of once per word.
_stopword_set = set(stopword_list)

#pre-processing the summary
# The same normalisation is applied to both frames: replace non-letters with
# spaces, lower-case, split, drop stop words, lemmatize, re-join.
# Iterating over the column values (rather than indexing with series[i]) keeps
# this correct even when a frame does not have a default 0..n-1 index.
for _summaries, _cleaned in ((dataset['summary'], x_train_SL),
                             (test_dataset['summary'], x_test_SL)):
  for _text in _summaries:
    _words = re.sub(r'[^a-zA-Z]', ' ', _text).lower().split()
    _cleaned.append(' '.join(lemmatizer.lemmatize(word) for word in _words if word not in _stopword_set))

In [16]:
# Spot-check one pre-processed summary from the self-training input.
x_train_SL[1]

'mike hamlin american labor activist social workerhamlin born mississippi moved ecorse michigan outside detroit hamlin featured documentary finally got news document formation movement practice philosophy league revolutionary black worker lrbw marxistleninist organization black worker detroit cofounded'

In [18]:
# Use a dedicated vectoriser for the self-training data. Refitting the shared
# `tf_idf` here would replace the vocabulary that `naive_bayes_classifier` was
# trained on, leaving that vectoriser/model pair inconsistent.
tf_idf_SL = TfidfVectorizer()

# NOTE: x_train_SL / x_test_SL are rebound from lists of cleaned text to sparse
# TF-IDF matrices, so re-run the pre-processing cell before re-running this one.
x_train_SL = tf_idf_SL.fit_transform(x_train_SL)
x_test_SL = tf_idf_SL.transform(x_test_SL)
print(x_train_SL.shape)
print(x_test_SL.shape)

(1498, 16540)
(691, 16540)


Using the Naive Bayes model as the base estimator, we create a self-training classifier with the SelfTrainingClassifier class from the sklearn library.

In [19]:
# Labels for semi-supervised training: the full category column, in which
# unlabelled rows carry -1.
semi_supervised_labels = dataset['category']

# Wrap a fresh multinomial Naive Bayes model in the self-training meta-estimator
# and fit it on all rows (labelled and unlabelled together).
naive_bays_selflearn = MultinomialNB()
self_training_model = SelfTrainingClassifier(naive_bays_selflearn)
self_training_model.fit(x_train_SL, semi_supervised_labels)

In [20]:
# Evaluate the self-training model on the separate test set and print
# per-class precision/recall/F1.
y_pred_SL = self_training_model.predict(x_test_SL)

self_training_report = metrics.classification_report(
    test_dataset['category'],
    y_pred_SL,
    target_names=['Non STEM', 'STEM'],
)
print(self_training_report)

              precision    recall  f1-score   support

    Non STEM       0.49      0.83      0.62       266
        STEM       0.81      0.46      0.58       425

    accuracy                           0.60       691
   macro avg       0.65      0.64      0.60       691
weighted avg       0.69      0.60      0.60       691

