### DATA PREPROCESSING

In [17]:
from bs4 import BeautifulSoup
import os
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [18]:
# function to get text from the .html file
def get_text(html_file):
    with open(html_file, 'r', encoding='iso-8859-1') as f:
        soup = BeautifulSoup(f, 'html.parser')
        text = soup.get_text()
        text = text.replace('\n', ' ')
        text = ' '.join(text.split())
    return text

# function to get the bag of words from list of text using count vectorizer
def get_bag_of_words(text_list):
    corpus = []
    for text in text_list:
        text = text.lower()
        text = re.sub('[^a-zA-Z]', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        #lemmatisation  
        text = text.split()
        lemmatizer = WordNetLemmatizer()
        text = [lemmatizer.lemmatize(word) for word in text if not word in set(stopwords.words('english'))]
        text = ' '.join(text)
        corpus.append(text)
    vectorizer = CountVectorizer()
    bag_of_words = vectorizer.fit_transform(corpus)
    tfidf_transformer = TfidfTransformer()
    bag_of_words = tfidf_transformer.fit_transform(bag_of_words)
    return bag_of_words

In [19]:
X_fulltext = []
Y_fulltext = []
X_inlink = []
Y_inlink = []

for filename in os.listdir('course-cotrain-data/fulltext/non-course'):
    for folder in os.listdir('webkb'):
        if folder == 'department':
            continue
        for folder1 in os.listdir('webkb/' + folder):
            if filename in os.listdir('webkb/' + folder + '/' + folder1):
                Y_fulltext.append(folder)
                X_fulltext.append(get_text('webkb/' + folder + '/' + folder1 + '/' + filename))
                break
            else:
                continue

for filename in os.listdir('course-cotrain-data/inlinks/non-course'):
    for folder in os.listdir('webkb'):
        if folder == 'department':
            continue
        for folder1 in os.listdir('webkb/' + folder):
            if filename in os.listdir('webkb/' + folder + '/' + folder1):
                Y_inlink.append(folder)
                X_inlink.append(get_text('course-cotrain-data/inlinks/non-course/'+filename))
                break
            else:
                continue

for filename in os.listdir('course-cotrain-data/fulltext/course'):
    X_fulltext.append(get_text('course-cotrain-data/fulltext/course/'+filename))
    Y_fulltext.append('course')

for filename in os.listdir('course-cotrain-data/inlinks/course'):
    X_inlink.append(get_text('course-cotrain-data/inlinks/course/'+filename))
    Y_inlink.append('course')   

In [20]:
X_fulltext = get_bag_of_words(X_fulltext)
X_inlink = get_bag_of_words(X_inlink)

X_fulltext = X_fulltext.toarray()
X_inlink = X_inlink.toarray()

# label encoding
labelencoder = LabelEncoder()
Y_fulltext = labelencoder.fit_transform(Y_fulltext)
Y_inlink = labelencoder.fit_transform(Y_inlink)

Y_fulltext = np.array(Y_fulltext)
Y_inlink = np.array(Y_inlink)

In [21]:
data = {'x':[], 'y':[]}
for i in range(len(X_fulltext)):
    data['x'].append({'fulltext':X_fulltext[i], 'inlinks':X_inlink[i]})
    data['y'].append(Y_fulltext[i])

print(data['x'][0]['fulltext'].shape)
print(data['x'][0]['inlinks'].shape)
print(data['y'][0])

(14995,)
(1682,)
3


In [22]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(data['x'], data['y'], test_size=0.2, random_state=0,shuffle=True)

In [23]:
# diving X_train into L and U : L has 5 0's, 3 1's, 2 2's, 1 3's, 12 4's
X_train_L = []
X_train_U = []
y_train_L = []
y_train_U = []
X_train_L1 = []
y_train_L1 = []

count_0 = 0
count_1 = 0
count_2 = 0
count_3 = 0
count_4 = 0

for i in range(len(X_train)):
    if y_train[i] == 0 and count_0 < 5:
        X_train_L.append(X_train[i])
        y_train_L.append(y_train[i])
        X_train_L1.append(X_train[i])
        y_train_L1.append(y_train[i])
        count_0 += 1
    elif y_train[i] == 1 and count_1 < 3:
        X_train_L.append(X_train[i])
        y_train_L.append(y_train[i])
        X_train_L1.append(X_train[i])
        y_train_L1.append(y_train[i])
        count_1 += 1
    elif y_train[i] == 2 and count_2 < 2:
        X_train_L.append(X_train[i])
        y_train_L.append(y_train[i])
        X_train_L1.append(X_train[i])
        y_train_L1.append(y_train[i])
        count_2 += 1
    elif y_train[i] == 3 and count_3 < 1:
        X_train_L.append(X_train[i])
        y_train_L.append(y_train[i])
        X_train_L1.append(X_train[i])
        y_train_L1.append(y_train[i])
        count_3 += 1
    elif y_train[i] == 4 and count_4 < 12:
        X_train_L.append(X_train[i])
        y_train_L.append(y_train[i])
        X_train_L1.append(X_train[i])
        y_train_L1.append(y_train[i])
        count_4 += 1
    else:
        X_train_U.append(X_train[i])
        y_train_U.append(y_train[i])

In [24]:
def cotrain(a,b,c,d,e,k,u,X_train_L,X_train_U,y_train_L,y_train_U):
    X_train_L__fulltext = X_train_L
    X_train_L__inlinks = X_train_L
    y_train_L__fulltext = y_train_L
    y_train_L__inlinks = y_train_L
    for _ in range(k):
        clf_fulltext = MultinomialNB()
        clf_inlinks = MultinomialNB()
        fulltext_t = []
        inlinks_t = []

        for i in range(len(X_train_L__fulltext)):
            fulltext_t.append(X_train_L__fulltext[i]['fulltext'])
        
        for i in range(len(X_train_L__inlinks)):
            inlinks_t.append(X_train_L__inlinks[i]['inlinks'])
        
        clf_fulltext.fit(fulltext_t,y_train_L__fulltext)
        clf_inlinks.fit(inlinks_t,y_train_L__inlinks)

        # randomly select u examples from U
        X_train_U_sample_f = []
        y_train_U_sample_f = []
        X_train_U_sample_i = []
        y_train_U_sample_i = []

        for i in range(u):
            # randomly taking u unique examples from U
            index = np.random.randint(len(X_train_U))
            while index in X_train_U_sample_f:
                index = np.random.randint(len(X_train_U))
            X_train_U_sample_f.append(X_train_U[index])
            y_train_U_sample_f.append(y_train_U[index])
            X_train_U_sample_i.append(X_train_U[index])
            y_train_U_sample_i.append(y_train_U[index])
        
        X_train_U_sample_fulltext = []
        X_train_U_sample_inlinks = []

        for i in range(len(X_train_U_sample_f)):
            X_train_U_sample_fulltext.append(X_train_U_sample_f[i]['fulltext'])
            X_train_U_sample_inlinks.append(X_train_U_sample_f[i]['inlinks'])
        
        y_pred_fulltext = clf_fulltext.predict_proba(X_train_U_sample_fulltext)
        y_pred_inlinks = clf_inlinks.predict_proba(X_train_U_sample_inlinks)

        # adding the most confident a 0's, b 1's, c 2's, d 3's, e 4's to X_train_L__fulltext and X_train_L__inlinks

        arr = [a,b,c,d,e]
        for i in range(len(arr)):
            for j in range(arr[i]):
                index = np.argmax(y_pred_fulltext[:,i])
                X_train_L__inlinks.append(X_train_U_sample_f[index])
                y_train_L__inlinks.append(y_train_U_sample_f[index])
                y_pred_fulltext = np.delete(y_pred_fulltext,index,axis=0)
                X_train_U_sample_f = np.delete(X_train_U_sample_f,index,axis=0)
                y_train_U_sample_f = np.delete(y_train_U_sample_f,index,axis=0)

        for i in range(len(arr)):
            for j in range(arr[i]):
                index = np.argmax(y_pred_inlinks[:,i])
                X_train_L__fulltext.append(X_train_U_sample_i[index])
                y_train_L__fulltext.append(y_train_U_sample_i[index])
                y_pred_inlinks = np.delete(y_pred_inlinks,index,axis=0)
                X_train_U_sample_i = np.delete(X_train_U_sample_i,index,axis=0)
                y_train_U_sample_i = np.delete(y_train_U_sample_i,index,axis=0)

    return clf_fulltext,clf_inlinks        


In [25]:
# Getting the trained classifiers
clf_fulltext,clf_inlinks = cotrain(5,3,2,1,12,10,150,X_train_L,X_train_U,y_train_L,y_train_U)

In [30]:
# Testing the classifiers
X_test_fulltext = []
X_test_inlinks = []

for i in range(len(X_test)):
    X_test_fulltext.append(X_test[i]['fulltext'])
    X_test_inlinks.append(X_test[i]['inlinks'])

y_pred_fulltext = clf_fulltext.predict_proba(X_test_fulltext)
y_pred_inlinks = clf_inlinks.predict_proba(X_test_inlinks)

# calculating the accuracy by taking the maximum of the probabilities of the two classifiers
y_pred = []
for i in range(len(y_pred_fulltext)):
    # maximum probability from fulltext classifier
    max_prob1 = np.max(y_pred_fulltext[i])
    index1 = np.argmax(y_pred_fulltext[i])
    # maximum probability from inlinks classifier
    max_prob2 = np.max(y_pred_inlinks[i])
    index2 = np.argmax(y_pred_inlinks[i])
    if max_prob1 > max_prob2:
        y_pred.append(index1)
    else:
        y_pred.append(index2)
    
y_pred = np.array(y_pred)

# calculating the accuracy
count = 0
for i in range(len(y_pred)):
    if y_pred[i] == y_test[i]:
        count += 1

print('Accuracy:',count/len(y_pred))

Accuracy: 0.6619047619047619


In [31]:
# supervised training
# Intiliazing Multinomial Naive Bayes classifiers for each view
clf_fulltext_sup = MultinomialNB()
clf_inlinks_sup = MultinomialNB()

fulltext_t = []
inlinks_t = []

for i in range(len(X_train_L1)):
    fulltext_t.append(X_train_L1[i]['fulltext'])
    inlinks_t.append(X_train_L1[i]['inlinks'])

# Training the classifiers on the labeled data
clf_fulltext_sup.fit(fulltext_t, y_train_L1)
clf_inlinks_sup.fit(inlinks_t, y_train_L1)

y_pred_fulltext = clf_fulltext_sup.predict_proba(X_test_fulltext)
y_pred_inlinks = clf_inlinks_sup.predict_proba(X_test_inlinks)

# calculating the accuracy by taking the maximum of the probabilities of the two classifiers
y_pred = []
for i in range(len(y_pred_fulltext)):
    # maximum probability from fulltext classifier
    max_prob1 = np.max(y_pred_fulltext[i])
    index1 = np.argmax(y_pred_fulltext[i])
    # maximum probability from inlinks classifier
    max_prob2 = np.max(y_pred_inlinks[i])
    index2 = np.argmax(y_pred_inlinks[i])
    if max_prob1 > max_prob2:
        y_pred.append(index1)
    else:
        y_pred.append(index2)
    
y_pred = np.array(y_pred)

# calculating the accuracy
count = 0
for i in range(len(y_pred)):
    if y_pred[i] == y_test[i]:
        count += 1

print('Accuracy:',count/len(y_pred))

Accuracy: 0.49047619047619045
