#### Data Preprocessing


In [1]:
from bs4 import BeautifulSoup
import os
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB



In [2]:
# function to get text from the .html file
def get_text(html_file):
    with open(html_file, 'r', encoding='iso-8859-1') as f:
        soup = BeautifulSoup(f, 'html.parser')
        text = soup.get_text()
        text = text.replace('\n', ' ')
        text = ' '.join(text.split())
    return text

# function to get the bag of words from list of text using count vectorizer
def get_bag_of_words(text_list):
    corpus = []
    for text in text_list:
        text = text.lower()
        text = re.sub('[^a-zA-Z]', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        #lemmatisation  
        text = text.split()
        lemmatizer = WordNetLemmatizer()
        text = [lemmatizer.lemmatize(word) for word in text if not word in set(stopwords.words('english'))]
        text = ' '.join(text)
        corpus.append(text)
    vectorizer = CountVectorizer()
    bag_of_words = vectorizer.fit_transform(corpus)
    tfidf_transformer = TfidfTransformer()
    bag_of_words = tfidf_transformer.fit_transform(bag_of_words)
    return bag_of_words

In [3]:
X_fulltext = []
y_fulltext = []
X_inlinks = []
y_inlinks = []


for filename in os.listdir('course-cotrain-data/fulltext/course'):
    X_fulltext.append(get_text('course-cotrain-data/fulltext/course/'+filename))
    y_fulltext.append('course')

for filename in os.listdir('course-cotrain-data/fulltext/non-course'):
    X_fulltext.append(get_text('course-cotrain-data/fulltext/non-course/'+filename))
    y_fulltext.append('noncourse')

for filename in os.listdir('course-cotrain-data/inlinks/course'):
    X_inlinks.append(get_text('course-cotrain-data/inlinks/course/'+filename))
    y_inlinks.append('course')

for filename in os.listdir('course-cotrain-data/inlinks/non-course'):
    X_inlinks.append(get_text('course-cotrain-data/inlinks/non-course/'+filename))
    y_inlinks.append('noncourse')

print(len(X_fulltext))
print(len(y_fulltext))
print(len(X_inlinks))
print(len(y_inlinks))

X_fulltext = get_bag_of_words(X_fulltext)
X_inlinks = get_bag_of_words(X_inlinks)

print(X_fulltext.shape)
print(X_inlinks.shape)

X_fulltext = X_fulltext.toarray()
X_inlinks = X_inlinks.toarray()

y_fulltext = [1 if x=='course' else 0 for x in y_fulltext]
y_inlinks = [1 if x=='course' else 0 for x in y_inlinks]

y_fulltext = np.array(y_fulltext)
y_inlinks = np.array(y_inlinks)

1051
1051
1051
1051
(1051, 14996)
(1051, 1725)


In [4]:
data = {'x':[], 'y':[]}
for i in range(X_fulltext.shape[0]):
    data['x'].append({'fulltext':X_fulltext[i],'inlinks':X_inlinks[i]})
    data['y'].append(y_fulltext[i])

print(data['x'][0]['fulltext'].shape)
print(data['x'][0]['inlinks'].shape)

print(data['y'][0])

(14996,)
(1725,)
1


In [5]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(data['x'], data['y'], test_size=0.2, random_state=0,shuffle=True)

In [6]:
# diving X_train into L and U : L has 9 non-course(0) and 3 course(1) and U has the rest
X_train_L = []
X_train_L1 = []
X_train_U = []
y_train_L = []
y_train_L1 = []
y_train_U = []

count_0 = 0
count_1 = 0

for i in range(len(X_train)):
    if y_train[i] == 0 and count_0 < 9:
        X_train_L.append(X_train[i])
        X_train_L1.append(X_train[i])
        y_train_L.append(y_train[i])
        y_train_L1.append(y_train[i])
        count_0 += 1
    elif y_train[i] == 1 and count_1 < 3:
        X_train_L.append(X_train[i])
        X_train_L1.append(X_train[i])
        y_train_L.append(y_train[i])
        y_train_L1.append(y_train[i])
        count_1 += 1
    else:
        X_train_U.append(X_train[i])
        y_train_U.append(y_train[i])
    
     
print(len(X_train_L))
print(len(X_train_U))
print(len(y_train_L))
print(len(y_train_U))

print(y_train_L.count(0))
print(y_train_L.count(1))
print(y_train_U.count(0))
print(y_train_U.count(1))

12
828
12
828
9
3
632
196


In [7]:
# cotraining algorithm
def cotrain(p,n,k,u,X_train_L,X_train_U,y_train_L,y_train_U):
    
    for _ in range(k):
        # Intiliazing Multinomial Naive Bayes classifiers for each view 
        clf_fulltext = MultinomialNB()
        clf_inlinks = MultinomialNB()

        fulltext_t = []
        inlinks_t = []

        for i in range(len(X_train_L)):
            fulltext_t.append(X_train_L[i]['fulltext'])
            inlinks_t.append(X_train_L[i]['inlinks'])


        # Training the classifiers on the labeled data
        clf_fulltext.fit(fulltext_t, y_train_L)
        clf_inlinks.fit(inlinks_t, y_train_L)

        # randomly select u examples from U
        X_train_U_sample = []
        y_train_U_sample = []

        for i in range(u):
            # randomly taking u unique examples from U
            index = np.random.randint(len(X_train_U))
            while index in X_train_U_sample:
                index = np.random.randint(len(X_train_U))
            X_train_U_sample.append(X_train_U[index])
            y_train_U_sample.append(y_train_U[index])

        X_train_U_sample_fulltext = []
        X_train_U_sample_inlinks = []

        for i in range(len(X_train_U_sample)):
            X_train_U_sample_fulltext.append(X_train_U_sample[i]['fulltext'])
            X_train_U_sample_inlinks.append(X_train_U_sample[i]['inlinks'])
        
        y_pred_fulltext = clf_fulltext.predict_proba(X_train_U_sample_fulltext)
        y_pred_inlinks = clf_inlinks.predict_proba(X_train_U_sample_inlinks)

        # adding the most confident n negative examples to L
        for i in range(n):
            index = np.argmax(y_pred_fulltext[:,0])
            X_train_L.append(X_train_U_sample[index])
            y_train_L.append(y_train_U_sample[index])
            y_pred_fulltext = np.delete(y_pred_fulltext,index,0)
            X_train_U_sample = np.delete(X_train_U_sample,index,0)
            y_train_U_sample = np.delete(y_train_U_sample,index,0)
            y_pred_inlinks = np.delete(y_pred_inlinks,index,0)

        
        # adding the most confident p positive examples to L
        for i in range(p):
            index = np.argmax(y_pred_fulltext[:,1])
            X_train_L.append(X_train_U_sample[index])
            y_train_L.append(y_train_U_sample[index])
            y_pred_fulltext = np.delete(y_pred_fulltext,index,0)
            X_train_U_sample = np.delete(X_train_U_sample,index,0)
            y_train_U_sample = np.delete(y_train_U_sample,index,0)
            y_pred_inlinks = np.delete(y_pred_inlinks,index,0)

        # adding the most confident n negative examples to L
        for i in range(n):
            index = np.argmax(y_pred_inlinks[:,0])
            X_train_L.append(X_train_U_sample[index])
            y_train_L.append(y_train_U_sample[index])            
            y_pred_inlinks = np.delete(y_pred_inlinks,index,0)
            X_train_U_sample = np.delete(X_train_U_sample,index,0)
            y_train_U_sample = np.delete(y_train_U_sample,index,0)

        # adding the most confident p positive examples to L
        for i in range(p):
            index = np.argmax(y_pred_inlinks[:,1])
            X_train_L.append(X_train_U_sample[index])
            y_train_L.append(y_train_U_sample[index])
            y_pred_inlinks = np.delete(y_pred_inlinks,index,0)
            X_train_U_sample = np.delete(X_train_U_sample,index,0)
            y_train_U_sample = np.delete(y_train_U_sample,index,0)
            
    return clf_fulltext, clf_inlinks

In [8]:
# Getting the trained classifiers
clf_fulltext, clf_inlinks = cotrain(1,3,30,75,X_train_L,X_train_U,y_train_L,y_train_U)

In [9]:
# Testing the classifiers
X_test_fulltext = []
X_test_inlinks = []

for i in range(len(X_test)):
    X_test_fulltext.append(X_test[i]['fulltext'])
    X_test_inlinks.append(X_test[i]['inlinks'])

y_pred_fulltext = clf_fulltext.predict_proba(X_test_fulltext)
y_pred_inlinks = clf_inlinks.predict_proba(X_test_inlinks)

# calculating the accuracy by taking the maximum of the probabilities of the two classifiers

y_pred = []
for i in range(len(y_pred_fulltext)):
    # maximum probability from fulltext classifier
    max_prob1 = max(y_pred_fulltext[i])
    index1 = np.argmax(y_pred_fulltext[i])
    # maximum probability from inlinks classifier
    max_prob2 = max(y_pred_inlinks[i])
    index2 = np.argmax(y_pred_inlinks[i])
    # taking the maximum of the two probabilities
    if max_prob1 > max_prob2:
        y_pred.append(index1)
    else:
        y_pred.append(index2)

# calculating the accuracy
count = 0
for i in range(len(y_pred)):
    if y_pred[i] == y_test[i]:
        count += 1

print("Accuracy: ")
print(count/len(y_pred_fulltext))

Accuracy: 
0.9241706161137441


In [10]:
# supervised training
# Intiliazing Multinomial Naive Bayes classifiers for each view
clf_fulltext_sup = MultinomialNB()
clf_inlinks_sup = MultinomialNB()

fulltext_t = []
inlinks_t = []

for i in range(len(X_train_L1)):
    fulltext_t.append(X_train_L1[i]['fulltext'])
    inlinks_t.append(X_train_L1[i]['inlinks'])

# Training the classifiers on the labeled data
clf_fulltext_sup.fit(fulltext_t, y_train_L1)
clf_inlinks_sup.fit(inlinks_t, y_train_L1)

X_test_fulltext = []
X_test_inlinks = []

for i in range(len(X_test)):
    X_test_fulltext.append(X_test[i]['fulltext'])
    X_test_inlinks.append(X_test[i]['inlinks'])

y_pred_fulltext = clf_fulltext_sup.predict_proba(X_test_fulltext)
y_pred_inlinks = clf_inlinks_sup.predict_proba(X_test_inlinks)

y_pred= []
# Calculating the accuracy
count = 0
for i in range(len(y_pred_fulltext)):
    # maximum probability from fulltext classifier
    max_prob1 = max(y_pred_fulltext[i])
    index1 = np.argmax(y_pred_fulltext[i])
    # maximum probability from inlinks classifier
    max_prob2 = max(y_pred_inlinks[i])
    index2 = np.argmax(y_pred_inlinks[i])
    # taking the maximum of the two probabilities
    if max_prob1 > max_prob2:
        y_pred.append(index1)
    else:
        y_pred.append(index2)
    if y_pred[i] == y_test[i]:
        count += 1

print("Accuracy: ")
print(count/len(y_pred_fulltext))

Accuracy: 
0.8530805687203792
