In [1]:
import pandas as pd
import nltk
import re
import numpy as np
import matplotlib.pyplot as plt

In [35]:
df_train = pd.read_csv('./datasets/webkb-train-stemmed.txt', header=None, sep='\t', names=['category', 'text'])
df_train = df_train.dropna()
df_train.head()

Unnamed: 0,category,text
0,student,brian comput scienc depart univers wisconsin d...
1,student,denni swanson web page mail pop uki offic hour...
2,faculty,russel impagliazzo depart comput scienc engin ...
3,student,dave phd student depart comput scienc univers ...
4,project,center lifelong learn design univers colorado ...


In [3]:
text_train = df_train.drop(['category'],axis=1)
y_train = df_train['category']

text_train, y_train

(                                                   text
 0     brian comput scienc depart univers wisconsin d...
 1     denni swanson web page mail pop uki offic hour...
 2     russel impagliazzo depart comput scienc engin ...
 3     dave phd student depart comput scienc univers ...
 4     center lifelong learn design univers colorado ...
 ...                                                 ...
 2798  faster harder kill laboratori experiment softw...
 2799  previou content steven faculti research guid p...
 2800  sandeep graduat student studi comput scienc cl...
 2801  web oper system uniqu mwf tai recent explos in...
 2802  rami melhem professor dept comput scienc phone...
 
 [2785 rows x 1 columns],
 0       student
 1       student
 2       faculty
 3       student
 4       project
          ...   
 2798    project
 2799    faculty
 2800    student
 2801     course
 2802    faculty
 Name: category, Length: 2785, dtype: object)

In [4]:
y_train.value_counts()

student    1085
faculty     745
course      620
project     335
Name: category, dtype: int64

In [5]:
df_test = pd.read_csv('datasets/webkb-test-stemmed.txt', header = None, sep='\t', names = ['category', 'text'])
df_test = df_test.dropna()
text_test = df_test.drop(['category'],axis=1)
y_test = df_test['category']

y_test.value_counts()

student    540
faculty    371
course     306
project    166
Name: category, dtype: int64

In [6]:
# Since dataset is already stemmed, only stop-word removal will be applied
def text_preprocess(articles: pd.DataFrame) -> list:
    corpus=[]
    for i in range(0,len(articles)):
        review = re.sub('[^a-zA-Z]', ' ', articles['text'][i]) #replacing all charecters apart from A-Z,a-z with ' '
        review = review.lower()
        review = nltk.word_tokenize(review)

        review = [word for word in review if not word in nltk.corpus.stopwords.words('english')]
        review = ' '.join(review)
        corpus.append(review)
    return corpus

In [7]:
articles_train = text_train.copy()
articles_train.reset_index(inplace=True)
articles_train

Unnamed: 0,index,text
0,0,brian comput scienc depart univers wisconsin d...
1,1,denni swanson web page mail pop uki offic hour...
2,2,russel impagliazzo depart comput scienc engin ...
3,3,dave phd student depart comput scienc univers ...
4,4,center lifelong learn design univers colorado ...
...,...,...
2780,2798,faster harder kill laboratori experiment softw...
2781,2799,previou content steven faculti research guid p...
2782,2800,sandeep graduat student studi comput scienc cl...
2783,2801,web oper system uniqu mwf tai recent explos in...


In [8]:
articles_test = text_test.copy()
articles_test.reset_index(inplace=True)
articles_test

Unnamed: 0,index,text
0,0,eric homepag eric wei tsinghua physic fudan genet
1,1,comput system perform evalu model new sept ass...
2,2,home page comput scienc grad student ucsd work...
3,3,toni web page toni face thing call toni studen...
4,4,ec advanc comput architectur credit parallel a...
...,...,...
1378,1391,scott pictur background scott phd student coll...
1379,1392,advanc oper system fall marvin offic comput sc...
1380,1393,human robot hand group head kenneth salisburi ...
1381,1394,databas manag system design implement inform p...


In [9]:
X_train = text_preprocess(articles_train)
X_train[0]

'brian comput scienc depart univers wisconsin dayton street madison offic email wisc offic phone home phone advisor david wood tabl content interest schedul summer public interest profession comput architectur oper system compil high speed network distribut parallel system secur account high perform person bicycl walk hike camp travel home brew cook comput electron read schedul mondai wwt meet wednesdai meet david cow meet milwauke brian heidi wed madison comput architectur affili meet chicago base public journal articl foster perform massiv parallel comput spectral atmospher model atmospher ocean technolog byte drake foster design perform scalabl parallel commun climat model parallel comput decemb byte proceed paper foster algorithm comparison benchmark parallel spectral transform water model sixth workshop parallel process meteorolog ed world scientif singapor byte drake foster hack williamson adapt scalabl parallel comput proceed global chang symposium american meteorolog societi by

In [10]:
X_test = text_preprocess(articles_test)
X_test[0]

'eric homepag eric wei tsinghua physic fudan genet'

# Word2Vec

In [11]:
import gensim
from gensim.models import Word2Vec

In [23]:
model = gensim.models.Word2Vec(X_train, vector_size=300, window=5, min_count=1, sg=1, epochs=30)

In [26]:
words = set(model.wv.index_to_key)
X_train_vect = np.array([np.array([model.wv[i] for i in ls if i in words]) for ls in X_train], dtype=object)
X_test_vect = np.array([np.array([model.wv[i] for i in ls if i in words]) for ls in X_test], dtype=object)


In [29]:
X_train_vect_avg = []
for v in X_train_vect:
    if v.size:
        X_train_vect_avg.append(v.mean(axis=0))
    else:
        X_train_vect_avg.append(np.zeros(100, dtype=float))

X_test_vect_avg = []
for v in X_test_vect:
    if v.size:
        X_test_vect_avg.append(v.mean(axis=0))
    else:
        X_test_vect_avg.append(np.zeros(100, dtype=float))


In [33]:
from sklearn.svm import LinearSVC


svm = LinearSVC()
svm_model = svm.fit(X_train_vect_avg, y_train)


In [34]:
y_pred = svm_model.predict(X_test_vect_avg)

In [36]:
from sklearn.metrics import accuracy_score, classification_report, f1_score, matthews_corrcoef


print(classification_report(y_test, y_pred))
print('\nAccuracy: ', accuracy_score(y_test, y_pred))
print('MCC: ', matthews_corrcoef(y_test, y_pred))
print('f1-micro: ', f1_score(y_test, y_pred, average='micro'))

              precision    recall  f1-score   support

      course       0.75      0.02      0.04       306
     faculty       0.48      0.22      0.30       371
     project       0.00      0.00      0.00       166
     student       0.42      0.94      0.58       540

    accuracy                           0.43      1383
   macro avg       0.41      0.30      0.23      1383
weighted avg       0.46      0.43      0.32      1383


Accuracy:  0.4309472161966739
MCC:  0.14089785520608775
f1-micro:  0.4309472161966739


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
