# Importing necessary libraries

In [1]:
from sklearn.datasets import load_files
import sklearn.svm as svm
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from gensim.models import KeyedVectors
from nltk.corpus import stopwords 
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
from gensim.models import Word2Vec

# Loading data

In [2]:
# Read the corpus from the local 'bbc' directory as UTF-8 text; undecodable
# bytes are replaced rather than raising.
articles = load_files('bbc', decode_error='replace', encoding='utf-8')

# X holds the document texts, y the corresponding target values.
X, y = articles.data, articles.target

# Observations

1. Three approaches for document representation were used in this experiment: TFIDF, Word2Vec, and pre-computed word embeddings.
2. The data were split into training and test sets in a 70:30 ratio.
3. An SVM classifier with a linear kernel was used.
4. Pre-computed word embeddings had the best scores, followed by TFIDF and then Word2Vec (see the Performance Comparison section below).

# Train Test Split   70:30 Ratio

In [3]:
# Hold out 30% of the documents for testing; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.30,
)

# => Using TFIDF Representation

In [4]:
# NLTK's English stop-word list, de-duplicated into a set.
my_stop_words = set(stopwords.words('english'))

# Represent each document as a TF-IDF vector with at most 1000 features.
# scikit-learn documents `stop_words` as 'english', a list, or None, and recent
# releases reject a set during parameter validation, so a list is passed
# instead (sorted only to make the argument deterministic).
vectorizer = TfidfVectorizer(
    norm=None,
    stop_words=sorted(my_stop_words),
    max_features=1000,
    decode_error="ignore",
)

# Fit the vocabulary and IDF weights on the training split only, then
# transform that split.
X_train_vectors = vectorizer.fit_transform(X_train)

### Applying SVM classifier

In [6]:
# Linear-kernel SVM (C=1) fitted on the TF-IDF training matrix.
svm1 = svm.SVC(C=1, kernel='linear').fit(X_train_vectors, y_train)

# Vectorise the test documents with the already-fitted vectorizer, then predict.
y_preds = svm1.predict(
    vectorizer.transform(X_test)
)

### Micro and Macro Performance Metrics

In [7]:
# Macro-averaged precision / recall / F-score for the TF-IDF model.
# `pp` receives the fourth returned value and is not used by the visible cells.
p1, r1, f1, pp = precision_recall_fscore_support(
    y_true=y_test, y_pred=y_preds, average='macro'
)

# Micro-averaged precision / recall / F-score for the same predictions.
p2, r2, f2, pp = precision_recall_fscore_support(
    y_true=y_test, y_pred=y_preds, average='micro'
)

# =>Using Word2Vec Representation

### Tokenisation

In [8]:
# Whitespace tokenisation: each document becomes the list of its
# whitespace-separated tokens (no lower-casing or punctuation stripping).
X_train_tokens = [document.split() for document in X_train]
X_test_tokens = [document.split() for document in X_test]

### Model generation using Gensim Word2Vec

In [9]:
# Build a skip-gram Word2Vec model from the whitespace-tokenised training documents.
model = Word2Vec(X_train_tokens,size=300,window=2,sg=1,min_count=1,alpha=0.15)
# X_train_tokens: the corpus, one list of tokens per training document
# size: dimension of each word vector (300 here)
# window: size of the context window
# sg: 1 for skip-gram, 0 for CBOW
# min_count: minimum frequency a word needs to be considered (1 keeps every token)
# alpha: (initial) learning rate
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4 renamed it to
# `vector_size` -- confirm which gensim version this notebook targets.

# Run 4 more epochs over the same training tokens.
# NOTE(review): gensim's constructor presumably already trains when it is given
# a corpus, so this call would add extra epochs on top -- confirm this is intended.
model.train(X_train_tokens,total_examples=len(X_train_tokens),epochs=4)
print('Done')

Done


### Generation of document vectors for train and test data

In [10]:
def average_word_vectors(token_lists, keyed_vectors):
    """Turn tokenised documents into fixed-length document vectors.

    Each document vector is the sum of the embeddings of the document's
    in-vocabulary tokens divided by the document's total token count.
    Out-of-vocabulary tokens add nothing to the sum but still count in the
    denominator, exactly as in the original per-cell loops.

    Parameters
    ----------
    token_lists : iterable of list of str
        One token list per document.
    keyed_vectors : object
        Supports ``keyed_vectors[token]`` (raising ``KeyError`` for unknown
        tokens) and exposes the embedding width as ``vector_size``.

    Returns
    -------
    list of numpy.ndarray
        One float64 vector of length ``keyed_vectors.vector_size`` per
        document. A document with no tokens maps to the zero vector instead
        of a NaN vector produced by dividing by zero.
    """
    dim = keyed_vectors.vector_size
    doc_vectors = []
    for tokens in token_lists:
        # Accumulate sequentially in float64, as the original loop did.
        total = np.zeros(dim)
        for token in tokens:
            try:
                total += keyed_vectors[token]
            except KeyError:
                # Unknown token: skip it. Any other error is a real problem
                # and is allowed to surface instead of being swallowed.
                continue
        doc_vectors.append(total / len(tokens) if tokens else total)
    return doc_vectors


# Look words up through `model.wv`: indexing the Word2Vec model object directly
# is deprecated in gensim 3.x and no longer supported in gensim 4, where the
# old bare `except` would have silently produced all-zero document vectors.
X_train_vectors1 = average_word_vectors(X_train_tokens, model.wv)
X_test_vectors1 = average_word_vectors(X_test_tokens, model.wv)
        

  


### Applying SVM Classifier

In [13]:
# Linear-kernel SVM (C=1) fitted on the Word2Vec document vectors.
svm2 = svm.SVC(C=1, kernel='linear').fit(X_train_vectors1, y_train)

# Predict labels for the Word2Vec test vectors.
y_preds2 = svm2.predict(X_test_vectors1)

### Micro and Macro Performance Metrics

In [14]:
# Macro-averaged precision / recall / F-score for the Word2Vec model.
# `pp` receives the fourth returned value and is not used by the visible cells.
p3, r3, f3, pp = precision_recall_fscore_support(
    y_true=y_test, y_pred=y_preds2, average='macro'
)

# Micro-averaged precision / recall / F-score for the same predictions.
p4, r4, f4, pp = precision_recall_fscore_support(
    y_true=y_test, y_pred=y_preds2, average='micro'
)

# =>Using Pre-computed Word Embeddings

In [15]:
# Load the pre-computed Google News word embeddings from the binary
# word2vec-format file (path is relative to this notebook's directory).
model2 = KeyedVectors.load_word2vec_format(
    '../Assignment 7/GoogleNews-vectors-negative300.bin',
    binary=True,
)

### Generation of document vectors for train and test data

In [16]:
# Build document vectors from the pre-computed embeddings: sum the vectors of
# the tokens found in `model2` and divide by the document's total token count
# (tokens missing from the vocabulary still count in the denominator).
embedding_dim = model2.vector_size  # replaces the hard-coded 300
X_train_vectors2 = []
X_test_vectors2 = []

# One loop serves both splits instead of two copy-pasted loops.
for token_lists, doc_vectors in (
    (X_train_tokens, X_train_vectors2),
    (X_test_tokens, X_test_vectors2),
):
    for tokens in token_lists:
        # Accumulate sequentially in float64, as the original loop did.
        total = np.zeros(embedding_dim)
        for token in tokens:
            try:
                total += model2[token]
            except KeyError:
                # Token is absent from the pre-trained vocabulary: skip it.
                # Other exceptions are no longer swallowed by a bare `except`.
                continue
        # An empty document maps to the zero vector rather than dividing by
        # zero and yielding NaNs.
        doc_vectors.append(total / len(tokens) if tokens else total)

### Applying SVM classifier

In [17]:
# Linear-kernel SVM (C=1) fitted on the pre-computed-embedding document vectors.
svm3 = svm.SVC(C=1, kernel='linear').fit(X_train_vectors2, y_train)

# Predict labels for the corresponding test vectors.
y_preds3 = svm3.predict(X_test_vectors2)

### Micro and Macro Performance Metrics

In [18]:
# Macro-averaged precision / recall / F-score for the pre-computed embeddings.
# `pp` receives the fourth returned value and is not used by the visible cells.
p5, r5, f5, pp = precision_recall_fscore_support(
    y_true=y_test, y_pred=y_preds3, average='macro'
)

# Micro-averaged precision / recall / F-score for the same predictions.
p6, r6, f6, pp = precision_recall_fscore_support(
    y_true=y_test, y_pred=y_preds3, average='micro'
)

# Performance Comparison

### Macro Metrics

In [20]:
# Macro-averaged scores, one (heading, precision, recall, F1) row per representation.
macro_rows = [
    ("for TFIDF", p1, r1, f1),
    ("for Word2Vec", p3, r3, f3),
    ("for pre-computed Word2Vec", p5, r5, f5),
]

for position, (heading, precision, recall, f_score) in enumerate(macro_rows):
    print(heading)
    print('Precision :', precision)
    print('Recall :', recall)
    if position < len(macro_rows) - 1:
        # Every section except the last is followed by a blank separator line.
        print('F1 score :', f_score, '\n')
    else:
        print('F1 score :', f_score)


for TFIDF
Precision : 0.9607183743801228
Recall : 0.9599051644700124
F1 score : 0.9602894187113498 

for Word2Vec
Precision : 0.9521490788417697
Recall : 0.9544808110235709
F1 score : 0.9530940172673255 

for pre-computed Word2Vec
Precision : 0.9645029790997534
Recall : 0.9632502256105699
F1 score : 0.9635566688970523


### Micro Metrics

In [21]:
# Micro-averaged scores, one (heading, precision, recall, F1) row per representation.
micro_rows = [
    ("for TFIDF", p2, r2, f2),
    ("for Word2Vec", p4, r4, f4),
    ("for pre-computed Word2Vec", p6, r6, f6),
]

for position, (heading, precision, recall, f_score) in enumerate(micro_rows):
    print(heading)
    print('Precision :', precision)
    print('Recall :', recall)
    if position < len(micro_rows) - 1:
        # Every section except the last is followed by a blank separator line.
        print('F1 score :', f_score, '\n')
    else:
        print('F1 score :', f_score)

for TFIDF
Precision : 0.9610778443113772
Recall : 0.9610778443113772
F1 score : 0.9610778443113772 

for Word2Vec
Precision : 0.9550898203592815
Recall : 0.9550898203592815
F1 score : 0.9550898203592815 

for pre-computed Word2Vec
Precision : 0.9655688622754491
Recall : 0.9655688622754491
F1 score : 0.9655688622754491
