# Basic Text Classification

In [1]:
import os
import numpy as np
import warnings

warnings.filterwarnings('ignore')

In [2]:
train_data = r"C:\Users\prate\Downloads\1746559-1587322-1552868-1449301-r8-train-all-terms_(2)\1552868-1449301-r8-train-all-terms.txt"
test_data = r"C:\Users\prate\Downloads\1746561-1587323-1552872-1449303-r8-test-all-terms_(1)_(2)\1552872-1449303-r8-test-all-terms_(1).txt"

### Reading the Training Data

In [3]:
X, y = [], []

with open(train_data, "r") as f:
    for line in f:
        label, text = line.split('\t')
        X.append(text.split())
        y.append(label)
    X, y = np.array(X), np.array(y)

In [4]:
y

array(['earn', 'acq', 'earn', ..., 'earn', 'money-fx', 'ship'],
      dtype='<U8')

In [5]:
len(X),len(y)

(5485, 5485)

### Reading the Test Data

In [6]:
X_test, y_test = [], []

with open(test_data, 'r') as f:
    for line in f:
        label, text = line.split('\t')
        X_test.append(text.split())
        y_test.append(label)
    X_test, y_test = np.array(X_test), np.array(y_test)

In [7]:
len(X_test), len(y_test)

(2189, 2189)

In [8]:
np.unique(y, return_counts=True)

(array(['acq', 'crude', 'earn', 'grain', 'interest', 'money-fx', 'ship',
        'trade'], dtype='<U8'),
 array([1596,  253, 2840,   41,  190,  206,  108,  251], dtype=int64))

In [9]:
X_text = [" ".join(val) for val in X]
print(X_text[0]+ "\n")

X_test_text = [' '.join(val) for val in X_test]
print(X_test_text[0]+ "\n")

champion products ch approves stock split champion products inc said its board of directors approved a two for one stock split of its common shares for shareholders of record as of april the company also said its board voted to recommend to shareholders at the annual meeting april an increase in the authorized capital stock from five mln to mln shares reuter




## Using Count Vectorizer

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

In [11]:
vect = CountVectorizer(stop_words='english',max_features=5000)
vect.fit(X_text)

In [12]:
X_train_transformed = vect.transform(X_text)
X_test_transformed = vect.transform(X_test_text)

In [13]:
list(vect.vocabulary_.items())[:10]

[('champion', 759),
 ('products', 3488),
 ('approves', 264),
 ('stock', 4337),
 ('split', 4273),
 ('said', 3974),
 ('board', 532),
 ('directors', 1327),
 ('approved', 263),
 ('common', 895)]

In [14]:
len(vect.vocabulary_)

5000

### Using NB Methods

#### Using Bernoulli NB 

In [15]:
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import BernoulliNB

bnb = BernoulliNB()

In [16]:
bnb.fit(X_train_transformed, y)

pred_train_ys = bnb.predict(X_train_transformed)
pred_test_ys = bnb.predict(X_test_transformed)

print('Train Accuracy', accuracy_score(y, pred_train_ys))
print('Test Accuracy', accuracy_score(y_test, pred_test_ys))

Train Accuracy 0.8736554238833182
Test Accuracy 0.8688899040657835


#### Using Multinomial NB

In [17]:
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB()

In [18]:
mnb.fit(X_train_transformed, y)

pred_train_ys = mnb.predict(X_train_transformed)
pred_test_ys = mnb.predict(X_test_transformed)

print('Train Accuracy', accuracy_score(y, pred_train_ys))
print('Test Accuracy', accuracy_score(y_test, pred_test_ys))

Train Accuracy 0.968094804010939
Test Accuracy 0.9657377798081316


## Using Word Embeddings

### Using Pre-Trained Model : GLOVE

In [19]:
import gensim
from gensim.scripts.glove2word2vec import glove2word2vec

glove_input_file = r"D:\DataScience\NLP\Gensim Dictionary\glove.6B\glove.6B.200d.txt"
word2vec_output_file = 'glove.6B.200d.w2vformat.txt'
glove2word2vec(glove_input_file, word2vec_output_file)

(400000, 200)

In [20]:
from gensim.models.keyedvectors import KeyedVectors

glove_model = KeyedVectors.load_word2vec_format('glove.6B.200d.w2vformat.txt',
                                                binary=False)

In [21]:
import nltk
from nltk.corpus import stopwords

stop_words = stopwords.words('english')

In [22]:
def sent_vec(sent):
    wv_res = np.zeros(glove_model.vector_size)
    ctr = 1
    for w in sent:
        if w in glove_model:
            ctr += 1
            wv_res += glove_model[w]
        wv_res = wv_res / ctr
        return wv_res

In [23]:
train_doc_vecs = []
for doc in X:
    doc_words = [term for term in doc if term not in stop_words]
    train_doc_vecs.append(sent_vec(doc_words))

In [24]:
test_doc_vecs = []
for doc in X_test:
    doc_words = [term for term in doc if term not in stop_words]
    test_doc_vecs.append(sent_vec(doc_words))

#### Using Logistic Regression Model

In [25]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(solver='liblinear',
                            penalty="l1",
                            random_state=42,
                            C=3.5)

In [26]:
logreg.fit(X_train_transformed, y)

pred_train_ys = logreg.predict(X_train_transformed)
pred_test_ys = logreg.predict(X_test_transformed)

print('Train Accuracy', accuracy_score(y, pred_train_ys))
print('Test Accuracy', accuracy_score(y_test, pred_test_ys))

Train Accuracy 0.9985414767547858
Test Accuracy 0.9634536317953404
