In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.naive_bayes import GaussianNB

In [2]:
dbpedia_df = pd.read_csv('data/dbpedia_csv/train.csv',
                            skiprows=1, names=['Label', 'Name', 'Text'])

In [3]:
dbpedia_df.head()

Unnamed: 0,Label,Name,Text
0,1,Schwan-Stabilo,Schwan-STABILO is a German maker of pens for ...
1,1,Q-workshop,Q-workshop is a Polish company located in Poz...
2,1,Marvell Software Solutions Israel,Marvell Software Solutions Israel known as RA...
3,1,Bergan Mercy Medical Center,Bergan Mercy Medical Center is a hospital loc...
4,1,The Unsigned Guide,The Unsigned Guide is an online contacts dire...


In [4]:
dbpedia_df.shape

(559999, 3)

In [5]:
dbpedia_df.Label.value_counts()

2     40000
3     40000
4     40000
5     40000
6     40000
7     40000
8     40000
9     40000
10    40000
11    40000
12    40000
13    40000
14    40000
1     39999
Name: Label, dtype: int64

In [6]:
dbpedia_df = dbpedia_df.sample(10000, replace=False)

In [7]:
dbpedia_df.shape

(10000, 3)

In [8]:
dbpedia_df.Label.value_counts()

7     762
14    738
5     726
2     724
3     721
6     721
8     721
4     714
10    711
9     709
11    707
12    695
13    676
1     675
Name: Label, dtype: int64

In [9]:
X = dbpedia_df.Text
y = dbpedia_df.Label

In [22]:
def summarize_classification(y_test, y_pred):

    acc = accuracy_score(y_test, y_pred, normalize=True)
    num_acc = accuracy_score(y_test, y_pred, normalize=False)
    prec = precision_score(y_test, y_pred, average='weighted')
    rec = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    print(f'Length of test data: {len(y_test)}')
    print(f'Accuracy Count: {num_acc}')
    print(f'Accuracy Score: {acc}')
    print(f'Precision Score: {prec}')
    print(f'Recall Score: {rec}')
    print(f'F1 Score: {f1}')

In [34]:
print('*' * 50)
print('HashingVectorizer')
print('*' * 50)
vectorizer = HashingVectorizer(n_features=2**10, norm='l2')
feature_vector = vectorizer.fit_transform(X)
print(feature_vector.shape)
X_dense = np.asarray(feature_vector.todense())
X_train, X_test, y_train, y_test = train_test_split(X_dense, y, test_size=0.2, random_state=42)
clf = GaussianNB().fit(X_train, y_train)
y_pred = clf.predict(X_test)
summarize_classification(y_test, y_pred)

**************************************************
HashingVectorizer
**************************************************
(10000, 1024)
Length of test data: 2000
Accuracy Count: 1134
Accuracy Score: 0.567
Precision Score: 0.5831351495696112
Recall Score: 0.567
F1 Score: 0.5702086443598284


In [16]:
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer('english')
analyzer = HashingVectorizer().build_analyzer()

def stemmed_words(doc):
    return (stemmer.stem(w) for w in analyzer(doc))

In [17]:
stem_vectorizer = HashingVectorizer(analyzer=stemmed_words, n_features=2**10, norm='l2')
feature_vector = stem_vectorizer.fit_transform(X)
feature_vector.shape

(10000, 1024)

In [35]:
print('*' * 50)
print('Stemmed HashingVectorizer')
print('*' * 50)
X_dense = np.asarray(feature_vector.todense())
X_train, X_test, y_train, y_test = train_test_split(X_dense, y, test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
clf = GaussianNB().fit(X_train, y_train)
y_pred = clf.predict(X_test)
summarize_classification(y_test, y_pred)

**************************************************
Stemmed HashingVectorizer
**************************************************
(8000, 1024) (2000, 1024) (8000,) (2000,)
Length of test data: 2000
Accuracy Count: 1134
Accuracy Score: 0.567
Precision Score: 0.5831351495696112
Recall Score: 0.567
F1 Score: 0.5702086443598284


In [36]:
from sklearn.feature_extraction.text import CountVectorizer

print('*' * 50)
print('CountVectorizer')
print('*' * 50)
count_vectorizer = CountVectorizer(max_features=2000)
feature_vector = count_vectorizer.fit_transform(X)
print(f'Shape of feature vector: {feature_vector.shape}')
X_dense = feature_vector.toarray()
X_train, X_test, y_train, y_test = train_test_split(X_dense, y, test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
clf = GaussianNB().fit(X_train, y_train)
y_pred = clf.predict(X_test)
summarize_classification(y_test, y_pred)

**************************************************
CountVectorizer
**************************************************
Shape of feature vector: (10000, 2000)
(8000, 2000) (2000, 2000) (8000,) (2000,)
Length of test data: 2000
Accuracy Count: 1465
Accuracy Score: 0.7325
Precision Score: 0.7328213367424521
Recall Score: 0.7325
F1 Score: 0.72987350840779


In [38]:
print('*' * 50)
print('CountVectorizer with Stop Words Removed')
print('*' * 50)
count_vectorizer = CountVectorizer(max_features=2000, stop_words='english')
feature_vector = count_vectorizer.fit_transform(X)
print(f'Shape of feature vector: {feature_vector.shape}')
X_dense = feature_vector.toarray()
X_train, X_test, y_train, y_test = train_test_split(X_dense, y, test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
clf = GaussianNB().fit(X_train, y_train)
y_pred = clf.predict(X_test)
summarize_classification(y_test, y_pred)

**************************************************
CountVectorizer with Stop Words Removed
**************************************************
Shape of feature vector: (10000, 2000)
(8000, 2000) (2000, 2000) (8000,) (2000,)
Length of test data: 2000
Accuracy Count: 1466
Accuracy Score: 0.733
Precision Score: 0.7309370930227532
Recall Score: 0.733
F1 Score: 0.7300280347618526


In [40]:
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from sklearn.feature_extraction import text

print('*' * 50)
print('CountVectorizer with Frequency Threshold')
print('*' * 50)
tokens = word_tokenize('\n'.join(X.values))
print(f'Number of tokens: {len(tokens)}')
fdist = FreqDist(tokens)
freq_words = []
for word, freq in fdist.items():
    if freq >= 100:
        freq_words.append(word.lower())
print(f'Number of frequent words: {len(freq_words)}')
stop_words = text.ENGLISH_STOP_WORDS.union(freq_words)

count_vectorizer = CountVectorizer(max_features=2000, stop_words=stop_words)
feature_vector = count_vectorizer.fit_transform(X)
print(f'Shape of feature vector: {feature_vector.shape}')
X_dense = feature_vector.toarray()
X_train, X_test, y_train, y_test = train_test_split(X_dense, y, test_size=0.2, random_state=42)
clf = GaussianNB().fit(X_train, y_train)
y_pred = clf.predict(X_test)
summarize_classification(y_test, y_pred)

**************************************************
CountVectorizer with Frequency Threshold
**************************************************
Number of tokens: 508124
Number of frequent words: 503


  % sorted(inconsistent)


Shape of feature vector: (10000, 2000)
Length of test data: 2000
Accuracy Count: 1306
Accuracy Score: 0.653
Precision Score: 0.6540085429885271
Recall Score: 0.653
F1 Score: 0.6454378335760009


In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer
print('*' * 50)
print('TfidfVectorizer with Stop Words Removed')
print('*' * 50)
tfidf_vectorizer = TfidfVectorizer(max_features=2000, stop_words='english')
feature_vector = tfidf_vectorizer.fit_transform(X)
print(f'Shape of feature vector: {feature_vector.shape}')
X_dense = feature_vector.toarray()
X_train, X_test, y_train, y_test = train_test_split(X_dense, y, test_size=0.2, random_state=42)
clf = GaussianNB().fit(X_train, y_train)
y_pred = clf.predict(X_test)
summarize_classification(y_test, y_pred)

**************************************************
TfidfVectorizer with Stop Words Removed
**************************************************
Shape of feature vector: (10000, 2000)
Length of test data: 2000
Accuracy Count: 1464
Accuracy Score: 0.732
Precision Score: 0.7369835482836727
Recall Score: 0.732
F1 Score: 0.7328047604979636


In [44]:
print('*' * 50)
print('Bag of n-grams vectorizer')
print('*' * 50)
count_vectorizer = CountVectorizer(ngram_range=(2, 2), max_features=2000)
feature_vector = count_vectorizer.fit_transform(X)
print(f'Shape of feature vector: {feature_vector.shape}')
X_dense = feature_vector.toarray()
X_train, X_test, y_train, y_test = train_test_split(X_dense, y, test_size=0.2, random_state=42)
clf = GaussianNB().fit(X_train, y_train)
y_pred = clf.predict(X_test)
summarize_classification(y_test, y_pred)

**************************************************
Bag of n-grams vectorizer
**************************************************
Shape of feature vector: (10000, 2000)
Length of test data: 2000
Accuracy Count: 1584
Accuracy Score: 0.792
Precision Score: 0.7951879407472687
Recall Score: 0.792
F1 Score: 0.7861216325000008


In [45]:
print('*' * 50)
print('Bag of n-grams vectorizer with Stop Words Removed')
print('*' * 50)
count_vectorizer = CountVectorizer(ngram_range=(2, 2), max_features=2000, stop_words='english')
feature_vector = count_vectorizer.fit_transform(X)
print(f'Shape of feature vector: {feature_vector.shape}')
X_dense = feature_vector.toarray()
X_train, X_test, y_train, y_test = train_test_split(X_dense, y, test_size=0.2, random_state=42)
clf = GaussianNB().fit(X_train, y_train)
y_pred = clf.predict(X_test)
summarize_classification(y_test, y_pred)

**************************************************
Bag of n-grams vectorizer with Stop Words Removed
**************************************************
Shape of feature vector: (10000, 2000)
Length of test data: 2000
Accuracy Count: 1414
Accuracy Score: 0.707
Precision Score: 0.7910931651237748
Recall Score: 0.707
F1 Score: 0.7170837940139252
