# Text Classification #

### 20newsgroups dataset ###
The 20newsgroups dataset contains roughly 20,000 newsgroup documents, partitioned (nearly) evenly across 20 different newsgroups.

In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.ensemble import RandomForestClassifier
import gc
import numpy as np
np.set_printoptions(precision=2)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import pickle

# Fetch the (shuffled) training split and display its category names.
train = fetch_20newsgroups(shuffle=True, subset='train')
train.target_names  # displays all the categories

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [None]:
# Report how many labelled documents the training split holds.
print('Train set size: %s ' % (train.target.size,))

In [None]:
# Show the category of the first training document, then its first ten lines.
first_label = train.target_names[train.target[0]]
print('FIRST TEXT CATEGORY: %s \n\n' % first_label)
print('FIRST TEXT: \n')
print(*train.data[0].split("\n")[:10], sep='\n')

In [None]:
# Display the list of category names of the training split.
train.target_names

In [None]:
# Plot the number of training documents per category.
sns.set_theme(style="whitegrid")

fig, ax = plt.subplots(figsize=(15, 8))

# Map the integer target ids to their newsgroup names so the x axis shows
# readable category labels instead of 0..19; `order` keeps the bars in
# target-id order.
category_names = np.asarray(train.target_names)
sns.countplot(x=category_names[train.target], order=train.target_names, ax=ax)

ax.set_title('Number of texts per category')
ax.set_xlabel('Category')
ax.set_ylabel('Number of texts')

# Rotate and right-align the tick labels so the long newsgroup names do not
# run into each other.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

plt.show()

# 1. Bag of Words  - data representation #

### Vectorization ###

In [None]:
# Toy corpus: three tiny "documents" built from the tokens aaa, bbb and ccc.
corpus = [
    'aaa aaa aaa aaa aaa bbb',
    'bbb bbb bbb bbb bbb bbb',
    'bbb ccc',
]

# Learn the vocabulary first, then encode every document as a row of counts.
vectorizer = CountVectorizer()
vectorizer.fit(corpus)
X = vectorizer.transform(corpus)

# Column order of the count matrix, followed by the dense matrix itself.
print(vectorizer.get_feature_names_out())
X.toarray()

### TF-IDF (TF – term frequency, IDF – inverse document frequency) ###

In [None]:
# Fit the TF-IDF weighting on the toy count matrix, then apply it to that same matrix.
tfidf_transformer = TfidfTransformer().fit(X)
X_train_tfidf = tfidf_transformer.transform(X)
X_train_tfidf.toarray()

# 2. Fitting a model, Pipeline #

In [None]:
# Vectorization: build the document-term count matrix for the training texts.
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(raw_documents=train.data)
X_train_counts.shape

In [None]:
# Converting to TF-IDF: fit the weighting on the training counts, then apply it.
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
X_train_tfidf.shape

In [None]:
# Using DecisionTreeClassifier

# dtc = DecisionTreeClassifier().fit(X_train_tfidf, train.target)


### Pipeline ###

In [None]:
# We can write less code and do all of the above by building a pipeline.
# The step names 'vect', 'tfidf' and 'dtc' are arbitrary labels; they are what
# the 'stepName__paramName' keys of a parameter grid refer to.
# The purpose of the pipeline is to assemble several steps that can be
# cross-validated together while setting different parameters.

pipe_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    # Fixed seed: without it the tree (and therefore the reported accuracy)
    # differs from one run of the notebook to the next.
    ('dtc', DecisionTreeClassifier(random_state=42)),
])

# Now we can fit directly on the original raw texts in train.data
pipe_clf = pipe_clf.fit(train.data, train.target)

In [None]:
# Performance of DecisionTreeClassifier: fraction of test documents labelled correctly.
test = fetch_20newsgroups(shuffle=True, subset='test')
predicted = pipe_clf.predict(test.data)
(predicted == test.target).mean()

# Is the result really bad?

### Grid search ###

In [None]:
# Parameter grid for the search: each key is 'stepName__paramName' and each
# value lists the settings to try for that parameter.
# E.g. vect__ngram_range=[(1, 1), (1, 2)] tries unigrams only and
# unigrams + bigrams, and the search keeps whichever scores best.

parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2)],
    tfidf__use_idf=(True, False),
    # dtc__max_depth=(20, 40),
)

In [None]:
# THE COMMANDS BELOW ARE TIME EXPENSIVE!
#
# n_jobs=-1 means using all cores.
# Perhaps you may need to run "conda install -c anaconda joblib" first.

# Build the search over the pipeline and run it on the training split.
gs_clf = GridSearchCV(
    estimator=pipe_clf,
    param_grid=parameters,
    n_jobs=-1,
    verbose=10,
).fit(train.data, train.target)

print("Best score: %s" % gs_clf.best_score_)
print("Best param: %s" % gs_clf.best_params_)

# 3. NLTK - Natural Language Toolkit #

### Stop words ###

In [None]:
# # Removing stop words with CountVectorizer
# text_clf = Pipeline([
#     ('vect', CountVectorizer(stop_words='english')), 
#     ('tfidf', TfidfTransformer()), 
#     ('clf', DecisionTreeClassifier())
# ])

In [None]:
# !pip install nltk

# Fetch the NLTK resources used below: Snowball stemmer data and the stop-word lists.
for _nltk_resource in ('snowball_data', 'stopwords'):
    nltk.download(_nltk_resource)

# Show the English stop-word list.
print(stopwords.words('english'))

### Stemming ###

In [4]:

# English Snowball stemmer, created with ignore_stopwords=True; the class
# defined further down reads this module-level object.
stemmer = SnowballStemmer(language="english", ignore_stopwords=True)

# Two quick examples of what stemming does to a word.
print('running --> %s' % stemmer.stem("running"))
print('generously --> %s' % stemmer.stem("generously"))

running --> run
generously --> generous


In [None]:
# Use stemming in the vectorization process

class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer variant whose analyzer stems every token it emits.

    The stemming itself is delegated to the module-level ``stemmer`` object,
    which must exist before the analyzer is called.
    """

    def build_analyzer(self):
        """Return a callable that maps a document to its list of stemmed tokens.

        The parent class's analyzer produces the tokens; each one is then
        passed through ``stemmer.stem``.
        """
        base_analyzer = super().build_analyzer()

        def stemmed_tokens(doc):
            return [stemmer.stem(token) for token in base_analyzer(doc)]

        return stemmed_tokens
    
# Stemming vectorizer configured with the built-in English stop-word list.
stemmed_count_vect = StemmedCountVectorizer(stop_words='english')

pipe_stemmed = Pipeline([
    ('vect', stemmed_count_vect),
    ('tfidf', TfidfTransformer()),
    # Fixed seed: otherwise the accuracy printed below changes between runs,
    # which makes it impossible to tell the effect of stemming from noise.
    ('dtc', DecisionTreeClassifier(random_state=42)),
])

# Fit on the raw training texts, then score on the held-out test split.
pipe_stemmed = pipe_stemmed.fit(train.data, train.target)

predicted_stemmed = pipe_stemmed.predict(test.data)

print('Accuracy after stemming: %s' % np.mean(predicted_stemmed == test.target))

In [None]:


# Grid-search decision-tree hyper-parameters on top of the stemmed TF-IDF features.
decission_tree_pipe = Pipeline(
    [
        ("vect", stemmed_count_vect),
        ("tfidf", TfidfTransformer()),
        # Fixed seed so that differences between candidates come from the
        # hyper-parameters rather than from the tree's own randomness.
        ("dtc", DecisionTreeClassifier(random_state=42)),
    ]
)

parameters = {
    "dtc__max_depth": [None, 20, 30, 40, 50],
    "dtc__min_samples_split": (2, 3),
    "dtc__class_weight": (None, "balanced")
}

decision_tree = GridSearchCV(decission_tree_pipe, parameters, n_jobs=-1, verbose=10, scoring='accuracy')

# Run the grid search on the pipeline
decision_tree = decision_tree.fit(train.data, train.target)
print("Best score: %s" % decision_tree.best_score_)
print("Best param: %s" % decision_tree.best_params_)

# Save the fitted search to disk. The context manager guarantees the file is
# flushed and closed even if pickling fails part-way.
# NOTE(review): the file name spelling ("decison") is kept unchanged because
# renaming it would alter the output path other code may rely on.
filename = 'finalized_decison_tree.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(decision_tree, model_file)

In [14]:
def train_model(pipe, params, train, name):
    """Grid-search ``pipe`` over ``params``, report the best result and pickle the search.

    Args:
        pipe: Estimator handed to ``GridSearchCV`` as the model to tune.
        params: Parameter grid handed to ``GridSearchCV``.
        train: Object exposing ``data`` and ``target`` attributes; the search
            is fitted on ``train.data`` / ``train.target``.
        name: Label used in the printed report and in the output file name
            (``finalized_<name>.sav``, written to the current directory).

    Returns:
        The fitted search object, so callers can use it directly instead of
        re-loading the pickle from disk.
    """
    model = GridSearchCV(pipe, params, n_jobs=-1, verbose=10, scoring="accuracy")

    # Run the grid search on the pipeline
    model = model.fit(train.data, train.target)
    print("Model: %s" % name)
    print("Best score: %s" % model.best_score_)
    print("Best param: %s" % model.best_params_)

    # Save the model to disk; the context manager closes the file handle even
    # if pickling raises, instead of leaking it as a bare open() call would.
    filename = "finalized_%s.sav" % name
    with open(filename, "wb") as model_file:
        pickle.dump(model, model_file)

    return model



In [15]:
# Reclaim memory left over from earlier cells before starting the long search.
gc.collect()

random_forest_pipe = Pipeline(
    [
        ("vect", stemmed_count_vect),
        ("tfidf", TfidfTransformer()),
        # Fixed seed so that score differences between candidates reflect the
        # hyper-parameters rather than the forest's own randomness.
        ("rfc", RandomForestClassifier(random_state=42)),
    ]
)

# 2 * 2 * 3 * 3 * 2 = 72 parameter combinations.
parameters = {
    "vect__ngram_range": [(1, 1), (1, 2)],
    "tfidf__use_idf": (True, False),
    "rfc__n_estimators": [100, 200, 300],
    "rfc__max_depth": [None, 20, 30],
    "rfc__min_samples_split": (2, 3),
}

train_model(random_forest_pipe, parameters, train, "random_forest")

Fitting 5 folds for each of 72 candidates, totalling 360 fits


[CV 2/5; 1/72] START rfc__max_depth=None, rfc__min_samples_split=2, rfc__n_estimators=100, tfidf__use_idf=True, vect__ngram_range=(1, 1)
[CV 1/5; 1/72] START rfc__max_depth=None, rfc__min_samples_split=2, rfc__n_estimators=100, tfidf__use_idf=True, vect__ngram_range=(1, 1)
[CV 3/5; 1/72] START rfc__max_depth=None, rfc__min_samples_split=2, rfc__n_estimators=100, tfidf__use_idf=True, vect__ngram_range=(1, 1)
[CV 2/5; 2/72] START rfc__max_depth=None, rfc__min_samples_split=2, rfc__n_estimators=100, tfidf__use_idf=True, vect__ngram_range=(1, 2)
[CV 4/5; 1/72] START rfc__max_depth=None, rfc__min_samples_split=2, rfc__n_estimators=100, tfidf__use_idf=True, vect__ngram_range=(1, 1)
[CV 4/5; 2/72] START rfc__max_depth=None, rfc__min_samples_split=2, rfc__n_estimators=100, tfidf__use_idf=True, vect__ngram_range=(1, 2)
[CV 5/5; 1/72] START rfc__max_depth=None, rfc__min_samples_split=2, rfc__n_estimators=100, tfidf__use_idf=True, vect__ngram_range=(1, 1)
[CV 1/5; 2/72] START rfc__max_depth=None,



[CV 4/5; 9/72] START rfc__max_depth=None, rfc__min_samples_split=2, rfc__n_estimators=300, tfidf__use_idf=True, vect__ngram_range=(1, 1)
[CV 1/5; 6/72] END rfc__max_depth=None, rfc__min_samples_split=2, rfc__n_estimators=200, tfidf__use_idf=True, vect__ngram_range=(1, 2);, score=0.873 total time=16.1min
[CV 5/5; 9/72] START rfc__max_depth=None, rfc__min_samples_split=2, rfc__n_estimators=300, tfidf__use_idf=True, vect__ngram_range=(1, 1)
[CV 2/5; 6/72] END rfc__max_depth=None, rfc__min_samples_split=2, rfc__n_estimators=200, tfidf__use_idf=True, vect__ngram_range=(1, 2);, score=0.872 total time=18.4min
[CV 1/5; 10/72] START rfc__max_depth=None, rfc__min_samples_split=2, rfc__n_estimators=300, tfidf__use_idf=True, vect__ngram_range=(1, 2)
[CV 3/5; 6/72] END rfc__max_depth=None, rfc__min_samples_split=2, rfc__n_estimators=200, tfidf__use_idf=True, vect__ngram_range=(1, 2);, score=0.882 total time=18.4min
[CV 2/5; 10/72] START rfc__max_depth=None, rfc__min_samples_split=2, rfc__n_estimato

In [17]:
# Load the fitted random-forest search from disk.
# NOTE: pickle can execute arbitrary code while loading, so only unpickle
# files produced by this notebook. The context manager closes the file handle
# instead of leaking it.
with open("finalized_random_forest.sav", 'rb') as model_file:
    loaded_model = pickle.load(model_file)

test = fetch_20newsgroups(subset='test', shuffle=True)

# Performance of RandomForestClassifier
predicted = loaded_model.predict(test.data)
np.mean(predicted == test.target)

np.float64(0.8122676579925651)

In [None]:
# from sklearn.neighbors import KNeighborsClassifier

# gc.collect()

# k_nearest_neighbours_pipe = Pipeline(
#     [
#         ("vect", stemmed_count_vect),
#         ("tfidf", TfidfTransformer()),
#         ("knn", KNeighborsClassifier()),
#     ]
# )

# parameters = {
#     "knn__n_neighbors": range(1, 10),
#     "knn__weights": ["uniform", "distance"],
# }

# train_model(k_nearest_neighbours_pipe, parameters, train, "k_nearest_neighbours")

In [None]:
# from sklearn.linear_model import LogisticRegression

# gc.collect()

# logistic_regression_pipe = Pipeline(
#     [
#         ("vect", stemmed_count_vect),
#         ("tfidf", TfidfTransformer()),
#         ("lr", LogisticRegression()),
#     ]
# )

# parameters = {
#     "lr__C": [0.001, 0.01, 0.1, 1, 10, 100],
#     "lr__penalty": ["l1", "l2"],
# }

# train_model(logistic_regression_pipe, parameters, train, "logistic_regression")

In [None]:
# from keras.models import Sequential
# from keras.layers import Dense, Dropout
# import keras

# gc.collect()

# callback = keras.callbacks.EarlyStopping(monitor='loss', patience=2)

# def create_model():
#     model = Sequential()
#     model.add(Dense(512, activation="relu"))
#     model.add(Dropout(0.5))
#     model.add(Dense(512, activation="relu"))
#     model.add(Dropout(0.5))
#     model.add(Dense(20, activation="softmax"))
#     model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
#     return model

# stemmed_count_vect = StemmedCountVectorizer(stop_words="english")

# train_stemed = stemmed_count_vect.fit_transform(train.data)
# train_tf = TfidfTransformer(use_idf=False).fit_transform(train_stemed)


# # train with kfolds cross validation
# from sklearn.model_selection import KFold
# from sklearn.metrics import accuracy_score

# kf = KFold(n_splits=5, shuffle=True)
# kf.get_n_splits(train_tf)

# acc_per_fold = []
# loss_per_fold = []

# fold_no = 1
# for train_index, test_index in kf.split(train_tf):
#     model = create_model()

#     X_train, X_test = train_tf[train_index], train_tf[test_index]
#     y_train, y_test = train.target[train_index], train.target[test_index]

#     history = model.fit(X_train.toarray(), y_train, epochs=10, batch_size=128, validation_data=(X_test.toarray(), y_test), callbacks=[callback])

#     scores = model.evaluate(X_test.toarray(), y_test, verbose=0)

#     print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
#     acc_per_fold.append(scores[1] * 100)
#     loss_per_fold.append(scores[0])

#     fold_no = fold_no + 1

# print("Average scores for all folds:")
# print(f"> Accuracy: {np.mean(acc_per_fold)} (+- {np.std(acc_per_fold)})")
# print(f"> Loss: {np.mean(loss_per_fold)}")

I0000 00:00:1734197260.303769   29653 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 6063 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 with Max-Q Design, pci bus id: 0000:01:00.0, compute capability: 7.5


Epoch 1/10


I0000 00:00:1734197286.039957   29891 service.cc:148] XLA service 0x7fc928003a80 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1734197286.041396   29891 service.cc:156]   StreamExecutor device (0): NVIDIA GeForce RTX 2080 with Max-Q Design, Compute Capability 7.5
I0000 00:00:1734197286.816970   29891 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m 3/71[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m3s[0m 53ms/step - accuracy: 0.0417 - loss: 2.9956

I0000 00:00:1734197290.090497   29891 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 158ms/step - accuracy: 0.2893 - loss: 2.7245 - val_accuracy: 0.8383 - val_loss: 0.9052
Epoch 2/10
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 57ms/step - accuracy: 0.8737 - loss: 0.5905 - val_accuracy: 0.9032 - val_loss: 0.3818
Epoch 3/10
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 54ms/step - accuracy: 0.9727 - loss: 0.1319 - val_accuracy: 0.9134 - val_loss: 0.3385
Epoch 4/10
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 53ms/step - accuracy: 0.9954 - loss: 0.0433 - val_accuracy: 0.9165 - val_loss: 0.3225
Epoch 5/10
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 53ms/step - accuracy: 0.9979 - loss: 0.0189 - val_accuracy: 0.9178 - val_loss: 0.3250
Epoch 6/10
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 56ms/step - accuracy: 0.9986 - loss: 0.0125 - val_accuracy: 0.9183 - val_loss: 0.3280
Epoch 7/10
[1m71/71[0m [32m━━━━━━━━━━━━━

InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:CPU:0 to /job:localhost/replica:0/task:0/device:GPU:0 in order to run _EagerConst: Dst tensor is not initialized.