<a href="https://colab.research.google.com/github/Sara-Esm/NLP/blob/main/Lab_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Lab 1: Text Classification using Bag-of-Words (BoW)



In [None]:
#!pip install pandas
#!pip install scikit-learn

**Objective**: In this lab, you will apply BoW to build a simple text classification model. You will use the emotions dataset, in which each row contains a sentence and a label giving the emotion it expresses. You will first experiment with the frequency count method and then move on to TF-IDF to see whether there is any difference in performance.

In [None]:
import os

# Quieten TensorFlow's native (C++) logging through its environment variable.
# NOTE(review): level "2" is understood to hide INFO and WARNING messages
# (errors only) rather than setting the level to "warn"; TensorFlow is not
# imported in the cells shown here, so confirm this setting is still needed.
os.environ.update(TF_CPP_MIN_LOG_LEVEL="2")

In [None]:
import pandas as pd

# Read the emotions dataset: each line is a "<sentence>;<emotion>" record.
# Column names are supplied through `names`, so the first line of the file is
# treated as data rather than as a header.
dataset = pd.read_csv(
    "Datasets/emotions_dataset.txt",
    sep=";",
    names=["Description", "Emotion"],
)

In [None]:
# Preview the loaded frame (the notebook display elides the middle rows)
dataset

Unnamed: 0,Description,Emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger
...,...,...
15995,i just had a very brief time in the beanbag an...,sadness
15996,i am now turning and i feel pathetic that i am...,sadness
15997,i feel strong and good overall,joy
15998,i feel like this was such a rude comment and i...,anger


# Create vocabulary first with Frequency Count

In [None]:
# Let's create a frequency count vocabulary first
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary from every description and encode each description as a
# row of term counts. X comes back as a sparse matrix, which is why later cells
# call .toarray() before displaying it.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(dataset["Description"])
# Show the terms that make up the learned vocabulary
vectorizer.get_feature_names_out()

array(['aa', 'aaaaaaand', 'aaaaand', ..., 'zum', 'zumba', 'zz'],
      dtype=object)

In [None]:
# Report the size of the vocabulary learned by the vectorizer
print(f"There are: {len(vectorizer.get_feature_names_out())} unique words in the dataset.")

There are: 15186 unique words in the dataset.


In [None]:
# We can see the vocabulary with the frequency count: one row per description,
# one column per vocabulary term, each cell holding how often the term occurs.
# The frame is built directly from the sparse matrix; X.toarray() would first
# allocate the full dense documents x vocabulary array (almost entirely zeros)
# only to display a handful of rows.
pd.DataFrame.sparse.from_spmatrix(X, columns=vectorizer.get_feature_names_out())

Unnamed: 0,aa,aaaaaaand,aaaaand,aaaand,aac,aahhh,aaron,ab,abandon,abandoned,...,zonisamide,zoo,zoom,zooming,zooms,zq,zucchini,zum,zumba,zz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# The first row in the dataset contains the word "humiliated", so its count for
# that row is 1. Only this one column is converted to a dense array (its column
# index is looked up in the vectorizer's term -> index mapping) instead of
# densifying the whole matrix just to read a single column.
pd.Series(
    X[:, [vectorizer.vocabulary_["humiliated"]]].toarray().ravel(),
    name="humiliated",
)

0        1
1        0
2        0
3        0
4        0
        ..
15995    0
15996    0
15997    0
15998    0
15999    0
Name: humiliated, Length: 16000, dtype: int64

In [None]:
# X itself is the sparse matrix holding all the features; toarray() converts it
# to a dense NumPy array so the values can be displayed
X.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
# y is the target variable (i.e., the emotion label the model should predict);
# the features in X are the predictors
y = dataset["Emotion"]

In [None]:
# Lets train a KNN model to see how the frequency count performs
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# k-nearest-neighbours classifier configured to consult the 3 nearest training samples
model = KNeighborsClassifier(n_neighbors=3)

In [None]:
# KNN is a lazy learner, so fit() essentially just stores the training data.
# NOTE(review): that does not remove the need for a held-out split. The model is
# fitted on all of X here and scored on the same X in the next cell, so that
# score is training accuracy, not an estimate of performance on unseen text.
model.fit(X, y)

In [None]:
# Accuracy measured on the same data the model was fitted on (training accuracy),
# so it is an optimistic figure for the frequency-count features
model.score(X, y)

0.653625

# Let's try TF-IDF to see if there is any difference

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Re-encode the descriptions with TF-IDF weights instead of raw counts.
# NOTE: this rebinds `vectorizer` and `X`, replacing the frequency-count
# versions created earlier; re-running earlier cells after this one would use
# the TF-IDF objects.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(dataset["Description"])
# Show the terms that make up the learned vocabulary
vectorizer.get_feature_names_out()

array(['aa', 'aaaaaaand', 'aaaaand', ..., 'zum', 'zumba', 'zz'],
      dtype=object)

In [None]:
# Report the size of the vocabulary learned by the TF-IDF vectorizer
print(f"There are: {len(vectorizer.get_feature_names_out())} unique words in the dataset.")

There are: 15186 unique words in the dataset.


In [None]:
# We can see that with TF-IDF the vocabulary columns hold float weights rather
# than integer counts.
# The frame is built directly from the sparse matrix; X.toarray() would first
# allocate the full dense documents x vocabulary array (almost entirely zeros)
# only to display a handful of rows.
pd.DataFrame.sparse.from_spmatrix(X, columns=vectorizer.get_feature_names_out())

Unnamed: 0,aa,aaaaaaand,aaaaand,aaaand,aac,aahhh,aaron,ab,abandon,abandoned,...,zonisamide,zoo,zoom,zooming,zooms,zq,zucchini,zum,zumba,zz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# The first row in the dataset contains the word "humiliated", so that row has a
# non-zero TF-IDF weight for it. Only this one column is converted to a dense
# array (its column index is looked up in the vectorizer's term -> index
# mapping) instead of densifying the whole matrix just to read a single column.
pd.Series(
    X[:, [vectorizer.vocabulary_["humiliated"]]].toarray().ravel(),
    name="humiliated",
)

0        0.786766
1        0.000000
2        0.000000
3        0.000000
4        0.000000
           ...   
15995    0.000000
15996    0.000000
15997    0.000000
15998    0.000000
15999    0.000000
Name: humiliated, Length: 16000, dtype: float64

In [None]:
# X itself is the sparse matrix holding all the TF-IDF features; toarray()
# converts it to a dense NumPy array so the values can be displayed
X.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
# Lets train a KNN model again to see if it performs better using TF-IDF
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Fresh 3-nearest-neighbours classifier for the TF-IDF features (rebinds `model`,
# replacing the classifier fitted on the frequency counts)
model = KNeighborsClassifier(n_neighbors=3)

In [None]:
# KNN is a lazy learner, so fit() essentially just stores the training data.
# NOTE(review): that does not remove the need for a held-out split. The model is
# fitted on all of X here and scored on the same X in the next cell, so that
# score is training accuracy, not an estimate of performance on unseen text.
model.fit(X, y)

In [None]:
# Compare the accuracy score of the TF-IDF trained model with the frequency count one.
# Like the earlier score, this is measured on the data the model was fitted on
# (training accuracy).
model.score(X, y)

Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7f63a90411c0>
Traceback (most recent call last):
  File "/home/student/anaconda3/envs/crs1293/lib/python3.11/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/home/student/anaconda3/envs/crs1293/lib/python3.11/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/student/anaconda3/envs/crs1293/lib/python3.11/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
                   ^^^^^^^^^^^^^^^^^^
  File "/home/student/anaconda3/envs/crs1293/lib/python3.11/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
             ^^^^^^^^

0.83325