# Step 1: Install scikit-learn


In [1]:
!pip install scikit-learn




# Step 2: Import Libraries


In [2]:
from sklearn.feature_extraction.text import CountVectorizer


# Step 3: Sample Text Data


In [3]:
# Toy corpus: three short sentences; each string is treated as one document
# when the list is passed to the vectorizer.
documents = [
    "Bag of Words is a common technique in natural language processing.",
    "It represents text data as a numerical feature vector.",
    "Scikit-learn provides a convenient CountVectorizer for implementing Bag of Words."
]


# Step 4: Create a Bag of Words Model


In [4]:
# Create a CountVectorizer instance with its default settings
vectorizer = CountVectorizer()

# Learn the vocabulary from `documents` and build the document-term count
# matrix in one step (one row per document, one column per vocabulary word)
X = vectorizer.fit_transform(documents)

# Convert the result to a dense array so it prints readably;
# this is fine for a tiny corpus like this one
bow_array = X.toarray()


# Step 5: Explore the Bag of Words Representation


In [5]:
# Vocabulary learned by the vectorizer: these words label the matrix columns
feature_names = vectorizer.get_feature_names_out()
print("Feature Names (Words):", feature_names)

# Heading and count matrix in a single call; sep="\n" puts the matrix on the
# line after the heading, exactly as two separate prints would
print("\nBag of Words Matrix:", bow_array, sep="\n")


Feature Names (Words): ['as' 'bag' 'common' 'convenient' 'countvectorizer' 'data' 'feature' 'for'
 'implementing' 'in' 'is' 'it' 'language' 'learn' 'natural' 'numerical'
 'of' 'processing' 'provides' 'represents' 'scikit' 'technique' 'text'
 'vector' 'words']

Bag of Words Matrix:
[[0 1 1 0 0 0 0 0 0 1 1 0 1 0 1 0 1 1 0 0 0 1 0 0 1]
 [1 0 0 0 0 1 1 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 1 1 0]
 [0 1 0 1 1 0 0 1 1 0 0 0 0 1 0 0 1 0 1 0 1 0 0 0 1]]


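This code demonstrates how to create a Bag of Words model using scikit-learn's CountVectorizer. In the resulting matrix (bow_array), each row corresponds to one document and each column to one word in the vocabulary; an entry is the number of times that word occurs in that document. Note that CountVectorizer lowercases the text and, by default, keeps only tokens of two or more alphanumeric characters, which is why "a" is absent from the feature names and "Scikit-learn" is split into "scikit" and "learn".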

You can use this Bag of Words representation as input to various machine learning models for text classification, clustering, or other natural language processing tasks.

Feel free to replace the sample text data with your own dataset and explore the Bag of Words representation for your specific use case.

In [10]:
# Imports for this cell, grouped at the top so its dependencies are visible
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Sample documents for the TF-IDF example.
# Note: this rebinds `documents`, replacing the Bag of Words corpus defined earlier.
documents = [
    "Natural language processing is a subfield of artificial intelligence.",
    "Topic modeling is a technique to identify topics present in a text corpus.",
    "SpaCy is a popular Python library for natural language processing.",
    "Latent Dirichlet Allocation is a probabilistic model for topic modeling.",
    "Machine learning algorithms are used for various natural language processing tasks.",
]

# Create a TF-IDF vectorizer with default settings. Unlike raw counts, TF-IDF
# down-weights words that occur in many documents.
tfidf_vectorizer = TfidfVectorizer()

# Learn the vocabulary and weights, and transform the documents in one step
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Column labels (words) and a dense copy of the matrix.
# Note: this rebinds `feature_names` to the TF-IDF vocabulary.
feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_matrix_array = tfidf_matrix.toarray()

# Wrap the array in a DataFrame so every column is labelled with its word;
# this is easier to read than printing the raw array and feature names.
df_tfidf = pd.DataFrame(tfidf_matrix_array, columns=feature_names)

# Display the TF-IDF DataFrame as plain text
print(df_tfidf)

   algorithms  allocation       are  artificial    corpus  dirichlet  \
0    0.000000    0.000000  0.000000    0.420222  0.000000   0.000000   
1    0.000000    0.000000  0.000000    0.000000  0.322426   0.000000   
2    0.000000    0.000000  0.000000    0.000000  0.000000   0.000000   
3    0.000000    0.376149  0.000000    0.000000  0.000000   0.376149   
4    0.337214    0.000000  0.337214    0.000000  0.000000   0.000000   

        for  identify        in  intelligence  ...     spacy  subfield  \
0  0.000000  0.000000  0.000000      0.420222  ...  0.000000  0.420222   
1  0.000000  0.322426  0.322426      0.000000  ...  0.000000  0.000000   
2  0.270904  0.000000  0.000000      0.000000  ...  0.404509  0.000000   
3  0.251911  0.000000  0.000000      0.000000  ...  0.000000  0.000000   
4  0.225836  0.000000  0.000000      0.000000  ...  0.000000  0.000000   

      tasks  technique      text        to     topic    topics      used  \
0  0.000000   0.000000  0.000000  0.000000  0.

In [9]:
# Bare last expression: Jupyter renders the DataFrame as a formatted table
df_tfidf

Unnamed: 0,algorithms,allocation,are,artificial,corpus,dirichlet,for,identify,in,intelligence,...,spacy,subfield,tasks,technique,text,to,topic,topics,used,various
0,0.0,0.0,0.0,0.420222,0.0,0.0,0.0,0.0,0.0,0.420222,...,0.0,0.420222,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.322426,0.0,0.0,0.322426,0.322426,0.0,...,0.0,0.0,0.0,0.322426,0.322426,0.322426,0.260131,0.322426,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.270904,0.0,0.0,0.0,...,0.404509,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.376149,0.0,0.0,0.0,0.376149,0.251911,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.303474,0.0,0.0,0.0
4,0.337214,0.0,0.337214,0.0,0.0,0.0,0.225836,0.0,0.0,0.0,...,0.0,0.0,0.337214,0.0,0.0,0.0,0.0,0.0,0.337214,0.337214
