In [6]:
import re
from sklearn.feature_extraction.text import CountVectorizer

file_names = ['ai1.txt', 'ai2.txt', 'ai3.txt']


def _normalise(raw_text):
    """Return ``raw_text`` with only ASCII letters and whitespace kept,
    whitespace runs collapsed to one space, and everything lowercased."""
    letters_only = re.sub(r'[^A-Za-z\s]', '', raw_text)  # drop digits, punctuation, symbols
    single_spaced = re.sub(r'\s+', ' ', letters_only)    # any whitespace run -> one space
    return single_spaced.lower()


# Step 1: load every input file and store its normalised text
documents = []
for path in file_names:
    with open(path, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()
    documents.append(_normalise(raw_text))

# Step 2: binary bag-of-words (binary=True records presence, not counts)
vectorizer = CountVectorizer(binary=True)
X = vectorizer.fit_transform(documents)

# Step 3: show the learned vocabulary, then the dense document-term matrix
print("Feature names (vocabulary):")
print(vectorizer.get_feature_names_out())

print("\nOne-hot encoded matrix:")
print(X.toarray())


Feature names (vocabulary):
['about' 'across' 'act' 'actions' 'acts' 'advanced' 'after' 'again'
 'alliance' 'alliances' 'also' 'among' 'an' 'and' 'announced' 'archery'
 'are' 'arena' 'as' 'audience' 'balance' 'be' 'become' 'becomes' 'before'
 'begin' 'begins' 'berries' 'best' 'both' 'boy' 'bravery' 'but' 'by'
 'called' 'can' 'capitol' 'captivated' 'cave' 'change' 'choose' 'chosen'
 'compassion' 'compete' 'complicated' 'confused' 'consists' 'continues'
 'contrast' 'control' 'convince' 'dangers' 'deadlier' 'death' 'declare'
 'declares' 'defiance' 'defies' 'destroyed' 'determination' 'district'
 'districts' 'during' 'dystopian' 'each' 'eat' 'entering' 'even' 'event'
 'everdeen' 'every' 'exists' 'expects' 'face' 'feelings' 'female' 'fight'
 'finnick' 'fireballs' 'firsthand' 'for' 'forced' 'forces' 'former'
 'forms' 'friend' 'from' 'gain' 'gale' 'gamemakers' 'games' 'girl' 'give'
 'growing' 'guilt' 'has' 'he' 'helps' 'her' 'hide' 'his' 'home' 'honors'
 'hope' 'horizon' 'hosts' 'however' 'hu

In [10]:
import re
from sklearn.feature_extraction.text import CountVectorizer

file_names = ['ai1.txt', 'ai2.txt', 'ai3.txt']
documents = []

# Step 1: read each file and normalise its text
for file in file_names:
    with open(file, 'r', encoding='utf-8') as f:
        text = f.read()
        text = re.sub(r'[^A-Za-z\s]', '', text)  # Remove punctuation/special chars
        text = re.sub(r'\s+', ' ', text)         # Remove extra spaces
        text = text.lower()                      # Lowercase
        documents.append(text)

# Step 2: bag-of-words term counts (one row per document)
bow_vectorizer = CountVectorizer()
X = bow_vectorizer.fit_transform(documents)

# Step 3: display the result.
# Read the vocabulary from the vectorizer fitted in this cell so the cell
# also works on a fresh kernel and always matches the columns of X.
print("Feature names (vocabulary):")
print(bow_vectorizer.get_feature_names_out())

print("\nBOW  matrix:")
print(X.toarray())


Feature names (vocabulary):
['about' 'across' 'act' 'actions' 'acts' 'advanced' 'after' 'again'
 'alliance' 'alliances' 'also' 'among' 'an' 'and' 'announced' 'archery'
 'are' 'arena' 'as' 'audience' 'balance' 'be' 'become' 'becomes' 'before'
 'begin' 'begins' 'berries' 'best' 'both' 'boy' 'bravery' 'but' 'by'
 'called' 'can' 'capitol' 'captivated' 'cave' 'change' 'choose' 'chosen'
 'compassion' 'compete' 'complicated' 'confused' 'consists' 'continues'
 'contrast' 'control' 'convince' 'dangers' 'deadlier' 'death' 'declare'
 'declares' 'defiance' 'defies' 'destroyed' 'determination' 'district'
 'districts' 'during' 'dystopian' 'each' 'eat' 'entering' 'even' 'event'
 'everdeen' 'every' 'exists' 'expects' 'face' 'feelings' 'female' 'fight'
 'finnick' 'fireballs' 'firsthand' 'for' 'forced' 'forces' 'former'
 'forms' 'friend' 'from' 'gain' 'gale' 'gamemakers' 'games' 'girl' 'give'
 'growing' 'guilt' 'has' 'he' 'helps' 'her' 'hide' 'his' 'home' 'honors'
 'hope' 'horizon' 'hosts' 'however' 'hu

In [12]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer

file_names = ['ai1.txt', 'ai2.txt', 'ai3.txt']
documents = []

# Step 1: read each file and normalise its text
for file in file_names:
    with open(file, 'r', encoding='utf-8') as f:
        text = f.read()
        text = re.sub(r'[^A-Za-z\s]', '', text)  # Remove punctuation/special chars
        text = re.sub(r'\s+', ' ', text)         # Remove extra spaces
        text = text.lower()                      # Lowercase
        documents.append(text)

# Step 2: TF-IDF weighted document-term matrix
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(documents)

# Step 3: display the result.
# Read the vocabulary from the vectorizer fitted in this cell so the cell
# also works on a fresh kernel and always matches the columns of X.
print("Feature names (vocabulary):")
print(tfidf.get_feature_names_out())

# Leading "\n" (not "\t"): a "\t" escape would swallow the 't' of "tfidf"
# and print a tab instead of the blank line used before the matrix.
print("\ntfidf  matrix:")
print(X.toarray())


Feature names (vocabulary):
['about' 'across' 'act' 'actions' 'acts' 'advanced' 'after' 'again'
 'alliance' 'alliances' 'also' 'among' 'an' 'and' 'announced' 'archery'
 'are' 'arena' 'as' 'audience' 'balance' 'be' 'become' 'becomes' 'before'
 'begin' 'begins' 'berries' 'best' 'both' 'boy' 'bravery' 'but' 'by'
 'called' 'can' 'capitol' 'captivated' 'cave' 'change' 'choose' 'chosen'
 'compassion' 'compete' 'complicated' 'confused' 'consists' 'continues'
 'contrast' 'control' 'convince' 'dangers' 'deadlier' 'death' 'declare'
 'declares' 'defiance' 'defies' 'destroyed' 'determination' 'district'
 'districts' 'during' 'dystopian' 'each' 'eat' 'entering' 'even' 'event'
 'everdeen' 'every' 'exists' 'expects' 'face' 'feelings' 'female' 'fight'
 'finnick' 'fireballs' 'firsthand' 'for' 'forced' 'forces' 'former'
 'forms' 'friend' 'from' 'gain' 'gale' 'gamemakers' 'games' 'girl' 'give'
 'growing' 'guilt' 'has' 'he' 'helps' 'her' 'hide' 'his' 'home' 'honors'
 'hope' 'horizon' 'hosts' 'however' 'hu