In [9]:
import pandas as pd

# Read the headlines CSV (path is relative to the notebook's working directory)
DATA_PATH = "abcnews-date-text1.csv"
data = pd.read_csv(DATA_PATH)

# Preview the first five rows as plain text
print(data.head(n=5))



   publish_date                                      headline_text
0      20030219  aba decides against community broadcasting lic...
1      20030219     act fire witnesses must be aware of defamation
2      20030219     a g calls for infrastructure protection summit
3      20030219           air nz staff in aust strike for pay rise
4      20030219      air nz strike to affect australian travellers


# Bag-of-Words (BoW):

In [10]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
# NOTE(review): the prediction target is `publish_date`, and every accuracy
# recorded for this notebook is 1.0 -- worth confirming that the label column
# holds more than one distinct value in this file before reading anything into
# the comparison below.
X_train, X_test, y_train, y_test = train_test_split(
    data["headline_text"],
    data["publish_date"],
    random_state=42,
    test_size=0.2,
)

# --- Feature extraction -----------------------------------------------------
# Unigram counts: default CountVectorizer settings.
vectorizer = CountVectorizer()
# ngram_range=(1, 2) keeps the unigrams and adds bigrams to the same vocabulary.
bigram_vectorizer = CountVectorizer(ngram_range=(1, 2))

# Vocabularies are learned on the training split only, then applied to the test split.
X_train_bow = vectorizer.fit_transform(X_train)
X_train_bigram = bigram_vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)
X_test_bigram = bigram_vectorizer.transform(X_test)

# --- Classification ---------------------------------------------------------
# One Naive Bayes estimator is reused: it is fitted on the unigram features,
# scored, then fitted again on the unigram+bigram features.
clf = MultinomialNB()

y_pred = clf.fit(X_train_bow, y_train).predict(X_test_bow)
accuracy_bow = accuracy_score(y_test, y_pred)

y_pred_bigram = clf.fit(X_train_bigram, y_train).predict(X_test_bigram)
accuracy_bigram = accuracy_score(y_test, y_pred_bigram)

# --- Report -----------------------------------------------------------------
print(
    f"Accuracy (BoW - Unigram): {accuracy_bow}",
    f"Accuracy (BoW - Bigram): {accuracy_bigram}",
    sep="\n",
)


Accuracy (BoW - Unigram): 1.0
Accuracy (BoW - Bigram): 1.0


# Advantages and Limitations of BoW:

Advantages:

Simple and easy to implement.
Captures the presence and frequency of words in a document.
Works well for tasks where word order is not crucial.


Limitations:

Ignores word semantics and context.
Treats each word as independent, losing the sequence information.
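Sensitive to stop words: very frequent but uninformative words (e.g. "the", "of") can dominate the counts unless they are removed or down-weighted.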

# N-grams:

In [11]:
# Widen the n-gram window to 1..3 so unigrams, bigrams and trigrams
# all contribute columns to the count matrix.
trigram_vectorizer = CountVectorizer(ngram_range=(1, 3))
X_train_trigram = trigram_vectorizer.fit_transform(X_train)  # vocabulary learned on train only
X_test_trigram = trigram_vectorizer.transform(X_test)

# `clf`, the train/test split and the earlier accuracies all come from the
# Bag-of-Words cell; the same estimator is fitted again on the new features.
y_pred_trigram = clf.fit(X_train_trigram, y_train).predict(X_test_trigram)
accuracy_trigram = accuracy_score(y_test, y_pred_trigram)

# Side-by-side report of the three n-gram settings
print(
    f"Accuracy (BoW - Unigram): {accuracy_bow}",
    f"Accuracy (BoW - Bigram): {accuracy_bigram}",
    f"Accuracy (BoW - Trigram): {accuracy_trigram}",
    sep="\n",
)


Accuracy (BoW - Unigram): 1.0
Accuracy (BoW - Bigram): 1.0
Accuracy (BoW - Trigram): 1.0


# TF-IDF:

In [12]:
# Swap raw counts for TF-IDF weights (default TfidfVectorizer settings).
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)  # vocabulary and IDF learned on train only
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Reuse the estimator and split from the Bag-of-Words cell, fitting it again
# on the TF-IDF features before scoring on the held-out rows.
y_pred_tfidf = clf.fit(X_train_tfidf, y_train).predict(X_test_tfidf)
accuracy_tfidf = accuracy_score(y_test, y_pred_tfidf)

# Report unigram BoW next to TF-IDF
print(
    f"Accuracy (BoW): {accuracy_bow}",
    f"Accuracy (TF-IDF): {accuracy_tfidf}",
    sep="\n",
)


Accuracy (BoW): 1.0
Accuracy (TF-IDF): 1.0
