DBpedia dataset: fastText classifier with bigrams (wordNgrams=2)

In [44]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import fasttext
import nltk

# Make sure the NLTK resources this notebook relies on are available locally
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

# Read the DBpedia training and test splits from CSV
train_df = pd.read_csv("dbpedia_train.csv")
test_df = pd.read_csv("dbpedia_test.csv")

# Give both frames the same three column names
for frame in (train_df, test_df):
    frame.columns = ['label', 'title', 'content']

# English stop-word set consulted by preprocess_text (set membership is a fast lookup)
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalize a raw document into a space-separated string of kept tokens.

    The text is lowercased, every character that is not a-z or whitespace is
    deleted, the remainder is tokenized with ``word_tokenize``, and tokens
    found in the module-level ``stop_words`` set are dropped.

    Args:
        text: Raw document text. Any non-string value (for example the float
            NaN pandas produces for an empty CSV field, or None) is treated
            as an empty document.

    Returns:
        str: The surviving tokens joined by single spaces; '' when nothing
        survives or when ``text`` is not a string.
    """
    # Missing values are not strings; calling .lower() on them would raise
    # AttributeError and abort the whole Series.apply, so map them to ''.
    if not isinstance(text, str):
        return ''
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and digits (anything other than a-z or whitespace)
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stop words
    tokens = [word for word in tokens if word not in stop_words]
    # Join tokens back into a string
    return ' '.join(tokens)

# Add a 'cleaned_content' column holding the preprocessed text of each split
for frame in (train_df, test_df):
    frame['cleaned_content'] = frame['content'].apply(preprocess_text)

# Format the labels
def format_labels(label):
    """Return ``label`` prefixed with the ``__label__`` marker."""
    return "__label__{}".format(label)

# Build the prefixed label string for every row of both splits
train_df['formatted_labels'] = train_df['label'].apply(format_labels)
test_df['formatted_labels'] = test_df['label'].apply(format_labels)

# Write each split as one "<label> <cleaned text>" line per example
for out_path, frame in (("train_dbpedia.txt", train_df), ("test_dbpedia.txt", test_df)):
    with open(out_path, "w") as f:
        for label, content in zip(frame['formatted_labels'], frame['cleaned_content']):
            f.write(f"{label} {content}\n")

# Fit a supervised fastText classifier on the formatted training file
model = fasttext.train_supervised(
    input="train_dbpedia.txt",
    lr=0.5,           # learning rate
    epoch=15,         # number of training epochs
    dim=10,           # embedding dimension
    wordNgrams=2,     # include bigrams
    minCount=5        # minimum count of words to be considered
)

# Persist the trained model to disk
model.save_model("dbpedia_model.bin")

# Evaluate on the formatted test file; the result is indexed as
# (number of examples, precision, recall)
result = model.test("test_dbpedia.txt")
num_examples, precision, recall = result
print(f"Number of examples: {num_examples}")
print(f"Precision (Accuracy): {precision}")
print(f"Recall: {recall}")

# The precision value is reported again as the accuracy, to four decimals
accuracy = precision
print(f"Test Accuracy: {accuracy:.4f}")


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/puneethreddy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/puneethreddy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Read 16M words
Number of words:  120791
Number of labels: 14
Progress: 100.0% words/sec/thread: 4708104 lr:  0.000000 avg.loss:  0.011751 ETA:   0h 0m 0s


Number of examples: 70000
Precision (Accuracy): 0.9790428571428571
Recall: 0.9790428571428571
Test Accuracy: 0.9790


DBpedia dataset: fastText baseline without bigrams (unigrams only, wordNgrams=1)

In [51]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import fasttext
import nltk

# Ensure the NLTK data packages used in this cell are present
for package_name in ['punkt', 'stopwords']:
    nltk.download(package_name)

# Load the DBpedia splits; both are then given identical column names
train_df = pd.read_csv("dbpedia_train.csv")
train_df.columns = ['label', 'title', 'content']

test_df = pd.read_csv("dbpedia_test.csv")
test_df.columns = ['label', 'title', 'content']

# Set of English stop words that preprocess_text filters out
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalize a raw document into a space-separated string of kept tokens.

    The text is lowercased, every character that is not a-z or whitespace is
    deleted, the remainder is tokenized with ``word_tokenize``, and tokens
    found in the module-level ``stop_words`` set are dropped.

    Args:
        text: Raw document text. Any non-string value (for example the float
            NaN pandas produces for an empty CSV field, or None) is treated
            as an empty document.

    Returns:
        str: The surviving tokens joined by single spaces; '' when nothing
        survives or when ``text`` is not a string.
    """
    # Missing values are not strings; calling .lower() on them would raise
    # AttributeError and abort the whole Series.apply, so map them to ''.
    if not isinstance(text, str):
        return ''
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and digits (anything other than a-z or whitespace)
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stop words
    tokens = [word for word in tokens if word not in stop_words]
    # Join tokens back into a string
    return ' '.join(tokens)

# Store the preprocessed text of each split in a new 'cleaned_content' column
train_cleaned = train_df['content'].apply(preprocess_text)
train_df['cleaned_content'] = train_cleaned
test_cleaned = test_df['content'].apply(preprocess_text)
test_df['cleaned_content'] = test_cleaned

# Format the labels
def format_labels(label):
    """Prefix ``label`` with ``__label__`` and return the resulting string."""
    formatted = "__label__{}".format(label)
    return formatted

# Derive the prefixed label column for both splits
for frame in (train_df, test_df):
    frame['formatted_labels'] = frame['label'].apply(format_labels)

# Dump each split to its text file, one "<label> <cleaned text>" line per row
outputs = {"train_dbpedia.txt": train_df, "test_dbpedia.txt": test_df}
for out_path, frame in outputs.items():
    with open(out_path, "w") as f:
        for label, content in zip(frame['formatted_labels'], frame['cleaned_content']):
            f.write(f"{label} {content}\n")

# Train the unigram baseline fastText model (no bigram features)
model = fasttext.train_supervised(
    input="train_dbpedia.txt",
    dim=10,           # Embedding dimension
    wordNgrams=1,     # Unigrams only
    epoch=20,         # Number of training epochs
    lr=0.5,           # Learning rate
    minCount=5        # Minimum count of words to be considered
)

# Save the model
# NOTE(review): this is the same path the bigram run saves to, so running both
# cells leaves only the most recent model on disk; confirm that is intended.
model.save_model("dbpedia_model.bin")

# Test the FastText model on the test file written by this cell.
# The previous path "dbpedia_test.txt" is never produced here, so the metrics
# depended on whatever stale file happened to exist on disk.
result = model.test("test_dbpedia.txt")
print(f"Number of examples: {result[0]}")
print(f"Precision (Accuracy): {result[1]}")
print(f"Recall: {result[2]}")

# Calculate and print accuracy
accuracy = result[1]  # Precision is the accuracy in FastText
print(f"Test Accuracy: {accuracy:.4f}")


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/puneethreddy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/puneethreddy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Read 16M words
Number of words:  120791
Number of labels: 14
Progress: 100.0% words/sec/thread: 6297305 lr:  0.000000 avg.loss:  0.017418 ETA:   0h 0m 0s


Number of examples: 70000
Precision (Accuracy): 0.9675571428571429
Recall: 0.9675571428571429
Test Accuracy: 0.9676
