ag news dataset without bigran

In [25]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import fasttext
import nltk
from nltk.stem import WordNetLemmatizer

# Download required NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load the training and test datasets
train_df = pd.read_csv("ag_train.csv")
test_df = pd.read_csv("ag_test.csv")

# Rename all three columns
train_df.columns = ['label', 'content', 'extra_feature']
test_df.columns = ['label', 'content', 'extra_feature']

# Check for missing values and drop them if necessary
train_df = train_df.dropna(subset=['label', 'content'])
test_df = test_df.dropna(subset=['label', 'content'])

# Preprocess the text
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

train_df['cleaned_content'] = train_df['content'].apply(preprocess_text)
test_df['cleaned_content'] = test_df['content'].apply(preprocess_text)

# Remove empty rows after preprocessing
train_df = train_df[train_df['cleaned_content'].str.strip() != '']
test_df = test_df[test_df['cleaned_content'].str.strip() != '']

# Format the labels
def format_labels(label):
    return f"__label__{label}"

train_df['formatted_labels'] = train_df['label'].apply(format_labels)
test_df['formatted_labels'] = test_df['label'].apply(format_labels)

# Save training and test data for FastText
with open("train_ag.txt", "w") as f:
    for label, content in zip(train_df['formatted_labels'], train_df['cleaned_content']):
        f.write(f"{label} {content}\n")

with open("test_ag.txt", "w") as f:
    for label, content in zip(test_df['formatted_labels'], test_df['cleaned_content']):
        f.write(f"{label} {content}\n")

# Check if the training file has valid content
with open("train_ag.txt", "r") as f:
    for i in range(5):
        print(f.readline())

# Train the FastText model with tuned parameters (dropout removed)
model = fasttext.train_supervised(
    input="train_ag.txt", 
    dim=10, 
)

# Save the model
model.save_model("ag_model.bin")

# Test the FastText model
result = model.test("test_ag.txt")
print(f"Number of examples: {result[0]}")
print(f"Precision (Accuracy): {result[1]}")
print(f"Recall: {result[2]}")

# Calculate and print accuracy
accuracy = result[1]  # Precision is the accuracy
print(f"Test Accuracy: {accuracy:.4f}")


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/puneethreddy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/puneethreddy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/puneethreddy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


__label__3 wall st bear claw back black reuters

__label__3 carlyle look toward commercial aerospace reuters

__label__3 oil economy cloud stock outlook reuters

__label__3 iraq halt oil export main southern pipeline reuters

__label__3 oil price soar alltime record posing new menace u economy afp



Read 0M words
Number of words:  32552
Number of labels: 4
Progress:  87.6% words/sec/thread: 5247746 lr:  0.012398 avg.loss:  3.548162 ETA:   0h 0m 0s

Number of examples: 7600
Precision (Accuracy): 0.8468421052631578
Recall: 0.8468421052631578
Test Accuracy: 0.8468


Progress: 100.0% words/sec/thread: 2929070 lr:  0.000000 avg.loss:  3.314615 ETA:   0h 0m 0s


ag news datset with bigram

In [24]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import fasttext
import nltk
from nltk.stem import WordNetLemmatizer

# Download required NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load the training and test datasets
train_df = pd.read_csv("ag_train.csv")
test_df = pd.read_csv("ag_test.csv")

# Rename all three columns
train_df.columns = ['label', 'content', 'extra_feature']
test_df.columns = ['label', 'content', 'extra_feature']

# Check for missing values and drop them if necessary
train_df = train_df.dropna(subset=['label', 'content'])
test_df = test_df.dropna(subset=['label', 'content'])

# Preprocess the text
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

train_df['cleaned_content'] = train_df['content'].apply(preprocess_text)
test_df['cleaned_content'] = test_df['content'].apply(preprocess_text)

# Remove empty rows after preprocessing
train_df = train_df[train_df['cleaned_content'].str.strip() != '']
test_df = test_df[test_df['cleaned_content'].str.strip() != '']

# Format the labels
def format_labels(label):
    return f"__label__{label}"

train_df['formatted_labels'] = train_df['label'].apply(format_labels)
test_df['formatted_labels'] = test_df['label'].apply(format_labels)

# Save training and test data for FastText
with open("train_ag.txt", "w") as f:
    for label, content in zip(train_df['formatted_labels'], train_df['cleaned_content']):
        f.write(f"{label} {content}\n")

with open("test_ag.txt", "w") as f:
    for label, content in zip(test_df['formatted_labels'], test_df['cleaned_content']):
        f.write(f"{label} {content}\n")

# Check if the training file has valid content
with open("train_ag.txt", "r") as f:
    for i in range(5):
        print(f.readline())

# Train the FastText model with tuned parameters (dropout removed)
model = fasttext.train_supervised(
    input="train_ag.txt", 
    dim=10, 
    wordNgrams=2,  # Increased to 3-grams for richer context
    epoch=50,  # More epochs for better training
    lr=0.2,  # Higher learning rate for faster convergence
    minCount=5,  # Lower minCount to capture more words
    loss='softmax',  # Use softmax for better classification
    minn=2, maxn=5,  # Enable subword information
    bucket=2000000  # Use a large bucket for hashing subwords
)

# Save the model
model.save_model("ag_model.bin")

# Test the FastText model
result = model.test("test_ag.txt")
print(f"Number of examples: {result[0]}")
print(f"Precision (Accuracy): {result[1]}")
print(f"Recall: {result[2]}")

# Calculate and print accuracy
accuracy = result[1]  # Precision is the accuracy
print(f"Test Accuracy: {accuracy:.4f}")


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/puneethreddy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/puneethreddy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/puneethreddy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


__label__3 wall st bear claw back black reuters

__label__3 carlyle look toward commercial aerospace reuters

__label__3 oil economy cloud stock outlook reuters

__label__3 iraq halt oil export main southern pipeline reuters

__label__3 oil price soar alltime record posing new menace u economy afp



Read 0M words
Number of words:  10929
Number of labels: 4
Progress:  96.3% words/sec/thread: 1125892 lr:  0.007403 avg.loss:  0.102280 ETA:   0h 0m 0s

Number of examples: 7600
Precision (Accuracy): 0.863421052631579
Recall: 0.863421052631579
Test Accuracy: 0.8634


Progress: 100.0% words/sec/thread: 1125248 lr:  0.000000 avg.loss:  0.099552 ETA:   0h 0m 0s
