Yahoo dataset with bigrams

In [7]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import fasttext
import nltk
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK data this cell relies on: tokenizer models, the stop-word
# list and the WordNet corpus used by the lemmatizer.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' rather than
# 'punkt' when tokenizing — confirm against the installed version.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Both CSV files get the same four column names; the code in this cell only
# reads 'label' and 'content'.
column_names = ['label', 'content', 'extra_feature', 'extra_feature_2']
required_columns = ['label', 'content']

# Training split: load, rename the columns, drop rows missing a label or text.
train_df = pd.read_csv("yahoo_train.csv")
train_df.columns = column_names
train_df = train_df.dropna(subset=required_columns)

# Test split: identical treatment.
test_df = pd.read_csv("yahoo_test.csv")
test_df.columns = column_names
test_df = test_df.dropna(subset=required_columns)

# Resources shared by preprocess_text; the stop words are held in a set so
# the per-token membership test is cheap.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalise one document into a space-separated string of lemmas.

    Steps, in order:
      1. lower-case the text;
      2. delete every character that is not a-z or whitespace (characters are
         removed, not replaced by a space, so "what's" becomes "whats");
      3. tokenize with NLTK's word_tokenize;
      4. drop tokens found in the module-level `stop_words` set;
      5. lemmatize the remaining tokens with the module-level `lemmatizer`.

    Args:
        text: the raw document; must support `.lower()` (i.e. a str).

    Returns:
        The surviving lemmas joined by single spaces; an empty string when
        nothing survives.
    """
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())

    kept = []
    for token in word_tokenize(letters_only):
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))

    return ' '.join(kept)

# Run the text normaliser over the 'content' column of each split.
train_df['cleaned_content'] = train_df['content'].apply(preprocess_text)
test_df['cleaned_content'] = test_df['content'].apply(preprocess_text)

# Preprocessing can strip a document down to nothing; keep only the rows
# whose cleaned text is non-empty.
train_has_text = train_df['cleaned_content'].str.strip().ne('')
test_has_text = test_df['cleaned_content'].str.strip().ne('')
train_df = train_df.loc[train_has_text]
test_df = test_df.loc[test_has_text]

# Format the labels
def format_labels(label):
    """Return the fastText label token (``__label__<label>``) for `label`.

    Integral floats are rendered without the trailing ``.0``. pandas stores a
    numeric column as float as soon as it contains a missing value, so a label
    read as ``6.0`` in one split and ``6`` in the other would otherwise yield
    two different tokens (``__label__6.0`` vs ``__label__6``).

    Args:
        label: the class identifier (int, float or str).

    Returns:
        The label prefixed with ``__label__``.
    """
    # numpy.float64 subclasses float, so values taken from a float-typed
    # pandas column are covered by this check as well.
    if isinstance(label, float) and label.is_integer():
        label = int(label)
    return f"__label__{label}"

def _write_fasttext_file(path, labels, texts):
    """Write one '<label> <text>' line per example to `path`."""
    with open(path, "w") as out:
        out.writelines(f"{tag} {body}\n" for tag, body in zip(labels, texts))


# Attach the fastText label token to every row of both splits.
train_df['formatted_labels'] = train_df['label'].apply(format_labels)
test_df['formatted_labels'] = test_df['label'].apply(format_labels)

# Dump each split to the plain-text file that fastText trains/tests on.
_write_fasttext_file("train_yahoo.txt", train_df['formatted_labels'], train_df['cleaned_content'])
_write_fasttext_file("test_yahoo.txt", test_df['formatted_labels'], test_df['cleaned_content'])

# Sanity check: echo the first five lines of the training file. Each line
# read keeps its trailing newline and print() appends another, hence the
# blank line between samples in the output.
with open("train_yahoo.txt", "r") as preview:
    first_lines = [preview.readline() for _ in range(5)]
for line in first_lines:
    print(line)

# Hyper-parameters for the supervised fastText classifier (bigram run).
train_params = {
    "input": "train_yahoo.txt",
    "dim": 10,          # word-vector size (fastText's documented default is 100, so this is a reduction)
    "wordNgrams": 2,    # use word n-grams up to length 2, i.e. unigrams and bigrams
    "epoch": 100,       # number of passes over the training file
    "lr": 0.1,          # learning rate
    "loss": 'softmax',  # softmax loss over the label set
}
model = fasttext.train_supervised(**train_params)

# Persist the trained model. The "without bigram" cell saves to this same
# path, so running both cells leaves only the later model on disk.
model.save_model("yahoo_model.bin")

# Evaluate on the held-out file; the result is indexed as
# (number of examples, precision, recall).
result = model.test("test_yahoo.txt")
n_examples, precision, recall = result[0], result[1], result[2]
print(f"Number of examples: {n_examples}")
print(f"Precision (Accuracy): {precision}")
print(f"Recall: {recall}")

# Every example carries exactly one label and a single prediction is scored
# per example, so precision doubles as the accuracy here.
accuracy = precision
print(f"Test Accuracy: {accuracy:.4f}")


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/puneethreddy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/puneethreddy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/puneethreddy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


__label__6 best offroad motorcycle trail

__label__3 trans fat reduce

__label__7 many plane fedex

__label__7 san francisco bay area make sense rent buy

__label__5 whats best way clean keyboard



Read 10M words
Number of words:  302639
Number of labels: 10
Progress: 100.0% words/sec/thread: 3439683 lr:  0.000000 avg.loss:  0.243856 ETA:   0h 0m 0s


Number of examples: 59814
Precision (Accuracy): 0.6369746213261109
Recall: 0.6369746213261109
Test Accuracy: 0.6370


Yahoo dataset without bigrams

In [8]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import fasttext
import nltk
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK data this cell relies on: tokenizer models, the stop-word
# list and the WordNet corpus used by the lemmatizer.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' rather than
# 'punkt' when tokenizing — confirm against the installed version.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Both CSV files get the same four column names; the code in this cell only
# reads 'label' and 'content'.
column_names = ['label', 'content', 'extra_feature', 'extra_feature_2']
required_columns = ['label', 'content']

# Training split: load, rename the columns, drop rows missing a label or text.
train_df = pd.read_csv("yahoo_train.csv")
train_df.columns = column_names
train_df = train_df.dropna(subset=required_columns)

# Test split: identical treatment.
test_df = pd.read_csv("yahoo_test.csv")
test_df.columns = column_names
test_df = test_df.dropna(subset=required_columns)

# Resources shared by preprocess_text; the stop words are held in a set so
# the per-token membership test is cheap.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalise one document into a space-separated string of lemmas.

    Steps, in order:
      1. lower-case the text;
      2. delete every character that is not a-z or whitespace (characters are
         removed, not replaced by a space, so "what's" becomes "whats");
      3. tokenize with NLTK's word_tokenize;
      4. drop tokens found in the module-level `stop_words` set;
      5. lemmatize the remaining tokens with the module-level `lemmatizer`.

    Args:
        text: the raw document; must support `.lower()` (i.e. a str).

    Returns:
        The surviving lemmas joined by single spaces; an empty string when
        nothing survives.
    """
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())

    kept = []
    for token in word_tokenize(letters_only):
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))

    return ' '.join(kept)

# Run the text normaliser over the 'content' column of each split.
train_df['cleaned_content'] = train_df['content'].apply(preprocess_text)
test_df['cleaned_content'] = test_df['content'].apply(preprocess_text)

# Preprocessing can strip a document down to nothing; keep only the rows
# whose cleaned text is non-empty.
train_has_text = train_df['cleaned_content'].str.strip().ne('')
test_has_text = test_df['cleaned_content'].str.strip().ne('')
train_df = train_df.loc[train_has_text]
test_df = test_df.loc[test_has_text]

# Format the labels
def format_labels(label):
    """Return the fastText label token (``__label__<label>``) for `label`.

    Integral floats are rendered without the trailing ``.0``. pandas stores a
    numeric column as float as soon as it contains a missing value, so a label
    read as ``6.0`` in one split and ``6`` in the other would otherwise yield
    two different tokens (``__label__6.0`` vs ``__label__6``).

    Args:
        label: the class identifier (int, float or str).

    Returns:
        The label prefixed with ``__label__``.
    """
    # numpy.float64 subclasses float, so values taken from a float-typed
    # pandas column are covered by this check as well.
    if isinstance(label, float) and label.is_integer():
        label = int(label)
    return f"__label__{label}"

def _write_fasttext_file(path, labels, texts):
    """Write one '<label> <text>' line per example to `path`."""
    with open(path, "w") as out:
        out.writelines(f"{tag} {body}\n" for tag, body in zip(labels, texts))


# Attach the fastText label token to every row of both splits.
train_df['formatted_labels'] = train_df['label'].apply(format_labels)
test_df['formatted_labels'] = test_df['label'].apply(format_labels)

# Dump each split to the plain-text file that fastText trains/tests on.
_write_fasttext_file("train_yahoo.txt", train_df['formatted_labels'], train_df['cleaned_content'])
_write_fasttext_file("test_yahoo.txt", test_df['formatted_labels'], test_df['cleaned_content'])

# Sanity check: echo the first five lines of the training file. Each line
# read keeps its trailing newline and print() appends another, hence the
# blank line between samples in the output.
with open("train_yahoo.txt", "r") as preview:
    first_lines = [preview.readline() for _ in range(5)]
for line in first_lines:
    print(line)

# Hyper-parameters for the supervised fastText classifier (unigram run):
# wordNgrams is not passed, so the library default applies.
train_params = {
    "input": "train_yahoo.txt",
    "dim": 10,          # word-vector size (fastText's documented default is 100, so this is a reduction)
    "epoch": 100,       # number of passes over the training file
    "lr": 0.1,          # learning rate
    "loss": 'softmax',  # softmax loss over the label set
}
model = fasttext.train_supervised(**train_params)

# Persist the trained model. This is the same path the "with bigram" cell
# saves to, so the model written there is overwritten here.
model.save_model("yahoo_model.bin")

# Evaluate on the held-out file; the result is indexed as
# (number of examples, precision, recall).
result = model.test("test_yahoo.txt")
n_examples, precision, recall = result[0], result[1], result[2]
print(f"Number of examples: {n_examples}")
print(f"Precision (Accuracy): {precision}")
print(f"Recall: {recall}")

# Every example carries exactly one label and a single prediction is scored
# per example, so precision doubles as the accuracy here.
accuracy = precision
print(f"Test Accuracy: {accuracy:.4f}")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/puneethreddy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/puneethreddy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/puneethreddy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


__label__6 best offroad motorcycle trail

__label__3 trans fat reduce

__label__7 many plane fedex

__label__7 san francisco bay area make sense rent buy

__label__5 whats best way clean keyboard



Read 10M words
Number of words:  302639
Number of labels: 10
Progress:  99.7% words/sec/thread: 3979520 lr:  0.000279 avg.loss:  0.834293 ETA:   0h 0m 0s

Number of examples: 59814
Precision (Accuracy): 0.6545290400240746
Recall: 0.6545290400240746
Test Accuracy: 0.6545


Progress: 100.0% words/sec/thread: 3979612 lr:  0.000000 avg.loss:  0.834197 ETA:   0h 0m 0s
