Yelp dataset: fastText classifier with bigrams (dim=10, wordNgrams=2)

In [7]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import fasttext
import nltk

# Make sure the NLTK tokenizer data and stop-word lists are available locally
nltk.download('punkt')
nltk.download('stopwords')

# Read the train split first, then the test split.
# NOTE(review): read_csv is called with its default header handling, so the
# first row of each file is consumed as a header before the columns are
# renamed below. The recorded run reports 37999 test examples — if the CSVs
# have no header row, one review per split is being dropped. TODO confirm.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ("yelp_train.csv", "yelp_test.csv")
)

# Both frames are expected to have exactly two columns; give them fixed names
train_df.columns = ['label', 'content']
test_df.columns = ['label', 'content']

# English stop words, held in a set for fast membership checks while cleaning
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalise one review into a space-separated string of content words.

    The text is lower-cased, every character other than ``a-z`` and
    whitespace is deleted, the remainder is tokenised with NLTK's
    ``word_tokenize``, and tokens present in the module-level ``stop_words``
    set are dropped.

    Args:
        text: Raw review text. Missing values (``None``/``NaN``, which is how
            pandas represents empty CSV fields) are treated as an empty
            review; any other non-string value is converted with ``str()``.

    Returns:
        str: The surviving tokens joined by single spaces, or ``''`` when
        nothing survives.
    """
    # Guard against non-string cells: calling .lower() on a float NaN raises
    # AttributeError and would abort the whole .apply() pass over the column.
    if not isinstance(text, str):
        text = '' if pd.isna(text) else str(text)

    text = text.lower()
    # Characters are deleted rather than replaced by a space, so "good,bad"
    # collapses to "goodbad" and "don't" becomes "dont".
    # NOTE(review): contractions stripped this way presumably no longer match
    # the stop-word list — confirm this normalisation is intended.
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = word_tokenize(text)
    return ' '.join(word for word in tokens if word not in stop_words)

# Add a 'cleaned_content' column to each split by passing every review
# through preprocess_text
train_df['cleaned_content'] = train_df['content'].map(preprocess_text)
test_df['cleaned_content'] = test_df['content'].map(preprocess_text)

# Format the labels
def format_labels(label):
    """Return ``label`` prefixed with ``__label__``.

    Args:
        label: Class label value; it is rendered with its default string
            formatting.

    Returns:
        str: ``"__label__"`` immediately followed by the formatted label.
    """
    return "__label__{}".format(label)

# Build the "__label__<value>" token for every row of both splits
train_df['formatted_labels'] = train_df['label'].apply(format_labels)
test_df['formatted_labels'] = test_df['label'].apply(format_labels)

# Write one "<label> <cleaned text>" line per example, one file per split
with open("train_yelp.txt", "w") as train_file:
    train_file.writelines(
        f"{label} {content}\n"
        for label, content in zip(train_df['formatted_labels'], train_df['cleaned_content'])
    )

with open("test_yelp.txt", "w") as test_file:
    test_file.writelines(
        f"{label} {content}\n"
        for label, content in zip(test_df['formatted_labels'], test_df['cleaned_content'])
    )

# Fit a supervised fastText classifier on the training file with
# 10-dimensional embeddings and wordNgrams=2 (bigrams included)
hyperparams = {"dim": 10, "wordNgrams": 2}
model = fasttext.train_supervised(input="train_yelp.txt", **hyperparams)

# Persist the fitted model to disk
model.save_model("yelp_model.bin")

# Score the held-out file; the result is read as (count, precision, recall)
result = model.test("test_yelp.txt")
n_examples, precision, recall = result[0], result[1], result[2]
print(f"Number of examples: {n_examples}")
print(f"Precision (Accuracy): {precision}")
print(f"Recall: {recall}")

# Every line written to the test file carries exactly one label, so the
# reported precision is used directly as the test accuracy
accuracy = precision
print(f"Test Accuracy: {accuracy:.4f}")


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/puneethreddy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/puneethreddy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Read 38M words
Number of words:  698628
Number of labels: 2
Progress: 100.0% words/sec/thread: 5664571 lr:  0.000000 avg.loss:  0.126737 ETA:   0h 0m 0s


Number of examples: 37999
Precision (Accuracy): 0.9440511592410327
Recall: 0.9440511592410327
Test Accuracy: 0.9441


Yelp dataset: baseline fastText classifier without bigrams (default hyperparameters)

In [9]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import fasttext
import nltk

# Make sure the NLTK tokenizer data and stop-word lists are available locally
nltk.download('punkt')
nltk.download('stopwords')

# Read the train split first, then the test split.
# NOTE(review): read_csv is called with its default header handling, so the
# first row of each file is consumed as a header before the columns are
# renamed below. The recorded run reports 37999 test examples — if the CSVs
# have no header row, one review per split is being dropped. TODO confirm.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ("yelp_train.csv", "yelp_test.csv")
)

# Both frames are expected to have exactly two columns; give them fixed names
train_df.columns = ['label', 'content']
test_df.columns = ['label', 'content']

# English stop words, held in a set for fast membership checks while cleaning
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalise one review into a space-separated string of content words.

    The text is lower-cased, every character other than ``a-z`` and
    whitespace is deleted, the remainder is tokenised with NLTK's
    ``word_tokenize``, and tokens present in the module-level ``stop_words``
    set are dropped.

    Args:
        text: Raw review text. Missing values (``None``/``NaN``, which is how
            pandas represents empty CSV fields) are treated as an empty
            review; any other non-string value is converted with ``str()``.

    Returns:
        str: The surviving tokens joined by single spaces, or ``''`` when
        nothing survives.
    """
    # Guard against non-string cells: calling .lower() on a float NaN raises
    # AttributeError and would abort the whole .apply() pass over the column.
    if not isinstance(text, str):
        text = '' if pd.isna(text) else str(text)

    text = text.lower()
    # Characters are deleted rather than replaced by a space, so "good,bad"
    # collapses to "goodbad" and "don't" becomes "dont".
    # NOTE(review): contractions stripped this way presumably no longer match
    # the stop-word list — confirm this normalisation is intended.
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = word_tokenize(text)
    return ' '.join(word for word in tokens if word not in stop_words)

# Add a 'cleaned_content' column to each split by passing every review
# through preprocess_text
train_df['cleaned_content'] = train_df['content'].map(preprocess_text)
test_df['cleaned_content'] = test_df['content'].map(preprocess_text)

# Format the labels
def format_labels(label):
    """Return ``label`` prefixed with ``__label__``.

    Args:
        label: Class label value; it is rendered with its default string
            formatting.

    Returns:
        str: ``"__label__"`` immediately followed by the formatted label.
    """
    return "__label__{}".format(label)

# Build the "__label__<value>" token for every row of both splits
train_df['formatted_labels'] = train_df['label'].apply(format_labels)
test_df['formatted_labels'] = test_df['label'].apply(format_labels)

# Write one "<label> <cleaned text>" line per example, one file per split
with open("train_yelp.txt", "w") as train_file:
    train_file.writelines(
        f"{label} {content}\n"
        for label, content in zip(train_df['formatted_labels'], train_df['cleaned_content'])
    )

with open("test_yelp.txt", "w") as test_file:
    test_file.writelines(
        f"{label} {content}\n"
        for label, content in zip(test_df['formatted_labels'], test_df['cleaned_content'])
    )

# Fit a supervised fastText classifier with the library's default
# hyperparameters: only the input file is passed, with no dim or wordNgrams
# override.
# NOTE(review): the bigram run earlier in this notebook trains with dim=10,
# whereas this run leaves dim at its default, so the two runs differ in more
# than the bigram setting — confirm that is intended for the comparison.
model = fasttext.train_supervised(input="train_yelp.txt")

# Persist the fitted model to disk.
# NOTE(review): this is the same "yelp_model.bin" path the bigram run saves
# to, so whichever cell runs last overwrites the other's model.
model.save_model("yelp_model.bin")

# Score the held-out file; the result is read as (count, precision, recall)
result = model.test("test_yelp.txt")
n_examples, precision, recall = result[0], result[1], result[2]
print(f"Number of examples: {n_examples}")
print(f"Precision (Accuracy): {precision}")
print(f"Recall: {recall}")

# Every line written to the test file carries exactly one label, so the
# reported precision is used directly as the test accuracy
accuracy = precision
print(f"Test Accuracy: {accuracy:.4f}")


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/puneethreddy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/puneethreddy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Read 38M words
Number of words:  698628
Number of labels: 2
Progress: 100.0% words/sec/thread: 5890395 lr:  0.000000 avg.loss:  0.214680 ETA:   0h 0m 0s

Number of examples: 37999
Precision (Accuracy): 0.9306034369325509
Recall: 0.9306034369325509
Test Accuracy: 0.9306


 0s
