In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer

In [41]:
from pathlib import Path
# Directory holding the CSV files, resolved relative to the working directory;
# it is created (with parents) if it does not exist yet.
DATA_PATH = Path("../data")
DATA_PATH.mkdir(exist_ok=True, parents=True)

def load_data(filename, data_path=DATA_PATH,encoding='ISO-8859-1'):
    """Read a CSV file into a DataFrame.

    Args:
        filename: Name of the CSV file, joined onto ``data_path``.
        data_path: Directory (path-like supporting ``/``) containing the file.
        encoding: Text encoding handed to ``pandas.read_csv``.

    Returns:
        The DataFrame produced by ``pandas.read_csv``.
    """
    return pd.read_csv(data_path / filename, encoding=encoding)

def save_data(data, filename, data_path=DATA_PATH,encoding='ISO-8859-1'):
    """Write a DataFrame to ``data_path / filename`` as CSV, without the index.

    Args:
        data: Object exposing ``to_csv`` (a pandas DataFrame).
        filename: Name of the CSV file, joined onto ``data_path``.
        data_path: Directory (path-like supporting ``/``) to write into.
        encoding: Text encoding of the written file. Defaults to
            ``'ISO-8859-1'``, matching the default of ``load_data``.
    """
    csv_path = data_path / filename
    # Forward the caller's encoding so the parameter actually takes effect.
    data.to_csv(csv_path, index=False, encoding=encoding)

# Directory that save_fig() writes into, resolved relative to the working
# directory; it is created (with parents) if it does not exist yet.
PLOT_PATH = Path("../plot")
PLOT_PATH.mkdir(exist_ok=True, parents=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300, transparent=True):
    """Save the current matplotlib figure under ``PLOT_PATH``.

    Args:
        fig_id: Base name of the output file (also echoed to stdout).
        tight_layout: When truthy, call ``plt.tight_layout()`` before saving.
        fig_extension: File extension, also passed as the ``format`` argument.
        resolution: Value passed as ``dpi``.
        transparent: Passed through to ``plt.savefig``.
    """
    destination = PLOT_PATH / f"{fig_id}.{fig_extension}"
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(
        destination,
        format=fig_extension,
        dpi=resolution,
        transparent=transparent,
    )

In [42]:
# Load the CSV from DATA_PATH using load_data's default encoding.
df = load_data(filename="processed_rev1.csv")


# Model Improvement

In [43]:
from nltk import pos_tag, word_tokenize
from nltk.util import ngrams

# Make sure the NLTK data packages this notebook relies on are installed
# (tagger model first, then the punkt tokenizer data).
for nltk_resource in ('averaged_perceptron_tagger', 'punkt'):
    nltk.download(nltk_resource)

# Tag sequences (as produced by pos_tag) that mark a bigram worth keeping.
_BIGRAM_TAG_PATTERNS = frozenset({
    ('JJ', 'NN'),  # Adjective-Noun
    ('NN', 'NN'),  # Noun-Noun
    ('RB', 'JJ'),  # Adverb-Adjective (no check for negations here)
    ('VB', 'RB'),  # Verb-Adverb
    ('VB', 'NN'),  # Verb-Noun
})

# Tag sequences that mark a trigram worth keeping.
_TRIGRAM_TAG_PATTERNS = frozenset({
    ('JJ', 'NN', 'NN'),  # Adjective-Noun-Noun
    ('RB', 'JJ', 'NN'),  # Adverb-Adjective-Noun
    ('NN', 'IN', 'NN'),  # Noun-Preposition-Noun
})


def _phrases_matching(tagged_tokens, size, tag_patterns):
    """Return the space-joined words of each `size`-gram whose tags are in `tag_patterns`."""
    matches = []
    for gram in ngrams(tagged_tokens, size):
        words, tags = zip(*gram)
        # Tags are compared exactly: any other tag string (e.g. 'NNS') does not match 'NN'.
        if tags in tag_patterns:
            matches.append(' '.join(words))
    return matches


def extract_extended_phrases(text):
    """Extract two- and three-word phrases whose POS tags match fixed patterns.

    The text is lower-cased, tokenized with ``word_tokenize`` and tagged with
    ``pos_tag``. Every bigram whose tag pair is in ``_BIGRAM_TAG_PATTERNS`` and
    every trigram whose tag triple is in ``_TRIGRAM_TAG_PATTERNS`` is kept.

    Args:
        text: The document to analyse. Non-string values (for example a
            missing cell read by pandas as NaN, or ``None``) and blank strings
            yield no phrases instead of raising.

    Returns:
        A list of phrases (words joined by a single space): all matching
        bigrams in document order, followed by all matching trigrams in
        document order. Duplicates are preserved.
    """
    # Guard against missing values so the function can be mapped over a
    # DataFrame column that contains NaN/None without crashing.
    if not isinstance(text, str) or not text.strip():
        return []

    tagged_tokens = pos_tag(word_tokenize(text.lower()))

    phrases = _phrases_matching(tagged_tokens, 2, _BIGRAM_TAG_PATTERNS)
    phrases += _phrases_matching(tagged_tokens, 3, _TRIGRAM_TAG_PATTERNS)
    return phrases


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\TYS\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\TYS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [44]:
# Pair each description's extracted phrases with its label. Zipping the two
# columns reads the values directly instead of materialising a Series per row,
# which is what DataFrame.iterrows() does.
documents = [
    (extract_extended_phrases(description), target)
    for description, target in zip(df['processed_FullDescription'], df['Target'])
]

In [45]:
# Flatten the per-document phrase lists into one list (labels are not needed here)
all_phrases = [phrase for phrase_list, _ in documents for phrase in phrase_list]

In [46]:
# Count how often each extracted phrase occurs across all documents
from nltk import FreqDist
all_phrases_freq = FreqDist(all_phrases)

# Keep the 2000 most frequent phrases as the feature vocabulary
top_phrase_counts = all_phrases_freq.most_common(2000)
word_features = [entry[0] for entry in top_phrase_counts]

In [47]:
def document_features(document, word_features):
    """Build a presence/absence feature dict for one document.

    Args:
        document: Iterable of hashable items (here: phrases) found in the document.
        word_features: Iterable of the items to test for.

    Returns:
        A dict mapping ``'contains(<item>)'`` to ``True`` when the item occurs
        in ``document`` and ``False`` otherwise, in ``word_features`` order.
    """
    present = set(document)  # set gives constant-time membership checks
    return {f'contains({candidate})': candidate in present for candidate in word_features}

In [48]:
# Turn every (phrases, label) pair into a (feature dict, label) pair
featuresets = []
for phrases, label in documents:
    featuresets.append((document_features(phrases, word_features), label))

# Hold out the first 100 documents for testing and train on the remainder.
# NOTE(review): the split is positional with no shuffling - confirm the row
# order of the data is not correlated with the label.
test_set = featuresets[:100]
train_set = featuresets[100:]

In [49]:
# Train NLTK's Naive Bayes classifier on the phrase-presence features,
# then report its accuracy on the held-out test set
classifier = nltk.NaiveBayesClassifier.train(train_set)
test_accuracy = nltk.classify.accuracy(classifier, test_set)
print(test_accuracy)

0.77


# Incorporate other Features

In [50]:
# Extract the POS-pattern phrases of every description and store them as a
# single space-separated string per document
df['POS_features'] = df['processed_FullDescription'].apply(
    lambda description: ' '.join(extract_extended_phrases(description))
)

In [51]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import hstack
import pandas as pd

# Assuming df is your DataFrame
# Assuming 'POS_features' column exists and 'Target' is the label column

# One bag-of-words vectorizer per text source, so each keeps its own vocabulary
(category_vectorizer, location_vectorizer,
 title_vectorizer, pos_vectorizer) = (CountVectorizer() for _ in range(4))

# Learn each vocabulary and encode the corresponding column.
# NOTE(review): the vocabularies are learned from every row of df, i.e. before
# the train/test split below - confirm this is acceptable for the comparison.
category_features = category_vectorizer.fit_transform(df['Category'])
location_features = location_vectorizer.fit_transform(df['LocationNormalized'])
title_features = title_vectorizer.fit_transform(df['Title'])
pos_features = pos_vectorizer.fit_transform(df['POS_features'])

# Short names for the individual feature matrices
feature_blocks = {
    'Category': category_features,
    'Location': location_features,
    'Title': title_features,
    'POS': pos_features,
}

# Which matrices are stacked side by side for each experiment
combination_members = {
    'Category + Location': ('Category', 'Location'),
    'Category + Title': ('Category', 'Title'),
    'Category + POS': ('Category', 'POS'),
    'Location + Title': ('Location', 'Title'),
    'Location + POS': ('Location', 'POS'),
    'Title + POS': ('Title', 'POS'),
    'All Features': ('Category', 'Location', 'Title', 'POS'),
}
feature_combinations = {
    label: hstack([feature_blocks[member] for member in members])
    for label, members in combination_members.items()
}

# Labels shared by every experiment
y = df['Target']

# Train and score a fresh MultinomialNB per combination; the fixed random_state
# gives every combination the same 80/20 row split.
for combo_name, combo_features in feature_combinations.items():
    X_train, X_test, y_train, y_test = train_test_split(
        combo_features, y, test_size=0.2, random_state=42
    )
    classifier = MultinomialNB().fit(X_train, y_train)
    accuracy = classifier.score(X_test, y_test)
    print(f"Accuracy of {combo_name}: {accuracy}")


Accuracy of Category + Location: 0.722
Accuracy of Category + Title: 0.792
Accuracy of Category + POS: 0.76
Accuracy of Location + Title: 0.774
Accuracy of Location + POS: 0.764
Accuracy of Title + POS: 0.776
Accuracy of All Features: 0.768


# Implementing POS tagging on 'Category', 'LocationNormalized', and 'Title'

In [55]:
# For each source column, extract its POS-pattern phrases and store them as one
# space-separated string per row in a new "<name>_POS" column
for pos_column, source_column in (
    ('Category_POS', 'Category'),
    ('Location_POS', 'LocationNormalized'),
    ('Title_POS', 'Title'),
):
    df[pos_column] = df[source_column].apply(
        lambda value: ' '.join(extract_extended_phrases(value))
    )



In [57]:
# One independent CountVectorizer per phrase column, keyed by the column it
# encodes (extend the tuple to vectorize further string columns)
pos_columns = ('Category_POS', 'Location_POS', 'Title_POS', 'POS_features')
vectorizers = {column: CountVectorizer() for column in pos_columns}

# Learn each vocabulary and encode its column (fitted on every row of df,
# before any train/test split)
vectorized_features = {
    column: vectorizer.fit_transform(df[column])
    for column, vectorizer in vectorizers.items()
}

# Column groups to evaluate.
# NOTE(review): the first and third entries list the same two columns in a
# different order - confirm whether another pairing was intended.
feature_combinations = [
    ['POS_features', 'Location_POS'],
    ['POS_features', 'Title_POS'],
    ['Location_POS', 'POS_features'],
    ['Category_POS', 'Location_POS', 'Title_POS','POS_features'],
    ['Category_POS', 'POS_features'],
]

# Labels shared by every experiment
y = df['Target']

# Stack the chosen matrices column-wise, then fit and score a fresh MultinomialNB
# on an 80/20 split that is identical for every combination (fixed random_state)
for combo in feature_combinations:
    selected_blocks = [vectorized_features[column] for column in combo]
    combined_features = hstack(selected_blocks)
    X_train, X_test, y_train, y_test = train_test_split(
        combined_features, y, test_size=0.2, random_state=42
    )
    classifier = MultinomialNB().fit(X_train, y_train)
    accuracy = classifier.score(X_test, y_test)
    print(f"Accuracy with {', '.join(combo)}: {accuracy}")


Accuracy with POS_features, Location_POS: 0.768
Accuracy with POS_features, Title_POS: 0.766
Accuracy with Location_POS, POS_features: 0.768
Accuracy with Category_POS, Location_POS, Title_POS, POS_features: 0.774
Accuracy with Category_POS, POS_features: 0.762
