# NLP Assignment #2
### by Prodromos Kampouridis 

## B. Traditional Text Classification


### Import Libraries | Data


In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from time import time

# Read the training and test splits from CSV files in the working directory
train_path, test_path = 'train.csv', 'test.csv'
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

### Answers

### 1 + 2.

#### Convert title and description to lowercase

In [2]:
# Lowercase both text fields of each split; the same transformation is
# applied to train and test so they are preprocessed identically.
for frame in (train, test):
    for column in ('Title', 'Description'):
        frame[column] = frame[column].str.lower()

#### Combine title and description into a single text column

In [3]:
# Build the single 'text' column as "<title> <description>" for each split.
# Missing fields are replaced with an empty string first: string
# concatenation with NaN yields NaN, which would drop the surviving field
# and produce a non-string document for the vectorizer.
for frame in (train, test):
    frame['text'] = frame['Title'].fillna('') + ' ' + frame['Description'].fillna('')

#### Define the pipelines

In [4]:
def _build_pipeline(analyzer, ngram_range, classifier):
    """Return a two-step pipeline: TF-IDF features followed by a classifier.

    Parameters
    ----------
    analyzer : str
        Passed to ``TfidfVectorizer(analyzer=...)``; 'word' or 'char' here.
    ngram_range : tuple of (int, int)
        Passed to ``TfidfVectorizer(ngram_range=...)``.
    classifier : estimator
        The final estimator, stored under the step name 'clf'.

    Returns
    -------
    Pipeline
        Steps are named 'tfidf' and 'clf'.
    """
    return Pipeline([
        ('tfidf', TfidfVectorizer(analyzer=analyzer, ngram_range=ngram_range)),
        ('clf', classifier),
    ])


# Fixed seed for LinearSVC so repeated runs report the same scores
# (see the LinearSVC `random_state` parameter documentation).
SVM_RANDOM_STATE = 42

mnb_unigram_pipeline = _build_pipeline('word', (1, 1), MultinomialNB())
mnb_trigram_pipeline = _build_pipeline('char', (3, 3), MultinomialNB())
svm_unigram_pipeline = _build_pipeline(
    'word', (1, 1), LinearSVC(C=1, random_state=SVM_RANDOM_STATE))
svm_trigram_pipeline = _build_pipeline(
    'char', (3, 3), LinearSVC(C=1, random_state=SVM_RANDOM_STATE))

# The two lists are parallel: pipeline_names[k] is the display label of pipelines[k].
pipelines = [mnb_unigram_pipeline, mnb_trigram_pipeline, svm_unigram_pipeline, svm_trigram_pipeline]
pipeline_names = ['NB (word 1-grams)', 'NB (char 3-grams)', 'SVM (word 1-grams)', 'SVM (char 3-grams)']

#### Evaluating Accuracy, Dimensionality, and Time Cost of the Text Classification Models

In [5]:
# One list per metric; entry k belongs to pipelines[k] / pipeline_names[k].
results = {'Accuracy': [], 'Dimensionality': [], 'Time Cost': []}
# Test-set predictions of every model, keyed by its display name.
predictions_by_model = {}

for pipeline, name in zip(pipelines, pipeline_names):
    # Time only the fit (vectorizer + classifier); prediction is not included.
    start_time = time()
    pipeline.fit(train['text'], train['Class Index'])
    end_time = time()
    time_cost = end_time - start_time

    # Predict on the test set and keep the predictions for the error analysis
    preds = pipeline.predict(test['text'])
    predictions_by_model[name] = preds

    # Fraction of test texts labelled correctly
    accuracy = accuracy_score(test['Class Index'], preds)
    # Number of distinct n-grams learned by the fitted TF-IDF step
    dimensionality = len(pipeline.named_steps['tfidf'].vocabulary_)

    results['Accuracy'].append(accuracy)
    results['Dimensionality'].append(dimensionality)
    results['Time Cost'].append(time_cost)

# Expose the per-model predictions under the variable names used by the
# error-analysis cells. The unpacking is positional (it follows the order of
# pipeline_names), so it does not depend on the exact label strings.
(mnb_unigram_preds, mnb_trigram_preds,
 svm_unigram_preds, svm_trigram_preds) = (predictions_by_model[name] for name in pipeline_names)

#### Create the DataFrame with the results and display it

In [6]:
# Rows are models, columns are metrics; transpose for display so that each
# model becomes a column and each metric a row.
results_df = pd.DataFrame(data=results, index=pipeline_names)
results_by_metric = results_df.transpose()
display(results_by_metric)

Unnamed: 0,NB (word 1-grams),NB (char 3-grams),SVM (word 1-grams),SVM (char 3-grams)
Accuracy,0.902237,0.868684,0.919605,0.912105
Dimensionality,64999.0,31074.0,64999.0,31074.0
Time Cost,4.826115,26.547196,15.030112,35.974033


### 3.

#### Find the indices of the texts that all four models misclassify

In [7]:
# Positions (0-based row numbers) of the test texts that every model gets wrong.
# The comparison is done on plain NumPy arrays so that it is purely positional:
# the prediction arrays are positional, and later cells index `test` with
# `.iloc`, so a label-based lookup on the Series would only agree with them
# when `test` has the default 0..n-1 index.
true_labels = test['Class Index'].to_numpy()
all_wrong_mask = np.logical_and.reduce([
    np.asarray(model_preds) != true_labels
    for model_preds in (mnb_unigram_preds, mnb_trigram_preds,
                        svm_unigram_preds, svm_trigram_preds)
])
# Kept as a plain list of ints, as downstream cells index and iterate over it.
misclassified_indices = np.flatnonzero(all_wrong_mask).tolist()

#### Show the total number of texts misclassified by all four models

In [8]:
# Size of the set of test texts that all four models got wrong
n_misclassified = len(misclassified_indices)
print('Total number of misclassified texts:', n_misclassified)

Total number of misclassified texts: 341


#### Show a text that was misclassified by all four models

In [10]:
# Print the first text that every model misclassified, if there is one
if len(misclassified_indices) == 0:
    print('No texts were misclassified by all four models.')
else:
    first_position = misclassified_indices[0]
    print('Example of misclassified text:')
    print(test.iloc[first_position]['text'])

Example of misclassified text:
rivals try to turn tables on charles schwab by michael liedtke     san francisco (ap) -- with its low prices and iconoclastic attitude, discount stock broker charles schwab corp. (sch) represented an annoying stone in wall street's wing-tipped shoes for decades...


#### Count the number of misclassified texts per category

In [11]:
# Class id -> readable category name
category_names = {1: 'World', 2: 'Sports', 3: 'Business', 4: 'Sci/Tech'}

# True labels of the texts that all four models got wrong, tallied per class
# and ordered by class id before the ids are swapped for names.
misclassified_labels = test.iloc[misclassified_indices]['Class Index']
category_counts = misclassified_labels.value_counts().sort_index()
category_counts.index = category_counts.index.map(category_names)

print('Number of misclassified texts per category:')
print(category_counts)

Number of misclassified texts per category:
World       112
Sports        9
Business    135
Sci/Tech     85
Name: Class Index, dtype: int64


#### Find the most common pair of correct category - wrong prediction (each misclassified text contributes one pair per model, so counts can exceed the number of texts)

In [12]:
# Class id -> readable category name
label_names = {1: 'World', 2: 'Sports', 3: 'Business', 4: 'Sci/Tech'}
model_predictions = (mnb_unigram_preds, mnb_trigram_preds, svm_unigram_preds, svm_trigram_preds)

# One (true label, predicted label) pair per model for every text that all
# four models misclassified, i.e. four pairs per text.
pairs = []
for i in misclassified_indices:
    true_label = test.iloc[i]['Class Index']
    pairs.extend((true_label, model_preds[i]) for model_preds in model_predictions)

# value_counts sorts by frequency, so the first entry is the most common pair
pairs_counts = pd.Series(pairs).value_counts()
most_common_pair = pairs_counts.index[0]
num_times_found = pairs_counts.iloc[0]

true_id, predicted_id = most_common_pair[0], most_common_pair[1]
most_common_category = label_names[true_id]
most_common_wrong_prediction = label_names[predicted_id]
print(f"The most common pair of correct category - wrong prediction and the number of times found: ([{most_common_category}, {most_common_wrong_prediction}], {num_times_found})")

The most common pair of correct category - wrong prediction and the number of times found: ([Business, Sci/Tech], 381)
