In [1]:
# Task
# https://www.hackerrank.com/challenges/document-classification/problem

# Useful links
# https://scikit-learn.org/stable/auto_examples/text/plot_document_classification_20newsgroups.html#sphx-glr-auto-examples-text-plot-document-classification-20newsgroups-py
# https://www.docsumo.com/blog/document-classification

# !/bin/python3

import math
import os
import random
import re
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [142]:
# Settings
nrFeatures = 100 # If the text is too short it won't have enough features, maybe can set rest to 0?

In [143]:
with open("trainingdata.txt", "r") as file1:
    content = file1.readlines()

N = content[0] # Training data size
del content[0] # Remove from read data   

In [144]:
!pip install nltk



In [145]:
import nltk # natural language processing
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\00350974\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\00350974\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [146]:
# Move the data into X (feature matrix) and y (class array)

from nltk.tokenize import sent_tokenize #word_tokenize or sent_tokenize (best for longer texts)

from nltk.stem import PorterStemmer
stemmer = PorterStemmer() # Reduces the words to their base form

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer() # Reduces inflection from words - similar to stemming
y = []
X = []
for line in content:
    y.append(int(line[0]))
    processed_line = sent_tokenize(line[2:]) # Tokenize the words. Other preprocesses include stop word removal, POS tagging and chunking
#     processed_line = [stemmer.stem(i) for i in processed_line] # Takes long time, use if needed
    processed_line = [lemmatizer.lemmatize(i) for i in processed_line] 
    X.append(processed_line[0])

df = pd.DataFrame(zip(X, y),
           columns =['Text', 'Class'])
df.head()

Unnamed: 0,Text,Class
0,champion products ch approves stock split cham...,1
1,computer terminal systems cpml completes sale ...,2
2,cobanco inc cbco year net shr cts vs dlrs net ...,1
3,am international inc am nd qtr jan oper shr lo...,1
4,brown forman inc bfd th qtr net shr one dlr vs...,1


In [147]:
# # Visualize the words with a word cloud
# # !pip install WordCloud
# import wordcloud
# wordcloud_pos = wordcloud(width=200,
#                          height=500,
#                          max_font_size=150).generate(print(df[df['Class'] == 1]))

# plt.figure(figsize=(15, 10))
# plt.imshow(interpolation = "bilinear")
# plt.axis("off")
# plt.title(f"Most common words associated with non-toxic comment", size=20)
# plt.show()

In [148]:
from sklearn.preprocessing import MinMaxScaler, Normalizer 
from sklearn.feature_extraction.text import TfidfVectorizer # term frequency
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import warnings
warnings.filterwarnings('ignore')


In [183]:
X_train, X_test, y_train, y_test = train_test_split(df["Text"], df["Class"], test_size=0.10)

# https://realpython.com/python-keras-text-classification/

# - Bag Of Words -
# Create features from the data
tfid = TfidfVectorizer(lowercase=False, max_features=nrFeatures, stop_words ="english")  # term frequency
X_train_tfidf = tfid.fit_transform(X_train).toarray()
X_test_tfidf = tfid.transform(X_test).toarray()

# Normalize the vectors - probably not needed
norm_TFIDF = Normalizer(copy=False)
X_norm_train_tfidf = norm_TFIDF.fit_transform(X_train_tfidf)
X_norm_test_tfidf = norm_TFIDF.transform(X_test_tfidf)

SyntaxError: invalid syntax (2174299837.py, line 5)

In [186]:
print(X_norm_train_tfidf.shape)

(4936, 100)


In [170]:
y_test.head()

4196    1
3352    1
253     2
2279    2
4714    8
Name: Class, dtype: int64

In [171]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, precision_recall_curve

knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_norm_train_tfidf, y_train)
y_pred = knn_model.predict(X_norm_test_tfidf)

In [172]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.96      0.96      0.96       285
           2       0.90      0.91      0.90       161
           3       0.79      0.90      0.84        30
           4       0.60      0.25      0.35        12
           5       0.00      0.00      0.00         4
           6       0.95      0.95      0.95        21
           7       0.74      0.93      0.82        15
           8       0.70      0.67      0.68        21

    accuracy                           0.91       549
   macro avg       0.71      0.70      0.69       549
weighted avg       0.90      0.91      0.90       549



In [173]:
from sklearn.naive_bayes import GaussianNB
nb_model = GaussianNB()
nb_model.fit(X_norm_train_tfidf, y_train)
y_pred = nb_model.predict(X_norm_test_tfidf)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       1.00      0.89      0.94       285
           2       0.86      0.63      0.73       161
           3       0.67      0.40      0.50        30
           4       0.11      0.67      0.19        12
           5       0.05      0.25      0.08         4
           6       0.50      0.52      0.51        21
           7       0.48      0.80      0.60        15
           8       0.56      0.43      0.49        21

    accuracy                           0.75       549
   macro avg       0.53      0.57      0.51       549
weighted avg       0.86      0.75      0.79       549



In [174]:
from sklearn.svm import SVC # Support vector classifier
svm_model = SVC()
svm_model.fit(X_norm_train_tfidf, y_train)
y_pred = svm_model.predict(X_norm_test_tfidf)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.98      0.96      0.97       285
           2       0.90      0.96      0.92       161
           3       0.90      0.93      0.92        30
           4       0.83      0.42      0.56        12
           5       0.50      0.25      0.33         4
           6       0.87      0.95      0.91        21
           7       0.76      0.87      0.81        15
           8       0.84      0.76      0.80        21

    accuracy                           0.93       549
   macro avg       0.82      0.76      0.78       549
weighted avg       0.93      0.93      0.93       549



In [175]:
ml_model = svm_model

# Analyse training - only works for linear model

In [181]:
def plot_feature_effects(X_norm_train_tfidf, tfid, ml_model):
    # learned coefficients weighted by frequency of appearance
    average_feature_effects = ml_model.coef_ * np.asarray(X_norm_train_tfidf.mean(axis=0)).ravel()
    feature_names = tfid.get_feature_names_out()
    
    for i, label in enumerate(X_norm_train_tfidf.target_names):
        top5 = np.argsort(average_feature_effects[i])[-5:][::-1]
        if i == 0:
            top = pd.DataFrame(feature_names[top5], columns=[label])
            top_indices = top5
        else:
            top[label] = feature_names[top5]
            top_indices = np.concatenate((top_indices, top5), axis=None)
    top_indices = np.unique(top_indices)
    predictive_words = feature_names[top_indices]

    # plot feature effects
    bar_size = 0.25
    padding = 0.75
    y_locs = np.arange(len(top_indices)) * (4 * bar_size + padding)

    fig, ax = plt.subplots(figsize=(10, 8))
    for i, label in enumerate(target_names):
        ax.barh(
            y_locs + (i - 2) * bar_size,
            average_feature_effects[i, top_indices],
            height=bar_size,
            label=label,
        )
    ax.set(
        yticks=y_locs,
        yticklabels=predictive_words,
        ylim=[
            0 - 4 * bar_size,
            len(top_indices) * (4 * bar_size + padding) - 4 * bar_size,
        ],
    )
    ax.legend(loc="lower right")

    print("top 5 keywords per class:")
    print(top)

    return ax


_ = plot_feature_effects(X_norm_train_tfidf, tfid, ml_model).set_title("Average feature effect on the original data")

AttributeError: coef_ is only available when using a linear kernel

# CLASSIFY AN INPUT 

In [192]:
def main():
    warnings.filterwarnings('ignore')
    # Read input from stdio
#     input = sys.stdin.readline
    nrClassifications = 1 #int(input()) # first line says how many more lines there are

    for i in range(nrClassifications):
        line = input()
        X = []
        stemmer = PorterStemmer() # Reduces the words to their base form
        lemmatizer = WordNetLemmatizer() # Reduces inflection from words - similar to stemming

        processed_line = sent_tokenize(line) # Tokenize the words. Other preprocesses include stop word removal, POS tagging and chunking
    #     processed_line = [stemmer.stem(i) for i in processed_line] # Takes long time, use if needed
        processed_line = [lemmatizer.lemmatize(i) for i in processed_line] 
        X.append(processed_line[0])
        
        # VS CODE STILL HAS ORIGINAL CODE, HOW TO DEAL WITH MISSING FEATURES?
        # Create features from the data
        tfid = TfidfVectorizer(lowercase=False, max_features=nrFeatures, stop_words ="english")  # term frequency, can remove stop words here
        print(tfid)
        
        # Create a vector of zeros with the same number of features as the model's training data
        zero_vector = np.zeros([1, nrFeatures])

        # Get feature values from input and update the corresponding positions in the zero vector
        input_vectors_tfidf = zero_vector.copy()
        features = tfid.fit_transform(X).toarray()
        print(enumerate(features[0]))        
        for i, value in enumerate(features[0]):
            input_vectors_tfidf[1, i] = value
        
#         input_vectors_tfidf = tfid.fit_transform(X).toarray() # This variable has too few features
#         print(tfid.get_feature_names_out())
        
        # Normalize the vectors
        norm_TFIDF = Normalizer(copy=False)
        norm_input_tfidf = norm_TFIDF.fit_transform(input_vectors_tfidf)
        #ADD FUNCTION TO PROCESS DATA, STR->VECTORIZED
        print("Class is: ", ml_model.predict(norm_input_tfidf))
        
main()

brooklyn union gas co bu sets payout qtrly div cts vs cts prior pay may one record april reuter 
TfidfVectorizer(lowercase=False, max_features=100, stop_words='english')
<enumerate object at 0x000002E48D98FE20>


IndexError: index 1 is out of bounds for axis 0 with size 1

NameError: name 'features' is not defined