In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Load the pre-filtered corpus and derive the two modelling columns in one chain.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source.
df = (
    pd.read_pickle("df_combo_filtered.pkl")
    .assign(
        # Re-join each row's `tokenized_words` sequence into a single
        # space-separated string for the text vectorizers used later on.
        text=lambda frame: frame["tokenized_words"].apply(" ".join),
        # Integer id per distinct `Code` value, numbered in order of first
        # appearance (pd.factorize marks missing values with -1).
        Code_label=lambda frame: pd.factorize(frame["Code"])[0],
    )
)

## TF-IDF

In [None]:
# Hold out 20% of the documents for evaluation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df["text"],
    df["Code_label"],
    test_size=0.2,
    random_state=42,
)

# Learn the vocabulary and IDF weights from the training texts only, then
# encode the held-out texts with that same fitted vectorizer.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Random forest with default hyper-parameters; fit() returns the fitted
# estimator, so construction and training are chained.
model = RandomForestClassifier(random_state=42).fit(X_train_tfidf, y_train)

y_pred = model.predict(X_test_tfidf)

# Overall accuracy followed by per-class precision / recall / F1.
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))

## Bag of Words (BOW)

In [None]:
# Bag-of-words baseline: random forest on raw term counts.
#
# The split uses the same inputs, test size and seed as the TF-IDF cell, so it
# selects the same rows; it is re-created here so this cell can be re-run on
# its own.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['Code_label'], test_size=0.2, random_state=42
)

# Count features, with the vocabulary capped at the 5000 most frequent terms
# (the TF-IDF cell uses an uncapped vocabulary).
# The matrices get their own `_bow` names: they hold raw counts, not TF-IDF
# weights, and must not overwrite the TF-IDF matrices built earlier.
vectorizer = CountVectorizer(max_features=5000)
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

model = RandomForestClassifier(random_state=42)
model.fit(X_train_bow, y_train)

y_pred = model.predict(X_test_bow)

# Overall accuracy followed by per-class precision / recall / F1.
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

## Neural network for the above encodings

In [None]:
# Train and evaluate a small feed-forward softmax classifier.
#
# NOTE(review): this cell uses names that are neither imported nor defined in
# the visible part of this notebook: `to_categorical`, `Sequential`, `Dense`,
# `Dropout`, `train_labels`, `val_labels`, `num_classes`, `X_val` and
# `label_encoder`. On a fresh kernel it would stop with a NameError on its
# first line — confirm where they come from and define/import them above.
# NOTE(review): in the cells shown earlier, `X_train` is the raw-text split and
# `y_train` holds the integer labels. This cell reads `X_train.shape[1]` and
# overwrites `y_train`, so it presumably expects a 2-D feature matrix here —
# TODO confirm which encoding is meant to be fed to the network.

# Convert the label vectors for the categorical cross-entropy loss below
# (presumably Keras' one-hot `to_categorical` — verify the import).
y_train = to_categorical(train_labels, num_classes=num_classes)
y_val = to_categorical(val_labels, num_classes=num_classes)

# Two hidden ReLU layers (256 and 128 units), each followed by dropout at
# rate 0.3, then a softmax output with one unit per class.
keras_model = Sequential([
    Dense(256, activation="relu", input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(128, activation="relu"),
    Dropout(0.3),
    Dense(num_classes, activation="softmax")
])

keras_model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

# Train for 10 epochs in mini-batches of 32, evaluating on (X_val, y_val)
# as the validation data.
history = keras_model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=32
)

# Predicted class = index of the largest output in each row.
val_predictions = keras_model.predict(X_val)
val_predictions_labels = val_predictions.argmax(axis=1)

# Score against `val_labels` (not the converted `y_val`); class names for the
# report are taken from `label_encoder.classes_`.
print("Accuracy:", accuracy_score(val_labels, val_predictions_labels))
print(classification_report(val_labels, val_predictions_labels, target_names=label_encoder.classes_))