In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Directory of training files; each file's name (minus extension) is used as the
# label for its lines. Defaults to the Colab location, but can be redirected by
# setting the TRAIN_SET_DIR environment variable.
folder = os.environ.get("TRAIN_SET_DIR") or "/content/train_set"

In [None]:
# Build one (label, text) row per non-empty line of every file in `folder`.
tmp_list = []
# Sort the listing: os.listdir() returns entries in arbitrary order, which would
# make the row order (and therefore any seeded split of df) unreproducible.
for filename in sorted(os.listdir(folder)):
    path = os.path.join(folder, filename)
    # Skip sub-directories (e.g. Jupyter's .ipynb_checkpoints); open() raises on them.
    if not os.path.isfile(path):
        continue
    # The label is the file name without its extension, whatever its length.
    name = os.path.splitext(filename)[0]
    with open(path, encoding="utf8", errors='ignore') as f:
        # Strip each line and keep only the ones that still have content.
        lines = [stripped for stripped in (raw.strip() for raw in f) if stripped]
    tmp_list.extend((name, line) for line in lines)
df = pd.DataFrame(tmp_list, columns=["label", "text"])
df.head()
# Sanity check: blank lines were filtered above, so this index should be empty.
df[df['text'] == ''].index

In [None]:
# Number of rows per label.
df["label"].value_counts()

In [None]:
import spacy
import string 

import spacy
# Load spaCy's small English pipeline once; clean_text() below calls this
# module-level object for every text it processes.
nlp = spacy.load("en_core_web_sm")

def clean_text(text):
    """Lemmatise and lower-case ``text``, dropping short tokens and punctuation.

    The text is run through the module-level spaCy pipeline ``nlp``. Tokens of
    two characters or fewer are discarded; the remaining lemmas are lower-cased,
    and any lemma that is empty or made up solely of ``string.punctuation``
    characters is removed.

    Args:
        text: Raw text to clean.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    doc = nlp(text)
    lemmas = (t.lemma_.lower() for t in doc if len(t) > 2)
    # Test every character of the lemma. A plain ``lemma not in
    # string.punctuation`` is a substring test on the whole lemma, so runs such
    # as "..." or "!!!" would slip through while "<=>" is dropped by accident.
    return " ".join(
        lemma
        for lemma in lemmas
        if not all(ch in string.punctuation for ch in lemma)
    )

# Clean every row's text, preserving row order so the result lines up with df.
cleaned_text = [clean_text(raw_text) for raw_text in df["text"]]

In [None]:
# Load the label mapping that is later applied to df["label"].
# NOTE(review): unpickling can execute arbitrary code — only load this file
# from a trusted source.
mapping = pd.read_pickle('/content/id2label_final.pkl')

In [None]:
# Translate the file-name labels through `mapping`. The result is assigned back
# explicitly: calling `.replace(..., inplace=True)` on `df["label"]` operates on
# a temporary Series under pandas Copy-on-Write and would leave `df` unchanged.
df["label"] = df["label"].replace(mapping)

In [None]:
# Attach the cleaned strings as a new column; cleaned_text was built by
# iterating df row by row, so positions line up.
df['text_clean'] = cleaned_text

In [None]:
# Snapshot the frame (including the cleaned text column) to copy.csv in the
# current working directory.
df.to_csv('copy.csv')

In [None]:
# Turn the cleaned texts into a dense TF-IDF matrix capped at 20000 features.
# NOTE(review): the vectorizer is fitted on every row before any train/test
# split, so held-out documents influence the vocabulary and IDF weights —
# confirm this is intended.
docs = df["text_clean"].astype(str).tolist()
tfidf_vectorizer = TfidfVectorizer(max_features=20000, use_idf=True)
tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(docs)
# `docs` is rebound from the list of strings to the dense feature array.
docs = tfidf_vectorizer_vectors.toarray()

In [None]:
import pickle

# Persist the fitted term -> column-index vocabulary.
# NOTE(review): only `vocabulary_` is saved, not the fitted vectorizer itself —
# confirm whatever reads this file does not also need the IDF weights.
with open('vocabulary.pkl', 'wb') as vocab_file:
    pickle.dump(tfidf_vectorizer.vocabulary_, vocab_file)
    print('dictionary saved successfully to file')

In [None]:
# Feature matrix and target vector used by both models below.
X, y = docs, df["label"]
print(X.shape, y.shape)

In [None]:
import plotly.graph_objects as go

In [None]:
# Bar chart of how many rows carry each label.
label_counts = y.value_counts()
fig = go.Figure(data=[go.Bar(x=label_counts.index, y=label_counts.tolist())])
fig.update_layout(title="Values in each label", xaxis_title="Label", yaxis_title="Values")
fig.show()

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 split, stratified on the label and seeded so it can be repeated.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=148260,
)

In [None]:
# Baseline: logistic regression on the TF-IDF features.
lr = LogisticRegression(random_state=148260)
lr.fit(X_train, y_train)

# Predict on both splits to compare training and held-out accuracy.
y_pred_train, y_pred_test = lr.predict(X_train), lr.predict(X_test)
train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred_test)
print("\nTraining Accuracy score:", train_accuracy)
print("Testing Accuracy score:", test_accuracy)

In [None]:
# Per-class probability estimates from the logistic-regression model for the
# test rows; used for the ROC AUC computation.
y_proba_test = lr.predict_proba(X_test)

In [None]:
from sklearn.metrics import roc_auc_score
# One-vs-rest multiclass ROC AUC of the logistic-regression probabilities.
roc_auc_score(y_true=y_test, y_score=y_proba_test, multi_class="ovr")

In [None]:
import numpy as np
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import class_weight

# One-hot encode the targets. The test frame is re-indexed onto the training
# columns so both share the same class order and width even when a class is
# missing from one of the splits; an explicit float dtype keeps any column added
# by the re-index consistent with the rest.
y_train_ohe = pd.get_dummies(y_train, sparse=True, dtype="float32")
y_test_ohe = pd.get_dummies(y_test, sparse=True, dtype="float32").reindex(
    columns=y_train_ohe.columns, fill_value=0.0
)

# Derive the layer sizes from the data instead of hard-coding them: the TF-IDF
# matrix has fewer columns than `max_features` when the vocabulary is small, and
# the number of classes depends on the labels actually present.
n_features = X_train.shape[1]
n_classes = y_train_ohe.shape[1]

# Two hidden ReLU layers with dropout, softmax over the classes.
model = Sequential()
model.add(Dense(128, activation='relu', input_dim=n_features))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(n_classes, activation='softmax'))

model.compile(loss=tf.keras.losses.CategoricalCrossentropy(), optimizer='adam', metrics=['accuracy'])

# Stop once validation loss has not improved for 10 epochs and roll back to the
# best weights seen.
es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=0, mode='auto', baseline=None, restore_best_weights=True)

model.summary()

# 20% of the training rows are held out by Keras as the validation set that
# early stopping monitors.
history = model.fit(X_train, y_train_ohe, batch_size = 32, epochs = 100, callbacks=[es], validation_split=0.2)

# evaluate() returns [loss, accuracy] given the metrics configured in compile().
score = model.evaluate(X_test, y_test_ohe, verbose=0)

print('Test loss:', score[0])
print('Test accuracy:', score[1])

In [None]:
# Class probabilities for the test rows, one column per training class.
y_pred_ohe = model.predict(X_test)
# argmax yields a column *position*; translate it back to the label that column
# represents in the training one-hot frame, so the predictions are comparable
# with y_test even when the labels are not exactly 0..n_classes-1.
predictions = y_train_ohe.columns.to_numpy()[np.argmax(y_pred_ohe, axis=1)]

In [None]:
import seaborn as sns
from sklearn.metrics import confusion_matrix 
import matplotlib.pyplot as plt 

In [None]:
# Keys of the label mapping, used as axis names for the confusion matrix.
names = [*mapping.keys()]

In [None]:
# Confusion matrix of the neural network's test predictions, drawn as a heatmap.
# NOTE(review): assumes the order of `names` matches the sorted label order that
# confusion_matrix uses for its rows/columns — confirm against `mapping`.
cm = confusion_matrix(y_test, predictions)
cm_matrix = pd.DataFrame(cm, index=names, columns=names)
sns.heatmap(data=cm_matrix, annot=True, fmt='d', cmap='BuPu')
plt.show()

In [None]:
# One-vs-rest multiclass ROC AUC of the neural network's softmax outputs.
roc_auc_score(y_true=y_test, y_score=y_pred_ohe, multi_class="ovr")

In [None]:
# Create the output directory from Python rather than a `!mkdir -p` shell
# escape, so the cell also runs outside IPython and without a POSIX shell.
os.makedirs('saved_model', exist_ok=True)
model.save('saved_model/my_model')