In [None]:
import numpy as np
import pandas as pd 

import matplotlib as mpl 
import matplotlib.cm as cm 
import matplotlib.pyplot as plt

import plotly.graph_objects as go
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import stop_words
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer 

import string
import re

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, mean_squared_error, log_loss

import sklearn.metrics as metrics

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import classification_report
from sklearn import metrics

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, SimpleRNN


from time import time

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the training CSV from the Kaggle input directory, decoding it as
# ISO-8859-1. Later cells read its 'label' and 'text' columns.
data = pd.read_csv('/kaggle/input/dataset-3-class/TrainingSet_3_Class.csv', encoding = 'ISO-8859-1')

In [None]:
# Number of rows per distinct value of the 'label' column (class balance check).
data["label"].value_counts()

In [None]:
# (rows, columns) of the loaded frame.
data.shape

In [None]:
# Bar chart of how many rows carry each label.
# The counts are computed once and reused for both axes so x and y always come
# from the same Series.
label_counts = data['label'].value_counts()
fig = go.Figure([go.Bar(x=label_counts.index, y=label_counts.tolist())])
fig.update_layout(
    title="Chart Evaluation",
    xaxis_title="label",
    # The bars show row counts per label, so the axis is titled accordingly.
    yaxis_title="count")
fig.show()

In [None]:
# Preview the first rows of the raw data before any cleaning is applied.
data.head()

### Text Cleaning

In [None]:
# spaCy's small English pipeline; `clean` below uses its named-entity annotations.
import spacy
nlp = spacy.load('en_core_web_sm')

# NOTE(review): `stop_words` is the `sklearn.feature_extraction.stop_words`
# module imported in the first cell; newer scikit-learn releases expose the same
# set as `sklearn.feature_extraction.text.ENGLISH_STOP_WORDS` — confirm which one
# the installed version provides.
stopwords = stop_words.ENGLISH_STOP_WORDS
# NOTE(review): presumably needs the NLTK WordNet corpus to be available at
# lemmatization time — verify in the target environment.
lemmatizer = WordNetLemmatizer()

def clean(doc):
    """Normalize one raw document into a space-separated string of lemmas.

    Steps, in order: strip HTML line-break tags, drop every token that spaCy
    tags as part of a named entity, lowercase, replace hyphens with spaces,
    remove punctuation and digits, drop English stop words, and lemmatize each
    remaining word.

    Relies on the module-level ``nlp`` (spaCy pipeline), ``stopwords`` and
    ``lemmatizer`` objects.

    Parameters
    ----------
    doc : str
        Raw text. Non-string values (e.g. NaN from a missing CSV field) are
        treated as empty text.

    Returns
    -------
    str
        The cleaned text; ``""`` when nothing survives the filters.
    """
    if not isinstance(doc, str):
        return ""

    # Strip break tags first: once the text has been tokenized and the tokens
    # re-joined with spaces, a literal "</br>" may no longer be contiguous.
    doc = re.sub(r"<\s*/?\s*br\s*/?\s*>", " ", doc, flags=re.IGNORECASE)

    # `ent_type_` is non-empty for every token inside an entity span, so
    # multi-word entities ("New York") are removed as well; comparing a token's
    # text with whole-entity text only ever matches single-token entities.
    doc = " ".join(token.text for token in nlp(doc) if not token.ent_type_)

    doc = doc.lower().strip()
    doc = doc.replace("-", " ")
    doc = "".join(char for char in doc if char not in string.punctuation and not char.isdigit())
    words = [token for token in doc.split() if token not in stopwords]

    # Lemmatize word by word and keep the separators; iterating over the string
    # itself would hand the lemmatizer single characters.
    return " ".join(lemmatizer.lemmatize(word) for word in words)

In [None]:
# Run every document through `clean`, overwriting the raw text column, then
# preview the result.
data['text'] = data['text'].map(clean)
data.head()

In [None]:
# TF-IDF features over the cleaned corpus, with the vocabulary capped at
# 20,000 terms.
# NOTE(review): the vectorizer is fitted on all rows before the train/test
# split further down, so the IDF weights also see the test documents — consider
# fitting on the training rows only.
docs = data['text'].tolist()
tfidf_vectorizer = TfidfVectorizer(max_features=20000, use_idf=True)
tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(docs)
# Densify the sparse result; from here on `docs` is a 2-D array, not a list of strings.
docs = tfidf_vectorizer_vectors.toarray()

In [None]:
# Feature matrix (dense TF-IDF array) and the label column as the target.
X, y = docs, data['label']
print(X.shape, y.shape)

In [None]:
SEED = 123

# 80/20 split, stratified on the labels and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=SEED,
)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

In [None]:
# Hyper-parameters consumed by the Embedding layers of the networks below.
vocab_size = 10_000   # number of rows in the embedding table
embedding_dim = 100   # width of each embedding vector
max_length = 100      # sequence length passed as `input_length`

In [None]:
def create_cnn_model(num_classes=3):
    """Build and compile a 1-D convolutional text classifier.

    Architecture: Embedding -> Conv1D(128 filters, width 5) -> global max
    pooling -> Dense(10, relu) -> softmax over ``num_classes``.

    Parameters
    ----------
    num_classes : int, default 3
        Number of target classes; sets the width of the softmax output layer.
        The default matches the 3-class training file loaded at the top of
        the notebook.

    Returns
    -------
    Sequential
        Model compiled with Adam and sparse categorical cross-entropy. It
        expects padded integer token ids as input and integer class ids
        (0 .. num_classes - 1) as targets.
    """
    model = Sequential([
        Embedding(vocab_size, embedding_dim, input_length=max_length),
        Conv1D(128, 5, activation='relu'),
        GlobalMaxPooling1D(),
        Dense(10, activation='relu'),
        # One probability per class; a single sigmoid unit can only separate two.
        Dense(num_classes, activation='softmax')
    ])
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


# An Embedding layer looks up integer token ids, whereas X_train / X_test hold
# TF-IDF float weights. The network therefore gets its own padded id sequences,
# built from the cleaned text of the same rows (y_train / y_test keep the
# DataFrame index through train_test_split).
cnn_train_texts = data.loc[y_train.index, 'text']
cnn_test_texts = data.loc[y_test.index, 'text']

cnn_tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
cnn_tokenizer.fit_on_texts(cnn_train_texts)  # vocabulary from the training rows only

X_train_cnn = pad_sequences(cnn_tokenizer.texts_to_sequences(cnn_train_texts), maxlen=max_length)
X_test_cnn = pad_sequences(cnn_tokenizer.texts_to_sequences(cnn_test_texts), maxlen=max_length)

# Map the raw labels to consecutive integer ids, as required by the sparse loss.
cnn_class_ids = {label: idx for idx, label in enumerate(sorted(y.unique()))}
y_train_cnn = y_train.map(cnn_class_ids).to_numpy()
y_test_cnn = y_test.map(cnn_class_ids).to_numpy()

cnn_model = create_cnn_model(num_classes=len(cnn_class_ids))
cnn_model.summary()

cnn_model.fit(X_train_cnn, y_train_cnn, epochs=3, batch_size=32, validation_data=(X_test_cnn, y_test_cnn))

cnn_loss, cnn_accuracy = cnn_model.evaluate(X_test_cnn, y_test_cnn)
# Predicted class = index of the largest softmax probability.
y_pred_cnn = cnn_model.predict(X_test_cnn).argmax(axis=1)
# MSE is computed on the integer class ids; it is only meaningful if the sorted
# label order is ordinal.
cnn_mse = mean_squared_error(y_test_cnn, y_pred_cnn)

print(f"CNN Accuracy: {cnn_accuracy}")
print(f"CNN Loss: {cnn_loss}")
print(f"CNN MSE: {cnn_mse}")

In [None]:
def create_rnn_model(num_classes=3):
    """Build and compile a simple recurrent text classifier.

    Architecture: Embedding -> SimpleRNN(128) -> Dense(10, relu) -> softmax
    over ``num_classes``.

    Parameters
    ----------
    num_classes : int, default 3
        Number of target classes; sets the width of the softmax output layer.
        The default matches the 3-class training file loaded at the top of
        the notebook.

    Returns
    -------
    Sequential
        Model compiled with Adam and sparse categorical cross-entropy. It
        expects padded integer token ids as input and integer class ids
        (0 .. num_classes - 1) as targets.
    """
    model = Sequential([
        Embedding(vocab_size, embedding_dim, input_length=max_length),
        SimpleRNN(128),
        Dense(10, activation='relu'),
        # One probability per class; a single sigmoid unit can only separate two.
        Dense(num_classes, activation='softmax')
    ])
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


# An Embedding layer looks up integer token ids, whereas X_train / X_test hold
# TF-IDF float weights. The network therefore gets its own padded id sequences,
# built from the cleaned text of the same rows (y_train / y_test keep the
# DataFrame index through train_test_split).
rnn_train_texts = data.loc[y_train.index, 'text']
rnn_test_texts = data.loc[y_test.index, 'text']

rnn_tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
rnn_tokenizer.fit_on_texts(rnn_train_texts)  # vocabulary from the training rows only

# Default 'pre' padding keeps the real tokens at the end of each sequence,
# which is where the recurrent layer's final state is read from.
X_train_rnn = pad_sequences(rnn_tokenizer.texts_to_sequences(rnn_train_texts), maxlen=max_length)
X_test_rnn = pad_sequences(rnn_tokenizer.texts_to_sequences(rnn_test_texts), maxlen=max_length)

# Map the raw labels to consecutive integer ids, as required by the sparse loss.
rnn_class_ids = {label: idx for idx, label in enumerate(sorted(y.unique()))}
y_train_rnn = y_train.map(rnn_class_ids).to_numpy()
y_test_rnn = y_test.map(rnn_class_ids).to_numpy()

rnn_model = create_rnn_model(num_classes=len(rnn_class_ids))
rnn_model.summary()

rnn_model.fit(X_train_rnn, y_train_rnn, epochs=3, batch_size=32, validation_data=(X_test_rnn, y_test_rnn))

rnn_loss, rnn_accuracy = rnn_model.evaluate(X_test_rnn, y_test_rnn)
# Predicted class = index of the largest softmax probability.
y_pred_rnn = rnn_model.predict(X_test_rnn).argmax(axis=1)
# MSE is computed on the integer class ids; it is only meaningful if the sorted
# label order is ordinal.
rnn_mse = mean_squared_error(y_test_rnn, y_pred_rnn)

print(f"RNN Accuracy: {rnn_accuracy}")
print(f"RNN Loss: {rnn_loss}")
print(f"RNN MSE: {rnn_mse}")