**Importing Packages**

In [25]:
#importing required libaries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re 
import string

import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

#keras
from keras import *
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras.optimizers import  Adam
from keras import regularizers

**Malayalam Dataset**

In [26]:
#reading the Malayalam dataset
train=pd.read_csv('/content/drive/MyDrive/offensive language/Malayalam dataset/Mal_Training_data.tsv',sep='\t', index_col=[0])
test=pd.read_csv('/content/drive/MyDrive/offensive language/Malayalam dataset/mal_test_data_with_labels.tsv',sep='\t', index_col=[0])

# **Removing punctuation**

In [27]:
import string
def remove_punctuations(txt):
    text_nopunc="".join([c for c in txt if c not in string.punctuation])
    return text_nopunc

train['Text']=train['Text'].apply(lambda x: remove_punctuations(x))
train

Unnamed: 0,Text,Category
MA_YT001,Thaankal enthaan cheyyarullath😛,NOT
MA_YT002,Ee theetam WCC feminichigalude news aarkk vena...,OFF
MA_YT003,fukru nem tiktok oolakale vilich charcha nadat...,OFF
MA_YT004,Aashiq abu produce cheytharunnel ee problems u...,NOT
MA_YT005,Pennungal oru team aayal ath moonjum ennu epoo...,OFF
...,...,...
MA_YT3996,Eee parasayam thanne thettanu Ella achanmaraya...,NOT
MA_YT3997,Ente bagathum thetundh ee vazhikke veraan paad...,NOT
MA_YT3998,Kuttiye njan kettikolaam swarnam onnum venda e...,NOT
MA_YT3999,Chumma veettil irunna chakkiye trollanmaarkku ...,NOT


# **Spliting to Dev Dataset**

In [28]:
X_train, X_dev, y_train, y_dev = train_test_split(train['Text'], train['Category'], test_size=0.30, random_state=42)

X_test= test['Text']
y_test= test['Category']

# **Encoding**

In [29]:
Encoder = LabelEncoder()
y_train = Encoder.fit_transform(y_train)
y_test = Encoder.transform(y_test)
y_dev = Encoder.transform(y_dev)

# **Long Short Term Memory(LSTM)**

In [30]:
from keras.preprocessing.text import one_hot

#use onehot in train
voc_size = 1000

train_onehot = [one_hot(words, voc_size)for words in X_train]
dev_onehot = [one_hot(words, voc_size)for words in X_dev]
test_onehot = [one_hot(words, voc_size)for words in X_test]

In [31]:
#performing pad_sequences
from keras.preprocessing.sequence import pad_sequences

sent_length=100
X_train=pad_sequences(train_onehot,padding='pre',maxlen=sent_length)
X_dev=pad_sequences(dev_onehot,padding='pre',maxlen=sent_length)
X_test = pad_sequences(test_onehot,padding='pre',maxlen=sent_length)

In [32]:
dim=40
model=Sequential()

#embedding layer
model.add(Embedding(voc_size,dim,input_length=sent_length))

#input layer
model.add(LSTM(1000, input_shape=(1000,1), return_sequences=False))

#hidded layer
model.add(Dense(500, activation='relu', kernel_regularizer=regularizers.l2(0.01) ))

#output layer
model.add(Dense(5, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy',
              optimizer=Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False),
              metrics=['accuracy'])

#model.compile('adam','mse')

  "The `lr` argument is deprecated, use `learning_rate` instead.")


In [33]:
#summary of LSTM model
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 40)           40000     
_________________________________________________________________
lstm_2 (LSTM)                (None, 1000)              4164000   
_________________________________________________________________
dense_7 (Dense)              (None, 500)               500500    
_________________________________________________________________
dense_8 (Dense)              (None, 5)                 2505      
Total params: 4,707,005
Trainable params: 4,707,005
Non-trainable params: 0
_________________________________________________________________


In [34]:
history = model.fit(X_train, y_train, 
                    batch_size=64,
                    epochs=10, validation_data=(X_dev, y_dev)                 
                    )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [35]:
#classified with test set
y_pred_test_LSTM = model.predict(X_test)

# **Neural Network**

In [36]:
#Simple Neural network
import keras
from keras.models import Sequential
from keras.layers import Dense

# Initialising
NN = Sequential()

# Adding input layer and the first hidden layer
NN.add(Dense(units = len(train.Category.value_counts()), kernel_initializer = 'uniform', activation = 'relu', input_dim = sent_length))

# Adding second hidden layer
NN.add(Dense(units = len(train.Category.value_counts()), kernel_initializer = 'uniform', activation = 'relu'))

# Adding output layer
NN.add(Dense(units = 2, kernel_initializer = 'uniform', activation = 'softmax'))

# Compiling the ANN
NN.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])

# Fitting the ANN to the Training set
NN.fit(X_train, y_train, batch_size =50 , epochs = 10)

# Predicting the Test set results
y_pred = NN.predict(X_test)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
