In [1]:
import pandas as pd
import numpy as np
import re
import string
import pickle
from collections import Counter, OrderedDict
pd.options.mode.chained_assignment = None

#import stanza
#nlp = stanza.Pipeline(processors='tokenize,lemma',lang="hi")

In [None]:
# Colab-specific: prompt for file uploads (presumably the CSV and stop-word list
# that the cells below read from the working directory — TODO confirm).
# `uploaded` holds whatever files.upload() returns for the selected files.
from google.colab import files
uploaded = files.upload()

In [None]:
# Dataset import: load the raw comments into `df`. The preprocessing function
# below reads and rewrites a text column named `Post`, so the CSV must have one.
df = pd.read_csv('CommentsDataset.csv')

In [None]:
# generating stopwords
def gen_stopword(path='hindi_stopwords.txt'):
    """Load the Hindi stop-word list, one word per line.

    The file is a single-column listing whose first line is the header
    ``Stopwords``; that header is skipped when present.  The file is read
    directly instead of through ``pd.read_csv(..., sep='\\n')`` because
    pandas >= 1.3 rejects a newline separator with a ValueError.

    Parameters
    ----------
    path : str, optional
        Location of the stop-word file (defaults to the original hard-coded
        ``hindi_stopwords.txt``).

    Returns
    -------
    list of str
        Stop words with surrounding whitespace stripped, blank lines dropped,
        in file order.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    # utf-8-sig transparently drops a leading BOM if the file has one.
    with open(path, encoding='utf-8-sig') as handle:
        words = [line.strip() for line in handle]
    words = [word for word in words if word]
    # Skip the column header that the CSV-style file starts with.
    if words and words[0] == 'Stopwords':
        words = words[1:]
    return words

# lemmatization function
def hi_lemma(w):
    """Return the lemma of a single Hindi word, or the word itself on failure.

    Relies on a module-level ``nlp`` pipeline (the stanza pipeline whose
    creation is commented out at the top of the notebook).  Lemmatisation is
    best-effort: if ``nlp`` is missing, raises, yields no words, or yields an
    empty/None lemma, the input word is returned unchanged so the caller can
    always join the results into a sentence.

    Parameters
    ----------
    w : str
        A single token.

    Returns
    -------
    str
        The lemma of the first word found in ``w``, or ``w`` as a fallback.
    """
    try:
        doc = nlp(w)
        lemmas = [word.lemma for sent in doc.sentences for word in sent.words]
    except Exception:
        # Deliberate best-effort fallback; unlike a bare `except:` this still
        # lets KeyboardInterrupt / SystemExit stop a long-running cell.
        return w
    # A missing lemma (None or '') would poison the later ' '.join, so fall back.
    if lemmas and lemmas[0]:
        return lemmas[0]
    return w

def _clean_post(text):
    """Reduce one raw comment to Devanagari words separated by whitespace."""
    # Missing values (NaN / None) are treated as an empty comment instead of
    # crashing inside re.sub.
    if not isinstance(text, str):
        return ''
    # removing url links
    text = re.sub(r'https?:\/\/\S+', '', text)
    text = re.sub(r"www\.[a-z]?\.?(com)+|[a-z]+\.(com)", '', text)
    # removing @mention (\w also matches Devanagari letters, so this must run
    # before the whitelist below)
    text = re.sub(r'@[\w]*', '', text)
    # removing all remaining characters that aren't hindi devanagari characters
    # or white space; this also drops punctuation, digits, Latin text and
    # emojis, so no separate special-character or emoji pass is required
    text = re.sub(r"[^ऀ-ॿ\s]", '', text)
    # The danda lies inside the Devanagari block and survives the whitelist.
    # It and newlines act as separators, so they become a space rather than
    # being deleted (deleting them glued the neighbouring words together).
    text = re.sub(r"[\n।]", ' ', text)
    return text


def data_pre_processing(df_clean):
    """Clean, tokenise and lemmatise the ``Post`` column of a comments frame.

    The frame is modified in place and also returned.  Steps: strip URLs and
    @mentions, keep only Devanagari characters, drop stop words, split on
    whitespace, lemmatise each token with ``hi_lemma`` and remove repeated
    lemmas while keeping first-seen order.

    Parameters
    ----------
    df_clean : pandas.DataFrame
        Must contain a ``Post`` column of raw comment text.

    Returns
    -------
    pandas.DataFrame
        The same frame with ``Post`` replaced by the cleaned text and three
        added columns: ``token`` (list of words), ``lemma_token`` (list of
        de-duplicated lemmas) and ``sentence`` (lemmas joined by a space).
    """
    # Set membership keeps the stop-word filter O(1) per word.
    stopwords = frozenset(gen_stopword())

    cleaned = df_clean['Post'].apply(_clean_post)

    # removing stopwords + tokenization
    tokens = cleaned.apply(
        lambda text: [word for word in text.split() if word not in stopwords]
    )
    df_clean['Post'] = tokens.apply(' '.join)
    df_clean['token'] = tokens

    # lemmatization, then remove repeated tokens (dict keys keep first-seen order)
    df_clean['lemma_token'] = tokens.apply(
        lambda words: list(OrderedDict.fromkeys(hi_lemma(word) for word in words))
    )

    df_clean['sentence'] = df_clean['lemma_token'].apply(' '.join)

    return df_clean

# Run the cleaning pipeline. data_pre_processing mutates the frame it is given
# and returns it, so `df` now carries the cleaned `Post` plus the `token`,
# `lemma_token` and `sentence` columns. Re-running this cell feeds the already
# cleaned text through the pipeline again.
df = data_pre_processing(df)
df

In [3]:
# Load the cached, already-preprocessed dataset. This rebinds `df`, replacing
# whatever the preprocessing cell above produced.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files
# from a trusted source.
df = pd.read_pickle('./Toxic Comment Classification/preprocessed_dataset.pkl')
df

Unnamed: 0,sentence,token,mild,moderate,severe
0,मैं देश हिंदु निराला पक्का राम भक्त बाबर साला ...,"[मैं, देश, हिंदु, निराला, पक्का, राम, भक्त, बा...",0,1,0
1,सरकार हमेशा किसान कमाई बढ़ा नईनई स्कीम ला रह त...,"[सरकार, हमेशा, किसान, कमाई, बढ़ा, नईनई, स्कीम,...",0,0,0
2,सुशांत बिजनेस डील जून वह दीपेश हत्या दिन क्यों...,"[सुशांत, बिजनेस, डील, जून, वह, दीपेश, हत्या, द...",0,0,1
3,साला जेएनयू छाप कमिना लोग हिन्दु संविधान सब बर...,"[साला, जेएनयू, छाप, कमिना, लोग, हिन्दु, संविधा...",0,1,0
4,अनलॉक गाइडलाइन्स जारी सितंबर देशभर मेट्रो सेवा...,"[अनलॉक, गाइडलाइन्स, जारी, सितंबर, देशभर, मेट्र...",0,0,0
...,...,...,...,...,...
7439,सोनू सूद प्रेरणा ले आदिवासी वह सरकार सक पूरा स...,"[सोनू, सूद, प्रेरणा, ले, आदिवासी, वह, सरकार, स...",0,0,0
7440,उमर खालिद लगा कपिल मिश्रा शान्ति नोबल पुरस्कार...,"[उमर, खालिद, लगा, कपिल, मिश्रा, शान्ति, नोबल, ...",1,0,0
7441,पप्पू कमीना,"[पप्पू, कमीना]",0,1,0
7442,मदर टेरेसा जीवन पीड़ित सेवा अर्पित सौभाग्यशाली...,"[मदर, टेरेसा, जीवन, पीड़ित, सेवा, अर्पित, सौभा...",0,0,0


In [4]:
# Features: the cleaned sentence text of each comment.
X = df.sentence
# Targets: every column left after dropping the two text columns (the frame
# displayed above shows these as mild / moderate / severe).
y = df.drop(['sentence','token'],axis=1)

In [5]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable. The vectorizer cells below overwrite X_train / X_test, so
# re-run this cell to get the raw text back.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
def gen_stopword(path='hindi_stopwords.txt'):
    """Load the Hindi stop-word list, one word per line.

    The file is a single-column listing whose first line is the header
    ``Stopwords``; that header is skipped when present.  The file is read
    directly instead of through ``pd.read_csv(..., sep='\\n')`` because
    pandas >= 1.3 rejects a newline separator with a ValueError.

    Parameters
    ----------
    path : str, optional
        Location of the stop-word file (defaults to the original hard-coded
        ``hindi_stopwords.txt``).

    Returns
    -------
    list of str
        Stop words with surrounding whitespace stripped, blank lines dropped,
        in file order.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    # utf-8-sig transparently drops a leading BOM if the file has one.
    with open(path, encoding='utf-8-sig') as handle:
        words = [line.strip() for line in handle]
    words = [word for word in words if word]
    # Skip the column header that the CSV-style file starts with.
    if words and words[0] == 'Stopwords':
        words = words[1:]
    return words
def my_tokenizer(s):
    """Split a pre-cleaned sentence into word tokens on whitespace.

    Uses ``str.split()`` with no argument so that an empty sentence yields no
    tokens and repeated spaces do not produce empty-string tokens (which
    ``split(' ')`` did, adding a spurious ``''`` term to the vocabulary).

    Parameters
    ----------
    s : str
        A sentence whose words are separated by whitespace.

    Returns
    -------
    list of str
        The non-empty words of ``s`` in order.
    """
    return s.split()
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF over unigrams and bigrams that occur in at least two training comments.
# The input is already-decoded str, so no `encoding` is passed: the previous
# value 'ISCII' is not a codec Python knows and would fail on any bytes input.
tfidf = TfidfVectorizer(min_df=2,ngram_range=(1,2),tokenizer=my_tokenizer,stop_words=gen_stopword())
# NOTE: X_train / X_test are overwritten with feature matrices here; cells that
# need the raw text (counter_word, the Keras Tokenizer) require the split to be
# re-run first. X_train is densified, X_test stays sparse, as before.
X_train = tfidf.fit_transform(X_train).toarray()
# Persist the fitted vectorizer. pickle.dump needs a writable binary file
# object; the earlier call passed an empty string and raised TypeError.
with open('tfidf.pkl', 'wb') as tfidf_file:
    pickle.dump(tfidf, tfidf_file, protocol=pickle.HIGHEST_PROTOCOL)
X_test = tfidf.transform(X_test)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Raw term counts over unigrams and bigrams seen in at least two training
# comments (an alternative to the TF-IDF features). No `encoding` is passed:
# the input is str, and the previous value 'ISCII' is not a Python codec, so
# it would raise on any bytes input.
vectorizer = CountVectorizer(min_df=2,ngram_range=(1,2),tokenizer=my_tokenizer,stop_words=gen_stopword())
# NOTE: expects X_train / X_test to still be the raw sentence text; re-run the
# train/test split first if another vectorizer cell has already replaced them.
X_train = vectorizer.fit_transform(X_train).toarray()
X_test = vectorizer.transform(X_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [None]:
# One binary random-forest evaluation per toxicity label.
categories = ['mild','moderate','severe']
Rf = RandomForestClassifier(n_estimators=100, random_state=42)
for l in categories:
    # fit() returns the estimator itself, so train and predict in one chain;
    # each fit() call rebuilds the forest for the current label.
    y_pred = Rf.fit(X_train, y_train[l]).predict(X_test)
    print(classification_report(y_test[l],y_pred))

In [None]:
def counter_word(text):
    """Count how often each whitespace-separated word occurs across all sentences.

    Parameters
    ----------
    text : object exposing ``.values``
        Iterating ``text.values`` must yield the sentences as strings.

    Returns
    -------
    collections.Counter
        Mapping of word -> number of occurrences, in first-seen order.
    """
    tally = Counter()
    for sentence in text.values:
        # Counter.update adds one per occurrence of each word in the list.
        tally.update(sentence.split())
    return tally
# Word frequencies over the training text. X_train must still be the raw
# sentence Series here: counter_word reads `.values` and calls str.split on
# each element.
counter = counter_word(X_train)
# Number of distinct words in the training text; used below as the tokenizer's
# vocabulary cap and the embedding's input dimension.
num_words = len(counter)
# Fixed length (in tokens) that every comment is padded or truncated to.
max_length = 80

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

In [None]:
# Build the word -> integer index from the training sentences only, with the
# vocabulary capped at the number of distinct training words counted above.
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(X_train)

In [None]:
# Word -> index mapping learned by the tokenizer (not used again in the cells shown here).
word_index = tokenizer.word_index
# Encode each training sentence as a sequence of word indices.
train_sequences = tokenizer.texts_to_sequences(X_train)
# Force every sequence to max_length: with "pre" for both options, short
# sequences are padded at the front and long ones lose their leading tokens.
# This overwrites X_train with the padded index array.
X_train = pad_sequences(train_sequences, maxlen=max_length, padding="pre", truncating="pre")

In [None]:
# Encode the held-out sentences with the tokenizer fitted on the training text.
test_sequences = tokenizer.texts_to_sequences(X_test)
# Same length and "pre" padding/truncation settings as the training sequences;
# this overwrites X_test with the padded index array.
X_test = pad_sequences(test_sequences, maxlen=max_length, padding="pre", truncating="pre")

In [None]:
def build_lstm_model():
    """Return a new, compiled Embedding -> LSTM -> sigmoid binary classifier.

    Reads the notebook-level ``num_words`` and ``max_length`` for the
    embedding's vocabulary size and input length.
    """
    net = Sequential()
    net.add(Embedding(num_words, 32, input_length=max_length))
    net.add(LSTM(64))
    net.add(Dense(1, activation="sigmoid"))
    net.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
    return net


categories = ['mild','moderate','severe']
for l in categories:
    # Build a fresh network for every label. Re-using one model instance kept
    # the weights learned for the previous label, so the 'moderate' and
    # 'severe' scores came from a network already trained on other targets.
    model = build_lstm_model()
    history = model.fit(X_train, y_train[l], epochs=20, batch_size=32)
    y_pred = model.predict(X_test)
    # predict() output is indexed as (rows, 1) below; take the single column
    # and round the sigmoid scores to hard 0/1 labels.
    y_pred = y_pred[:,0]
    y_pred = np.round(y_pred)
    print(classification_report(y_test[l],y_pred))

In [None]:
# Persist the fitted Keras tokenizer. The context manager guarantees the file
# is flushed and closed, which the bare open(...) call never did.
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)