## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import re
import string
import pickle
from collections import Counter, OrderedDict
pd.options.mode.chained_assignment = None
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
import warnings
warnings.filterwarnings('ignore')

import stanfordnlp
# Build the Hindi pipeline once at start-up with only the tokenizer and lemmatizer
# processors; hi_lemma() below calls this object for every token.
# NOTE(review): the load log printed under this cell shows the models being read from a
# local stanfordnlp_resources folder, so they presumably must be downloaded beforehand — confirm.
nlp = stanfordnlp.Pipeline(processors='tokenize,lemma',lang="hi")

Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': 'C:\\Users\\SHUBHAM\\stanfordnlp_resources\\hi_hdtb_models\\hi_hdtb_tokenizer.pt', 'lang': 'hi', 'shorthand': 'hi_hdtb', 'mode': 'predict'}
---
Loading: lemma
With settings: 
{'model_path': 'C:\\Users\\SHUBHAM\\stanfordnlp_resources\\hi_hdtb_models\\hi_hdtb_lemmatizer.pt', 'lang': 'hi', 'shorthand': 'hi_hdtb', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
[Running seq2seq lemmatizer with edit classifier]
Done loading processors!
---


## Loading Vectorizers and Models

In [24]:
# TF-IDF
def my_tokenizer(s):
    """Split ``s`` on single space characters and return the pieces as a list.

    Consecutive spaces produce empty-string pieces, and an empty input yields
    ``['']``, because the split is on the literal ``' '`` separator.

    NOTE(review): presumably the pickled TF-IDF vectorizer loaded right after this
    definition refers to this function by name, which would require it to exist
    before unpickling — confirm against the training notebook.
    """
    pieces = s.split(' ')
    return pieces
# Helper so every pickled artifact is read through a context manager; the previous
# `pickle.load(open(...))` form never closed its file handles.
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``.

    SECURITY: unpickling can execute arbitrary code, so only load artifacts that
    come from a trusted source.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)

tfidf = _load_pickle('tfidf.pkl')

# Count Vectorizer
vectorizer = _load_pickle('../vectorizer.pkl')

# Random Forest
Rft = _load_pickle('../RandomForestT.sav')
Rfc = _load_pickle('../RandomForestC.sav')

# Tokenizer
tokenizer = _load_pickle('tokenizer.pkl')

# LSTM
model = load_model('LSTM.h5')

## Data Pre-Processing

In [28]:
# generating stopwords
def gen_stopword(path='hindi_stopwords.txt'):
    """Load the Hindi stopword list from a one-word-per-line text file.

    The file is read directly instead of through ``pd.read_csv(sep='\\n')``:
    recent pandas releases reject a newline separator, and pandas' NA inference
    could turn an entry into NaN, on which ``.strip()`` would fail.

    Args:
        path: Location of the stopword file. Defaults to the file name the
            notebook has always used, so existing calls keep working.

    Returns:
        list of str: The stopwords in file order, stripped of surrounding
        whitespace, with blank lines dropped. A leading ``Stopwords`` header
        line (the column name the previous implementation required) is skipped.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Explicit encoding: the platform default is not UTF-8 everywhere, and
    # 'utf-8-sig' additionally tolerates a byte-order mark at the start of the file.
    with open(path, encoding='utf-8-sig') as handle:
        words = [line.strip() for line in handle]
    words = [word for word in words if word]
    if words and words[0] == 'Stopwords':
        words = words[1:]
    return words
# lemmatization function
def hi_lemma(w):
    """Return the lemma of the first word the Hindi pipeline finds in ``w``.

    This is a best-effort helper: whenever lemmatization fails or produces
    nothing usable, the input is returned unchanged so the caller always gets
    a token back.

    Args:
        w: A single token to lemmatize. If the pipeline splits it into several
            words, only the first word's lemma is used.

    Returns:
        The first lemma, or ``w`` itself when the pipeline raises, yields no
        words, or reports no lemma for the first word.
    """
    try:
        doc = nlp(w)
        lemmas = [word.lemma for sent in doc.sentences for word in sent.words]
    except Exception:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit still stop a long-running cell.
        return w
    # An empty result used to be handled implicitly via IndexError; a None lemma
    # would break the later ' '.join over the tokens, so fall back to the input.
    if not lemmas or lemmas[0] is None:
        return w
    return lemmas[0]
arr = []  # trace of the intermediate results of the most recent preprocess() call

# The patterns below are built once at definition time instead of on every call.
_HTTP_URL_RE = re.compile(r'http\S+')
_WWW_URL_RE = re.compile(r'www\.[a-z]?\.?(com)+|[a-z]+\.(com)')
_TAB_CR_RE = re.compile(r"[\t\r]+")
_MENTION_RE = re.compile(r'@[\w]*')
# Explicit, escaped character class. The previous comma-separated class formed the
# range `,-,`, so '-' was never matched, and ']' and '\' were missing. \u0964 is the
# Devanagari danda, written as an escape so it does not depend on the file encoding.
_SPECIAL_CHARS_RE = re.compile(r"[`'~!@#$%^&*()_\-+={}\[\]|\\:;\"<>,.?/\n\u0964]")
_EMOJI_RE = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U00002500-\U00002BEF"  # box drawing through miscellaneous symbols and arrows
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2-\U0001F251"
    "\U0001f926-\U0001f937"
    "\U00010000-\U0010ffff"  # every supplementary-plane code point
    "\u2640-\u2642"
    "\u2600-\u2B55"
    "\u200d"  # zero width joiner
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\ufe0f"  # variation selector-16
    "\u3030"
    "]+",
    flags=re.UNICODE,
)
# Anything outside the Devanagari block (U+0900-U+097F) that is not whitespace.
# Escapes are used so the range stays correct regardless of how the file is decoded.
_NON_DEVANAGARI_RE = re.compile(r"[^\u0900-\u097F\s]")


def preprocess(text):
    """Clean one raw Hindi text and return it as a space-joined sentence of lemmas.

    Stages, in order: strip URLs, strip tabs/carriage returns, strip @mentions,
    strip punctuation, strip emojis, strip every remaining non-Devanagari
    character, drop stopwords, tokenize, lemmatize, de-duplicate tokens
    (keeping first occurrences), and join the tokens with single spaces.

    Side effect: the module-level list ``arr`` is reset at the start of each call
    and then receives the result of every stage (11 entries), so it always
    describes the most recent call instead of growing across calls.

    Args:
        text: The raw input string.

    Returns:
        str: The cleaned sentence; empty when nothing survives the filters.
    """
    arr.clear()

    # removing url links
    text = _HTTP_URL_RE.sub('', text)
    text = _WWW_URL_RE.sub('', text)
    arr.append(text)

    # removing tabs and carriage returns (newlines go with the special characters)
    text = _TAB_CR_RE.sub('', text)
    arr.append(text)

    # removing @mention
    text = _MENTION_RE.sub('', text)
    arr.append(text)

    # removing all special characters; the extra characters this class now covers
    # are non-Devanagari, so the filter two stages below removed them anyway
    text = _SPECIAL_CHARS_RE.sub('', text)
    arr.append(text)

    # removing emojis
    text = _EMOJI_RE.sub(r'', text)
    arr.append(text)

    # removing all remaining characters that aren't hindi devanagari characters or white space
    text = _NON_DEVANAGARI_RE.sub('', text)
    arr.append(text)

    # removing stopwords (a set gives constant-time membership checks)
    stopwords = set(gen_stopword())
    text = ' '.join(word for word in text.split() if word not in stopwords)
    arr.append(text)

    # tokenization: the string above is single-space separated, so split() gives the
    # same tokens as split(' ') except that an empty string yields no tokens at all
    # instead of one empty token being sent to the lemmatizer
    tokens = text.split()
    arr.append(tokens)

    # lemmatization
    tokens = [hi_lemma(token) for token in tokens]
    arr.append(tokens)

    # remove repeated tokens, keeping the first occurrence of each
    tokens = list(OrderedDict.fromkeys(tokens))
    arr.append(tokens)

    # generating clean sentence
    sentence = ' '.join(tokens)
    arr.append(sentence)

    return sentence

In [29]:
# Sample input containing a mention, a hashtag, punctuation, emojis and a URL.
# The raw string gets its own name so that `text` is only ever the one-element list
# that the vectorizer / tokenizer cells below consume.
# NOTE(review): the Devanagari and emoji characters in this literal look mis-decoded
# in this copy of the notebook — confirm the file is stored and read as UTF-8.
raw_text = """@priya1 рдкреНрд░рд┐рдпрд╛ рдмрдбреА рдмреЗрд╡рдХреВрдл рдФрд░ рдмрджреНрд╕реВрд░рдд рд╣реИ #ugly!!!, ЁЯдУЁЯШДЁЯШЖЁЯШВЁЯдгЁЯШВЁЯдгЁЯШВЁЯдг
https://t.co/BFRtZXCXrp"""
print("Original Text: ",raw_text,end='\n\n')
text = [preprocess(raw_text)]
# `arr` is filled by preprocess() with the result of each cleaning stage.
for stage_output in arr:
    print(stage_output,end='\n\n')

Original Text:  @priya1 рдкреНрд░рд┐рдпрд╛ рдмрдбреА рдмреЗрд╡рдХреВрдл рдФрд░ рдмрджреНрд╕реВрд░рдд рд╣реИ #ugly!!!, ЁЯдУЁЯШДЁЯШЖЁЯШВЁЯдгЁЯШВЁЯдгЁЯШВЁЯдг
https://t.co/BFRtZXCXrp

@priya1 рдкреНрд░рд┐рдпрд╛ рдмрдбреА рдмреЗрд╡рдХреВрдл рдФрд░ рдмрджреНрд╕реВрд░рдд рд╣реИ #ugly!!!, ЁЯдУЁЯШДЁЯШЖЁЯШВЁЯдгЁЯШВЁЯдгЁЯШВЁЯдг


@priya1 рдкреНрд░рд┐рдпрд╛ рдмрдбреА рдмреЗрд╡рдХреВрдл рдФрд░ рдмрджреНрд╕реВрд░рдд рд╣реИ #ugly!!!, ЁЯдУЁЯШДЁЯШЖЁЯШВЁЯдгЁЯШВЁЯдгЁЯШВЁЯдг


 рдкреНрд░рд┐рдпрд╛ рдмрдбреА рдмреЗрд╡рдХреВрдл рдФрд░ рдмрджреНрд╕реВрд░рдд рд╣реИ #ugly!!!, ЁЯдУЁЯШДЁЯШЖЁЯШВЁЯдгЁЯШВЁЯдгЁЯШВЁЯдг


 рдкреНрд░рд┐рдпрд╛ рдмрдбреА рдмреЗрд╡рдХреВрдл рдФрд░ рдмрджреНрд╕реВрд░рдд рд╣реИ ugly ЁЯдУЁЯШДЁЯШЖЁЯШВЁЯдгЁЯШВЁЯдгЁЯШВЁЯдг

 рдкреНрд░рд┐рдпрд╛ рдмрдбреА рдмреЗрд╡рдХреВрдл рдФрд░ рдмрджреНрд╕реВрд░рдд рд╣реИ ugly 

 рдкреНрд░рд┐рдпрд╛ рдмрдбреА рдмреЗрд╡рдХреВрдл рдФрд░ рдмрджреНрд╕реВрд░рдд рд╣реИ  

рдкреНрд░рд┐рдпрд╛ рдмрдбреА рдмреЗрд╡рдХреВрдл рдмрджреНрд╕реВрд░рдд

['рдкреНрд░рд┐рдпрд╛', 'р

In [30]:
# Encode the cleaned sentence (`text` is a one-element list) with the TF-IDF
# vectorizer loaded from tfidf.pkl, then print the resulting sparse entries.
text1 = tfidf.transform(text)
print(text1)

  (0, 8387)	1.0


In [31]:
# Class probabilities from the random forest loaded from RandomForestT.sav,
# computed on the TF-IDF features.
# NOTE(review): the output is a list of three (1, 2) arrays — presumably one binary
# output per severity label; confirm against the training notebook.
Rft.predict_proba(text1)

[array([[0.337, 0.663]]), array([[0.793, 0.207]]), array([[1., 0.]])]

In [32]:
# Encode the same cleaned sentence with the count vectorizer loaded from
# ../vectorizer.pkl, then print the resulting sparse entries.
text2 = vectorizer.transform(text)
print(text2)

  (0, 8387)	1


In [33]:
# Class probabilities from the random forest loaded from RandomForestC.sav,
# computed on the count-vectorizer features.
# NOTE(review): the output is a list of three (1, 2) arrays — presumably one binary
# output per severity label; confirm against the training notebook.
Rfc.predict_proba(text2)

[array([[0.266, 0.734]]), array([[0.864, 0.136]]), array([[1., 0.]])]

In [34]:
# Turn the cleaned sentence into a sequence of word indices and bring it to a fixed
# length of 80, padding and truncating at the front of the sequence.
# NOTE(review): 80 presumably has to match the sequence length the LSTM was trained
# with — confirm.
text3 = pad_sequences(
    tokenizer.texts_to_sequences(text),
    maxlen=80,
    padding="pre",
    truncating="pre",
)
print(text3)

[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0 1225]]


In [35]:
# Score the padded sequence with the LSTM and report each label as a percentage
# rounded to one decimal place. The printed values under this cell do not sum to
# 100, so the three scores are not a single probability distribution.
scores = model.predict(text3)
for idx, label in enumerate(('Mild:', 'Moderate:', 'Severe:')):
    print(label, round(scores[0][idx]*100,1))

Mild: 97.4
Moderate: 5.7
Severe: 0.8
