In [1]:
import pandas as pd
import numpy as np

In [2]:
data = open("rehanchat.txt","r")

In [3]:
ds = data.read()

In [4]:
lst = ds.split('\n')

In [5]:
newlst = []
for i in lst:
  newlst.append(i[20:])

In [6]:
users = ['hecker 🤡: ','Beluga: ']

In [7]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [8]:
import string,re
# cleaning the text
def clean_text(text):
    non_punch = ''
    for i in text:
        if i not in string.punctuation:
            non_punch += i
    split_text = re.split('\W+',non_punch)
    stop = nltk.corpus.stopwords.words('english')
    wn = nltk.WordNetLemmatizer()
    return " ".join([wn.lemmatize(word).lower() for word in split_text])

In [9]:
dataset = {'hecker':[],'beluga':[]}
for i in newlst:
  if users[0] in i:
    text = clean_text(i[10:]).strip()
    if text:
      dataset['hecker'].append(text)
  if users[1] in i:
    text = clean_text(i[8:]).strip()
    if text:
      dataset['beluga'].append(text)


In [10]:
common_elements = set(dataset['hecker']).intersection(dataset['beluga'])

print(common_elements)

{'ha', 'nhi re', 'hai', 'nhi pata', 'kya', 'kaisa hai', 'ooh', 'thik hai', 'choro', 'oh', 'abe', 'ruk', 'sunn', 'kyu', 'nhi', 'anyone who want to be volunteer for tomorrow quiz competition quiz init', 'bye', 'yes', 'abhi nhi', 'kya tha', 'okay', 'na', 'anyone who want to be volanteer for tomorrow quiz competition quiz init', 'gm', 'ohh', 'bata be', 'adi', 'bro', 'null', 'kya pata', 'media omitted', 'ok', 'bata', 'hello', 'sorry bro', 'read kr le', 'nhi be', 'tum bol na', 'aisa', 'online', 'oo', 'lol', 'nhi chor'}


In [11]:
import collections
hecker_count = collections.Counter(dataset['hecker'])
beluga_count = collections.Counter(dataset['beluga'])

In [12]:
sorted_hecker = dict(sorted(hecker_count.items(), key=lambda x: x[1], reverse=True))
sorted_beluga = dict(sorted(beluga_count.items(), key=lambda x: x[1], reverse=True))

In [13]:
print(len(sorted_hecker))
print(len(sorted_beluga))

1482
2116


In [14]:
beluga_message = list(sorted_beluga.keys())
hecker_message = list(sorted_hecker.keys())

In [15]:
for d in common_elements:
  hecker_message.remove(d)
  beluga_message.remove(d)

In [16]:
labels = list(np.full(len(beluga_message),1))+list(np.full(len(hecker_message),0))

In [17]:
dict_set = {'text':beluga_message+hecker_message,'label':labels}

In [18]:
df = pd.DataFrame(dict_set)

In [22]:
df.head()

Unnamed: 0,text,label
0,han,1
1,you deleted this message,1
2,acha,1
3,cheee,1
4,haan,1


In [23]:
df.iloc[100:105]

Unnamed: 0,text,label
100,mey parking taraf hu,1
101,aa tuummmm,1
102,correct answer hai 110 last walle question ka ...,1
103,pata nhi kese mey abhi online calculator se ch...,1
104,tum hara v 110 hai na,1


In [24]:
dataset = df.sample(frac=1)

In [25]:
dataset.tail()

Unnamed: 0,text,label
2740,kya kya bol rha mere baare me,0
933,pata nhi yaad nhi hai,1
2664,nhi likha hua hai,0
2718,haa like kiye,0
1332,abe chuttti hai kya aaj,1


In [26]:
from sklearn.model_selection import train_test_split

In [27]:
X_train, X_test, y_train, y_test = train_test_split(dataset["text"], dataset["label"], test_size=0.33, random_state=42)


In [28]:
vocab_size = 1000
embedding_dim = 16
max_length = 100
trunc_type = "post"
padding_type = "post"
oov_tok = ""

In [29]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [30]:
tokenizer = Tokenizer(num_words = vocab_size,oov_token=oov_tok)

In [31]:
from pygments import token
tokenizer.fit_on_texts(np.array(X_train))
word_index = tokenizer.word_index
     

sequence = tokenizer.texts_to_sequences(np.array(X_train)) 
padded = pad_sequences(sequence,maxlen=max_length,padding=padding_type
                       ,truncating=trunc_type)
     

test_seq = tokenizer.texts_to_sequences(np.array(X_test)) 
test_pad = pad_sequences(test_seq,maxlen=max_length,padding=padding_type
                       ,truncating=trunc_type)

In [32]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size,embedding_dim,input_length=max_length),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(6,activation="relu"),
    tf.keras.layers.Dense(1,activation="sigmoid")
])
     

model.compile(loss="binary_crossentropy",optimizer="adam",metrics=["accuracy"])
     


In [33]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 16)           16000     
                                                                 
 flatten (Flatten)           (None, 1600)              0         
                                                                 
 dense (Dense)               (None, 6)                 9606      
                                                                 
 dense_1 (Dense)             (None, 1)                 7         
                                                                 
Total params: 25,613
Trainable params: 25,613
Non-trainable params: 0
_________________________________________________________________


In [34]:
num_epochs = 10
model.fit(padded,np.array(y_train),
          epochs=num_epochs,
          validation_data=(test_pad,np.array(y_test)))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7ffa652a5fd0>

In [35]:
def predict(text):
  prod_seq = tokenizer.texts_to_sequences(np.array([text])) 
  prod_pad = pad_sequences(prod_seq,maxlen=max_length,padding=padding_type
                       ,truncating=trunc_type)  
  pred = round(model.predict(prod_pad)[0][0])
  print(text+" : ","Beluga Message" if pred == 1 else "Hecker Message")

In [36]:
predict("Mene code remove Kiya hai bas.. aur kuch text change")

Mene code remove Kiya hai bas.. aur kuch text change :  Beluga Message
