In [None]:
# Mount Google Drive at /content/drive (with force_remount=True); later cells
# read their input files from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
import re
import gc
import os
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import gensim
import string
from sklearn.feature_extraction.text import CountVectorizer
from collections import defaultdict, Counter
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, SpatialDropout1D, Bidirectional
from sklearn.model_selection import train_test_split
from keras.optimizers import Adam

In [None]:
# Folder on the mounted Drive that holds the train/test CSV files.
# Named `data_dir` rather than `dir` so the built-in dir() is not shadowed
# for the rest of the notebook session.
data_dir = '/content/drive/My Drive'
train = pd.read_csv(os.path.join(data_dir, 'train (3).csv'))
test = pd.read_csv(os.path.join(data_dir, 'test (3).csv'))

In [None]:
def remove_url(text):
    """Return `text` with every URL replaced by a single space.

    A URL here is any run of non-whitespace characters that starts with
    ``http://``, ``https://`` or ``www.``.
    """
    return re.sub(r'https?://\S+|www\.\S+', r" ", text)
# Strip URLs from the 'text' column of both data frames.
for _frame in (train, test):
    _frame['text'] = _frame['text'].apply(remove_url)

In [None]:
def remove_htmltags(text):
    """Return `text` with each ``<...>`` tag replaced by a single space.

    The match is non-greedy, so every tag is replaced on its own rather
    than everything between the first ``<`` and the last ``>``.
    """
    return re.sub(r'<.*?>', r' ', text)
# Strip HTML tags from the 'text' column of both data frames.
for _frame in (train, test):
    _frame['text'] = _frame['text'].apply(remove_htmltags)

In [None]:
# Compiled once at definition time instead of on every call.
# The former single range U+24C2-U+1F251 covered most of the Basic
# Multilingual Plane above U+24C2 (CJK, Hangul, ...), so ordinary non-emoji
# text was deleted too. It is replaced by the blocks it was meant to cover.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U00002702-\U000027B0"  # dingbats
    "\U00002600-\U000026FF"  # miscellaneous symbols (sun, umbrella, ...)
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "\U000024C2"             # circled latin capital letter M
    "\U0001F200-\U0001F251"  # enclosed ideographic supplement
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return `text` with emoji characters deleted.

    Only code points in the emoji-related ranges listed in
    ``_EMOJI_PATTERN`` are removed; all other characters, including
    non-Latin scripts, are left untouched.
    """
    return _EMOJI_PATTERN.sub(r'', text)
# Strip emoji from the 'text' column of both data frames.
for _frame in (train, test):
    _frame['text'] = _frame['text'].apply(remove_emoji)

In [None]:
def remove_punctuation(text):
    """Return `text` with every character of ``string.punctuation`` deleted."""
    # Mapping a code point to None tells str.translate to drop that character.
    drop_map = {ord(mark): None for mark in string.punctuation}
    return text.translate(drop_map)
# Strip punctuation from the 'text' column of both data frames.
for _frame in (train, test):
    _frame['text'] = _frame['text'].apply(remove_punctuation)

In [None]:
!pip install pyspellchecker



In [None]:
from spellchecker import SpellChecker
# Single SpellChecker instance, built with default arguments and shared by
# every call to correct_spellings.
spell = SpellChecker()
def correct_spellings(text):
  """Return `text` with words the spell checker does not know replaced.

  The text is split on whitespace; each word reported by ``spell.unknown``
  is swapped for ``spell.correction(word)``. Words are re-joined with
  single spaces, so the original whitespace layout is not preserved.
  """
  words = text.split()
  misspelled_words = spell.unknown(words)
  corrected_text = []
  for word in words:
    if word in misspelled_words:
      suggestion = spell.correction(word)
      # correction() can yield None when it has no candidate; keep the
      # original word in that case so the join below cannot fail.
      corrected_text.append(word if suggestion is None else suggestion)
    else:
      corrected_text.append(word)
  return " ".join(corrected_text)

In [None]:
# Run spelling correction over the 'text' column of both data frames.
for _frame in (train, test):
    _frame['text'] = _frame['text'].apply(correct_spellings)

In [None]:
# Fetch the NLTK resources used from here on, so this cell also works on a
# fresh kernel instead of relying on a download cell further down.
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

lemmatizer=WordNetLemmatizer()

# `stopwords` is rebound from the corpus reader to the plain word list that
# later cells use. Importing the reader under an alias keeps this cell
# re-runnable: after the first run `stopwords` is a list, which has no .words().
from nltk.corpus import stopwords as _stopwords_corpus
stopwords=_stopwords_corpus.words('english')

In [None]:
def corpus_creation(df):
  """Turn the 'text' column of `df` into a list of token lists.

  Each text is split on whitespace and lower-cased, stop words are dropped,
  and every remaining word is lemmatized with the module-level `lemmatizer`.

  Returns one list of tokens per row, in row order.
  """
  # Built once per call; set membership is cheaper than scanning the list.
  stop_words = set(stopwords)
  corpus=[]
  for tweet in tqdm(df['text']):
    # Lower-case BEFORE the stop-word test, otherwise capitalised stop words
    # such as "The" slip through and end up in the corpus as "the".
    lowered = (word.lower() for word in tweet.split())
    corpus.append([lemmatizer.lemmatize(word) for word in lowered if word not in stop_words])
  return corpus

In [None]:
# Tokenise both splits. The test token lists are then appended to
# train_corpus, which ends up holding all train rows followed by all test rows.
train_corpus = corpus_creation(train)
test_corpus = corpus_creation(test)
train_corpus.extend(test_corpus)

100%|██████████| 7613/7613 [00:02<00:00, 2985.87it/s]
100%|██████████| 3263/3263 [00:00<00:00, 12721.94it/s]


In [None]:
# Request the WordNet corpus by name. A bare nltk.download() opens the
# interactive downloader prompt, which stalls "Run All" until someone types.
nltk.download('wordnet')

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> wordnet
    Downloading package wordnet to /root/nltk_data...
      Unzipping corpora/wordnet.zip.

---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> q


True

In [None]:
# Read the GloVe text file into a dict mapping word -> vector. Each line is
# split on whitespace: the first field is the word, the rest are the
# vector components.
embeddings={}
# Explicit encoding so decoding does not depend on the platform default.
# The `with` block closes the file itself, so no manual close() is needed.
with open('/content/drive/My Drive/glove.6B.100d.txt','r',encoding='utf-8') as f:
  for line in f:
    values=line.split()
    word=values[0]
    vectors=np.asarray(values[1:],dtype=np.float64)
    embeddings[word]=vectors

In [None]:
# Fit the tokenizer on the combined corpus, using 'unk' as the
# out-of-vocabulary token, then convert each token list to integer ids.
tokenizer = Tokenizer(oov_token='unk')
tokenizer.fit_on_texts(train_corpus)
sequences = tokenizer.texts_to_sequences(train_corpus)

# Bring every sequence to length 50, padding at the front and truncating
# at the end.
padded_sequences = pad_sequences(
    sequences,
    maxlen=50,
    padding='pre',
    truncating='post',
)

In [None]:
# Build the (num_words x 100) matrix whose row i is the GloVe vector of the
# word with tokenizer index i. Words missing from `embeddings` keep an
# all-zero row.
word_index = tokenizer.word_index
# +1 because the tokenizer's word indices start at 1; row 0 stays zero.
num_words = len(word_index) +1
embedding_matrix = np.zeros((num_words,100))
for word, i in tqdm(word_index.items()):
  # Valid rows are 0..num_words-1, so the first out-of-range index is
  # num_words itself (the previous `>` test let that index through).
  if i>=num_words:
    continue
  emb_vec = embeddings.get(word)
  if emb_vec is not None:
    embedding_matrix[i]=emb_vec

100%|██████████| 21152/21152 [00:00<00:00, 481359.46it/s]


In [None]:
from keras.initializers import Constant

In [None]:
# Binary classifier: frozen GloVe embedding -> spatial dropout ->
# two stacked bidirectional LSTMs -> sigmoid output.
model = Sequential()

# Embedding weights come from embedding_matrix and are not trained.
embedding=Embedding(input_dim=num_words, output_dim=100, embeddings_initializer=Constant(embedding_matrix), input_length=50, trainable = False)
model.add(embedding)
model.add(SpatialDropout1D(0.2))

# NOTE(review): recurrent_activation='tanh' replaces the Keras default gate
# activation (sigmoid) - confirm this is intentional.
lstm=LSTM(64, activation='tanh', recurrent_activation='tanh', use_bias=True)
lstm_seq=LSTM(64, activation='tanh', recurrent_activation='tanh', use_bias=True, return_sequences=True)

# The first recurrent layer returns the full sequence so the second one can
# consume it. merge_mode='ave' averages the forward and backward outputs,
# which keeps the feature size at 64.
model.add(Bidirectional(lstm_seq, merge_mode='ave'))
model.add(Bidirectional(lstm, merge_mode='ave'))
model.add(Dense(1,activation="sigmoid"))

# `learning_rate` is the supported keyword; `lr` is a deprecated alias that
# newer Keras releases reject.
model.compile(optimizer=Adam(learning_rate=1e-3), loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 100)           2115300   
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 50, 100)           0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 50, 64)            84480     
_________________________________________________________________
bidirectional_2 (Bidirection (None, 64)                66048     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 2,265,893
Trainable params: 150,593
Non-trainable params: 2,115,300
_________________________________________________________________


In [None]:
# padded_sequences holds the train rows first, then the test rows; only the
# leading n_train rows have labels. 15% of them are held out for validation.
# random_state is fixed so the same split is produced on every run.
n_train = train['target'].values.shape[0]
X_train,X_test,y_train,y_test=train_test_split(
    padded_sequences[:n_train],
    train['target'].values,
    test_size=0.15,
    random_state=42,
)

In [None]:
# Train for 8 epochs with batch size 4, evaluating on the held-out split
# after each epoch; verbose=0 suppresses the progress output.
history = model.fit(
    X_train,
    y_train,
    batch_size=4,
    epochs=8,
    validation_data=(X_test, y_test),
    verbose=0,
)

In [None]:
# Predict on the test rows (everything in padded_sequences after the
# labelled train rows) and write the predictions to submission.csv.
sample_sub=pd.read_csv('/content/drive/My Drive/sample_submission.csv')
n_train_rows=train['target'].values.shape[0]
y_pre=model.predict(padded_sequences[n_train_rows:])
# Flatten with -1 rather than a hard-coded row count so this works for any
# test-set size.
y_pre=np.round(y_pre).astype(int).reshape(-1)
sub=pd.DataFrame({'id':sample_sub['id'].values.tolist(),'target':y_pre})
sub.to_csv('submission.csv',index=False)