In [None]:
#Importing the libraries
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string
#Download the following modules once
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
#Importing the training set
train_data = pd.read_excel("/content/Data_Train.xlsx")

#Printing the top 5 rows
print(train_data.head(5))

                                               STORY  SECTION
0  But the most painful was the huge reversal in ...        3
1  How formidable is the opposition alliance amon...        0
2  Most Asian currencies were trading lower today...        3
3  If you want to answer any question, click on ‘...        1
4  In global markets, gold prices edged up today ...        3


In [None]:
#Printing the dataset info
print(train_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7628 entries, 0 to 7627
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   STORY    7628 non-null   object
 1   SECTION  7628 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 119.3+ KB
None


In [None]:
#Printing the shape of the dataset
print(train_data.shape)

Out:(7628, 2)

#Printing the group by description of each category
train_data.groupby("SECTION").describe()

(7628, 2)


Unnamed: 0_level_0,STORY,STORY,STORY,STORY
Unnamed: 0_level_1,count,unique,top,freq
SECTION,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,1686,1673,This story has been published from a wire agen...,4
1,2772,2731,This story has been published from a wire agen...,13
2,1924,1914,We will leave no stone unturned to make the au...,3
3,1246,1233,This story has been published from a wire agen...,11


In [None]:
# Data Cleaning 
#Removing duplicates to avoid overfitting
train_data.drop_duplicates(inplace = True)

#A punctuations string for reference (added other valid characters from the dataset)
all_punctuations = string.punctuation + '‘’,:”][],' 

#Method to remove punctuation marks from the data
def punc_remover(raw_text):
  no_punct = "".join([i for i in raw_text if i not in all_punctuations])
  return no_punct

#Method to remove stopwords from the data
def stopword_remover(no_punc_text):
  words = no_punc_text.split()
  no_stp_words = " ".join([i for i in words if i not in stopwords.words('english')])
  return no_stp_words

#Method to lemmatize the words in the data
lemmer = nltk.stem.WordNetLemmatizer()
def lem(words):
  return " ".join([lemmer.lemmatize(word,'v') for word in words.split()])

#Method to perform a complete cleaning
def text_cleaner(raw):
  cleaned_text = stopword_remover(punc_remover(raw))
  return lem(cleaned_text)

#Testing the cleaner method
text_cleaner("Hi!, this is a sample text to test the text cleaner method. Removes *@!#special characters%$^* and stopwords. And lemmatizes, go, going - run, ran, running")

Out: 'Hi sample text test text cleaner method Removes special character stopwords And lemmatizes go go run run run'

#Applying the cleaner method to the entire data
train_data['CLEAN_STORY'] = train_data['STORY'].apply(text_cleaner)

#Checking the new dataset
print(train_data.values) 

[['But the most painful was the huge reversal in fee income, unheard of among private sector lenders. Essentially, it means that Yes Bank took it for granted that fees on structured loan deals will be paid and accounted for upfront on its books. As borrowers turned defaulters, the fees tied to these loan deals fell off the cracks. Gill has now vowed to shift to a safer accounting practice of amortizing fee income rather than booking these upfront.\n\n\nGill’s move to mend past ways means that there will be no nasty surprises in the future. This is good news considering that investors love a clean image and loathe uncertainties.\n\n\nBut there is no gain without pain and the promise of a strong and stable balance sheet comes with some sacrifices as well. Investors will have to give up the hopes of phenomenal growth, a promise made by Kapoor.'
  3
  'But painful huge reversal fee income unheard among private sector lenders Essentially mean Yes Bank take grant fee structure loan deal pay 

In [None]:
#Data Preprocessing: Count Vectors and TF-IDF Vectors
#Creating Count vectors
#Importing sklearn’s Countvectorizer
from sklearn.feature_extraction.text import CountVectorizer

#Creating a bag-of-words dictionary of words from the data
bow_dictionary = CountVectorizer().fit(train_data['CLEAN_STORY'])

#Total number of words in the bow_dictionary
len(bow_dictionary.vocabulary_)




35189

In [None]:
#Using the bow_dictionary to create count vectors for the cleaned data.
bow = bow_dictionary.transform(train_data['CLEAN_STORY'])

#Printing the shape of the bag of words model
print(bow.shape)

(7551, 35189)


In [None]:
#Creating TF-IDF Vectors
#Importing TfidfTransformer from sklearn
from sklearn.feature_extraction.text import TfidfTransformer

#Fitting the bag of words data to the TF-IDF transformer
tfidf_transformer = TfidfTransformer().fit(bow)

#Transforming the bag of words model to TF-IDF vectors
storytfidf = tfidf_transformer.transform(bow)

In [None]:
#Training The Classifier
#Creating a Multinomial Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB

#Fitting the training data to the classifier
classifier = MultinomialNB().fit(storytfidf, train_data['SECTION'])

In [109]:
#Predicting For The Test Set
#Importing and cleaning the test data
test_data = pd.read_excel("/content/Data_Test.xlsx")
test_data['CLEAN_STORY'] = test_data['STORY'].apply(text_cleaner)

#Printing the cleaned data
print(test_data.values)

[['2019 will see gadgets like gaming smartphones and wearable medical devices lifting the user experience to a whole new level\n\n\nmint-india-wire consumer technologyconsumer technology trends in New Yeartech gadgetsFoldable phonesgaming smartphoneswearable medical devicestechnology\n\n\nNew Delhi: Gadgets have become an integral part of our lives with most of us relying on some form of factor to communicate, commute, work, be informed or entertained. Year 2019 will see some gadgets lifting the user experience to a whole new level. Here’s what we can expect to see:\n\n\nSmartphones with foldable screens: Foldable phones are finally moving from the concept stage to commercial launches. They are made up of organic light-emitting diode (OLED) panels with higher plastic substrates, allowing them to be bent without damage.\n\n\nUS-based display maker Royole Corp’s foldable phone, FlexPai, has already arrived in select markets, while Samsung’s unnamed foldable phone is expected sometime nex

In [None]:
#Creating A Pipeline To Pre-Process The Data & Initialise The Classifier
#Importing the Pipeline module from sklearn
from sklearn.pipeline import Pipeline

#Initializing the pipeline with necessary transformations and the required classifier
pipe = Pipeline([
('bow', CountVectorizer()),
('tfidf', TfidfTransformer()),
('classifier', MultinomialNB())])

#Fitting the training data to the pipeline
pipe.fit(train_data['CLEAN_STORY'], train_data['SECTION'])

#Predicting the SECTION
test_preds_mnb = pipe.predict(test_data['CLEAN_STORY'])

#Writing the predictions to an excel sheet
pd.DataFrame(test_preds_mnb, columns = ['SECTION']).to_excel("NBpredictions.xlsx")

### Try out Logistic Regression

The logistic regression model is actually a statistical model developed by statistician
David Cox in 1958. It is also known as the logit or logistic model since it uses the
logistic (popularly also known as sigmoid) mathematical function to estimate the
parameter values. These are the coefficients of all our features such that the overall loss
is minimized when predicting the outcome—

In [None]:
%%time

#Training The Classifier
#Creating a Multinomial Naive Bayes Classifier
from sklearn.linear_model import LogisticRegression

#Fitting the training data to the classifier
classifier = LogisticRegression().fit(storytfidf, train_data['SECTION'])

CPU times: user 4.47 s, sys: 4.79 s, total: 9.26 s
Wall time: 4.81 s


In [None]:
#Creating A Pipeline To Pre-Process The Data & Initialise The Classifier
#Importing the Pipeline module from sklearn
from sklearn.pipeline import Pipeline

#Initializing the pipeline with necessary transformations and the required classifier
pipe = Pipeline([
('bow', CountVectorizer()),
('tfidf', TfidfTransformer()),
('classifier',LogisticRegression() )])

#Fitting the training data to the pipeline
pipe.fit(train_data['CLEAN_STORY'], train_data['SECTION'])

#Predicting the SECTION
test_preds_mnb = pipe.predict(test_data['CLEAN_STORY'])

#Writing the predictions to an excel sheet
pd.DataFrame(test_preds_mnb, columns = ['SECTION']).to_excel("LogRegpredictions.xlsx")

### Try out Random Forest

Decision trees are a family of supervised machine learning algorithms that can represent
and interpret sets of rules automatically from the underlying data. They use metrics like
information gain and gini-index to build the tree. However, a major drawback of decision
trees is that since they are non-parametric, the more data there is, greater the depth of
the tree. We can end up with really huge and deep trees that are prone to overfitting. The
model might work really well on training data, but instead of learning, it just memorizes
all the training samples and builds very specific rules to them. Hence, it performs really
poorly on the test data. Random forests try to tackle this problem.

A random forest is a meta-estimator or an ensemble model that fits a number of
decision tree classifiers on various sub-samples of the dataset and uses averaging to
improve the predictive accuracy and control over-fitting. The sub-sample size is always
the same as the original input sample size, but the samples are drawn with replacement
(bootstrap samples). In random forests, all the trees are trained in parallel (bagging
model/bootstrap aggregation). Besides this, each tree in the ensemble is built from a
sample drawn with replacement (i.e., a bootstrap sample) from the training set. Also,
when splitting a node during the construction of the tree, the split that is chosen is no
longer the best split among all features. Instead, the split that is picked is the best split
among a random subset of the features. T

In [None]:
%%time

#Training The Classifier
#Creating a Multinomial Naive Bayes Classifier
from sklearn.ensemble import RandomForestClassifier

#Fitting the training data to the classifier
classifier = RandomForestClassifier().fit(storytfidf, train_data['SECTION'])

CPU times: user 8.27 s, sys: 25.6 ms, total: 8.29 s
Wall time: 8.27 s


In [None]:
#Creating A Pipeline To Pre-Process The Data & Initialise The Classifier
#Importing the Pipeline module from sklearn
from sklearn.pipeline import Pipeline

#Initializing the pipeline with necessary transformations and the required classifier
pipe = Pipeline([
('bow', CountVectorizer()),
('tfidf', TfidfTransformer()),
('classifier',RandomForestClassifier())])

#Fitting the training data to the pipeline
pipe.fit(train_data['CLEAN_STORY'], train_data['SECTION'])

#Predicting the SECTION
test_preds_mnb = pipe.predict(test_data['CLEAN_STORY'])

#Writing the predictions to an excel sheet
pd.DataFrame(test_preds_mnb, columns = ['SECTION']).to_excel("RanForpredictions.xlsx")

#Newer Supervised Deep Learning Modelsimport gensim

In [87]:
import gensim
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dropout, Activation, Dense
from sklearn.preprocessing import OneHotEncoder
from keras.utils.np_utils import to_categorical

### Build Train and Test Datasets

In [111]:
# build train and test datasets
sentiments= train_data['SECTION'].values
reviews = train_data['STORY'].values

train_reviews = train_data['STORY']
train_sentiments = train_data['SECTION']

test_reviews = test_data['STORY']

### Text Wrangling & Normalization

In [112]:
import contractions
from bs4 import BeautifulSoup
import numpy as np
import re
import tqdm
import unicodedata


def strip_html_tags(text):
  soup = BeautifulSoup(text, "html.parser")
  [s.extract() for s in soup(['iframe', 'script'])]
  stripped_text = soup.get_text()
  stripped_text = re.sub(r'[\r|\n|\r\n]+', '\n', stripped_text)
  return stripped_text

def remove_accented_chars(text):
  text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
  return text

def pre_process_corpus(docs):
  norm_docs = []
  for doc in tqdm.tqdm(docs):
    doc = strip_html_tags(doc)
    doc = doc.translate(doc.maketrans("\n\t\r", "   "))
    doc = doc.lower()
    doc = remove_accented_chars(doc)
    doc = contractions.fix(doc)
    # lower case and remove special characters\whitespaces
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, re.I|re.A)
    doc = re.sub(' +', ' ', doc)
    doc = doc.strip()  
    norm_docs.append(doc)
  
  return norm_docs

In [113]:
%%time

norm_train_reviews = pre_process_corpus(train_reviews)
norm_test_reviews = pre_process_corpus(test_reviews)

100%|██████████| 7551/7551 [00:02<00:00, 3383.51it/s]
100%|██████████| 2748/2748 [00:00<00:00, 3476.43it/s]

CPU times: user 2.98 s, sys: 33.5 ms, total: 3.01 s
Wall time: 3.03 s





### Traditional Supervised Machine Learning Models
Feature Engineering

In [116]:
%%time

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# build BOW features on train reviews
cv = CountVectorizer(binary=False, min_df=5, max_df=1.0, ngram_range=(1,2))
cv_train_features = cv.fit_transform(norm_train_reviews)


# build TFIDF features on train reviews
tv = TfidfVectorizer(use_idf=True, min_df=5, max_df=1.0, ngram_range=(1,2),sublinear_tf=True)
tv_train_features = tv.fit_transform(norm_train_reviews)

CPU times: user 6.3 s, sys: 107 ms, total: 6.41 s
Wall time: 6.42 s


In [117]:
%%time

# transform test reviews into features
cv_test_features = cv.transform(norm_test_reviews)
tv_test_features = tv.transform(norm_test_reviews)

CPU times: user 1.1 s, sys: 4.38 ms, total: 1.1 s
Wall time: 1.11 s


In [118]:
print('BOW model:> Train features shape:', cv_train_features.shape, ' Test features shape:', cv_test_features.shape)
print('TFIDF model:> Train features shape:', tv_train_features.shape, ' Test features shape:', tv_test_features.shape)

BOW model:> Train features shape: (7551, 31634)  Test features shape: (2748, 31634)
TFIDF model:> Train features shape: (7551, 31634)  Test features shape: (2748, 31634)



###Prediction class label encoding



In [119]:
# converting y data into categorical (one-hot encoding)
y_train = to_categorical(train_sentiments)
#y_test = to_categorical(y_test)

# tokenize train reviews & encode train labels
tokenized_train = [nltk.word_tokenize(text)
                       for text in norm_train_reviews]
y_train = le.fit_transform(y_train)
# tokenize test reviews & encode test labels
tokenized_test = [nltk.word_tokenize(text)
                      for text in norm_test_reviews]
#y_test = le.fit_transform(test_sentiments)

###Feature Engineering with word embeddings

In [120]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [121]:
%%time
# build word2vec model
w2v_num_features = 300
w2v_model = gensim.models.Word2Vec(tokenized_train, size=w2v_num_features, window=150,
                                   min_count=10, workers=4, iter=5)    

2021-04-11 07:59:50,977 : INFO : collecting all words and their counts
2021-04-11 07:59:50,983 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-04-11 07:59:51,185 : INFO : collected 40121 word types from a corpus of 816390 raw words and 7551 sentences
2021-04-11 07:59:51,187 : INFO : Loading a fresh vocabulary
2021-04-11 07:59:51,540 : INFO : effective_min_count=10 retains 6820 unique words (16% of original 40121, drops 33301)
2021-04-11 07:59:51,543 : INFO : effective_min_count=10 leaves 741453 word corpus (90% of original 816390, drops 74937)
2021-04-11 07:59:51,569 : INFO : deleting the raw counts dictionary of 40121 items
2021-04-11 07:59:51,573 : INFO : sample=0.001 downsamples 36 most-common words
2021-04-11 07:59:51,576 : INFO : downsampling leaves estimated 568146 word corpus (76.6% of prior 741453)
2021-04-11 07:59:51,605 : INFO : estimated required memory for 6820 words and 300 dimensions: 19778000 bytes
2021-04-11 07:59:51,607 : INFO : resettin

CPU times: user 54.4 s, sys: 131 ms, total: 54.6 s
Wall time: 29 s


In [91]:
def averaged_word2vec_vectorizer(corpus, model, num_features):
    vocabulary = set(model.wv.index2word)
    
    def average_word_vectors(words, model, vocabulary, num_features):
        feature_vector = np.zeros((num_features,), dtype="float64")
        nwords = 0.
        
        for word in words:
            if word in vocabulary: 
                nwords = nwords + 1.
                feature_vector = np.add(feature_vector, model.wv[word])
        if nwords:
            feature_vector = np.divide(feature_vector, nwords)

        return feature_vector

    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                    for tokenized_sentence in corpus]
    return np.array(features)

In [122]:
# generate averaged word vector features from word2vec model
avg_wv_train_features = averaged_word2vec_vectorizer(corpus=tokenized_train, model=w2v_model,
                                                     num_features=w2v_num_features)
avg_wv_test_features = averaged_word2vec_vectorizer(corpus=tokenized_test, model=w2v_model,
                                                    num_features=w2v_num_features)

In [123]:
print('Word2Vec model:> Train features shape:', avg_wv_train_features.shape, ' Test features shape:', avg_wv_test_features.shape)

Word2Vec model:> Train features shape: (7551, 300)  Test features shape: (2748, 300)


In [124]:
from keras.layers import BatchNormalization

In [125]:
def construct_deepnn_architecture(num_input_features):
    dnn_model = Sequential()
    dnn_model.add(Dense(512, input_shape=(num_input_features,), kernel_initializer='he_normal'))
    dnn_model.add(BatchNormalization())
    dnn_model.add(Activation('relu'))
    dnn_model.add(Dropout(0.2))
    
    dnn_model.add(Dense(256, kernel_initializer='he_normal'))
    dnn_model.add(BatchNormalization())
    dnn_model.add(Activation('relu'))
    dnn_model.add(Dropout(0.2))
    
    dnn_model.add(Dense(256, kernel_initializer='he_normal'))
    dnn_model.add(BatchNormalization())
    dnn_model.add(Activation('relu'))
    dnn_model.add(Dropout(0.2))
    
    dnn_model.add(Dense(4))
    dnn_model.add(Activation('softmax'))

    dnn_model.compile(loss='categorical_crossentropy', optimizer='adam',                 
                      metrics=['accuracy'])
    return dnn_model

In [126]:
w2v_dnn = construct_deepnn_architecture(num_input_features=w2v_num_features)

In [127]:
w2v_dnn.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_12 (Dense)             (None, 512)               154112    
_________________________________________________________________
batch_normalization_6 (Batch (None, 512)               2048      
_________________________________________________________________
activation_12 (Activation)   (None, 512)               0         
_________________________________________________________________
dropout_9 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_13 (Dense)             (None, 256)               131328    
_________________________________________________________________
batch_normalization_7 (Batch (None, 256)               1024      
_________________________________________________________________
activation_13 (Activation)   (None, 256)              

In [128]:
batch_size = 100
w2v_dnn.fit(avg_wv_train_features, to_categorical(train_sentiments), epochs=50, batch_size=batch_size, 
            shuffle=True, validation_split=0.1, verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f12442c9f10>

In [129]:
y_pred = w2v_dnn.predict_classes(avg_wv_test_features)



In [130]:
submission = pd.read_excel('/content/Sample_submission.xlsx')
submission['SECTION'] = y_pred
submission.to_excel('/content/DeepLearn_w2v_CNN.xlsx',index=False)

# LSTM

In [131]:
import tensorflow as tf

t = tf.keras.preprocessing.text.Tokenizer(oov_token='<UNK>')
# fit the tokenizer on the documents
t.fit_on_texts(norm_train_reviews)
t.word_index['<PAD>'] = 0

In [133]:
VOCAB_SIZE = len(t.word_index)

In [134]:
train_sequences = t.texts_to_sequences(norm_train_reviews)
test_sequences = t.texts_to_sequences(norm_test_reviews)
X_train = tf.keras.preprocessing.sequence.pad_sequences(train_sequences, maxlen=1000)
X_test = tf.keras.preprocessing.sequence.pad_sequences(test_sequences, maxlen=1000)

In [135]:
EMBEDDING_DIM = 300 # dimension for dense embeddings for each token
LSTM_DIM = 128 # total LSTM units

model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM, input_length=1000))
model.add(tf.keras.layers.SpatialDropout1D(0.1))
model.add(tf.keras.layers.LSTM(LSTM_DIM, return_sequences=False))
model.add(tf.keras.layers.Dense(256, activation='relu'))
model.add(tf.keras.layers.Dense(4, activation="softmax"))

model.compile(loss="categorical_crossentropy", optimizer="adam",
              metrics=["accuracy"])
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1000, 300)         12036900  
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 1000, 300)         0         
_________________________________________________________________
lstm (LSTM)                  (None, 128)               219648    
_________________________________________________________________
dense_16 (Dense)             (None, 256)               33024     
_________________________________________________________________
dense_17 (Dense)             (None, 4)                 1028      
Total params: 12,290,600
Trainable params: 12,290,600
Non-trainable params: 0
_________________________________________________________________


In [136]:
batch_size = 100
model.fit(X_train, to_categorical(train_sentiments), epochs=10, batch_size=batch_size, 
          shuffle=True, validation_split=0.1, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10

KeyboardInterrupt: ignored

In [138]:
predictions = model.predict_classes(X_test)
predictions[:10]



array([1, 2, 1, 0, 1, 1, 1, 2, 1, 2])

In [139]:
submission = pd.read_excel('/content/Sample_submission.xlsx')
submission['SECTION'] = predictions
submission.to_excel('/content/_LSTM.xlsx',index=False)

# GRU


In [140]:
EMBEDDING_DIM = 300 # dimension for dense embeddings for each token
GRU_DIM = 128 # total LSTM units

model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM, input_length=1000))
model.add(tf.keras.layers.SpatialDropout1D(0.1))
model.add(tf.keras.layers.GRU(GRU_DIM, return_sequences=False))
model.add(tf.keras.layers.Dense(256, activation='elu'))
model.add(tf.keras.layers.Dense(4, activation="softmax"))

model.compile(loss="categorical_crossentropy", optimizer="adam",
              metrics=["accuracy"])
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1000, 300)         12036900  
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 1000, 300)         0         
_________________________________________________________________
gru (GRU)                    (None, 128)               165120    
_________________________________________________________________
dense_18 (Dense)             (None, 256)               33024     
_________________________________________________________________
dense_19 (Dense)             (None, 4)                 1028      
Total params: 12,236,072
Trainable params: 12,236,072
Non-trainable params: 0
_________________________________________________________________


In [142]:
batch_size = 100
model.fit(X_train, to_categorical(train_sentiments), epochs=5, batch_size=batch_size, 
          shuffle=True, validation_split=0.1, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f1243420c50>

In [143]:
predictions = model.predict_classes(X_test)
predictions[:10]



array([1, 2, 1, 0, 1, 1, 1, 2, 1, 2])

In [144]:
submission = pd.read_excel('/content/Sample_submission.xlsx')
submission['SECTION'] = predictions
submission.to_excel('/content/GRU.xlsx',index=False)

In [147]:

import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling1D
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from sklearn.preprocessing import LabelEncoder

# fix random seed for reproducibility
seed = 42
np.random.seed(seed)

# CNN

In [155]:

EMBED_SIZE = 300
EPOCHS=2
BATCH_SIZE=128

In [156]:
# create the model
model = Sequential()
model.add(Embedding(VOCAB_SIZE, EMBED_SIZE, input_length=1000))
model.add(Conv1D(filters=128, kernel_size=4, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(filters=64, kernel_size=4, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(filters=32, kernel_size=4, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(256, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 1000, 300)         12036900  
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 1000, 128)         153728    
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 500, 128)          0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 500, 64)           32832     
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 250, 64)           0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 250, 32)           8224      
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 125, 32)          

In [159]:
X_train

array([[    0,     0,     0, ...,   121,    20,  1118],
       [    0,     0,     0, ...,  3456,  6825, 13056],
       [    0,     0,     0, ...,   318,     5, 22292],
       ...,
       [    0,     0,     0, ...,     4,  2020,  2860],
       [    0,     0,     0, ...,     9,    11,   683],
       [    0,     0,     0, ...,     4, 20650, 13299]], dtype=int32)

In [161]:
to_categorical(train_sentiments)

array([[0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       ...,
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.]], dtype=float32)

# 
Model Training

In [168]:
# Fit the model
model.fit(X_train, to_categorical(train_sentiments), 
          validation_split=0.1,
          epochs=EPOCHS, 
          batch_size=BATCH_SIZE, verbose=1)

Epoch 1/2


ValueError: ignored

#Model Evaluation

In [166]:
# Final evaluation of the model
scores = model.evaluate(X_test, y_test, verbose=1)
print("Accuracy: %.2f%%" % (scores[1]*100))

NameError: ignored

In [167]:
predictions = model.predict_classes(X_test).ravel()
predictions[:10]



array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)

In [None]:
submission = pd.read_excel('/content/Sample_submission.xlsx')
submission['SECTION'] = predictions
submission.to_excel('/content/CNN.xlsx',index=False)