## Importing Libraries

In [1]:
from keras.models import Sequential
from keras import layers
from keras.utils import to_categorical
from numpy import argmax

Using TensorFlow backend.


In [2]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline 
import nltk
from nltk.tokenize import MWETokenizer
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn import metrics
import numpy as np
from nltk.stem import WordNetLemmatizer
import re
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.preprocessing.text import Tokenizer

In [3]:
# Load the labelled and the unlabelled text datasets from the training-data folder.
lab_data = pd.read_csv('./Training Dataset-20191010/labeled_data.csv')
unlabeled_data = pd.read_csv('./Training Dataset-20191010/unlabeled_data.csv')

In [4]:
# value_counts() sorts from most to least common, so the last ten index
# entries are the ten rarest whitespace-separated tokens in the labelled text.
token_counts = pd.Series(' '.join(lab_data['text']).split()).value_counts()
freq = list(token_counts[-10:].index)

# Rebuild every document without those tokens; split()/join also collapses
# any run of whitespace into a single space.
lab_data['text'] = lab_data['text'].apply(
    lambda doc: " ".join(word for word in doc.split() if word not in freq)
)
lab_data['text'].head()

0    The new rule is - if you are waiting for a tab...
1    Flirted with giving this two stars, but that's...
2    I was staying at planet Hollywood across the s...
3    Food is good but prices are super expensive. 8...
4    Worse company to deal with they do horrible wo...
Name: text, dtype: object

## Case Normalisation, Tokenization and Rare-Word Removal

In [5]:
# value_counts() sorts from most to least common, so the last ten index
# entries are the ten rarest whitespace-separated tokens in the unlabelled text.
token_counts = pd.Series(' '.join(unlabeled_data['text']).split()).value_counts()
freq = list(token_counts[-10:].index)

# Rebuild every document without those tokens; split()/join also collapses
# any run of whitespace into a single space.
unlabeled_data['text'] = unlabeled_data['text'].apply(
    lambda doc: " ".join(word for word in doc.split() if word not in freq)
)
unlabeled_data['text'].head()


0    Had a good experience when my wife and I sat a...
1    On my first to Montreal with my gf we came her...
2    One of our favorite places to go when it's col...
3    The doctor was very nice, got in in a good amo...
4    The Nook is an immediate phoenix staple! I cam...
Name: text, dtype: object

In [6]:
def remove_extra_characters(raw_text):
    """Strip line breaks, escaped apostrophes and digits from a piece of text.

    Parameters
    ----------
    raw_text : str
        Text to clean.

    Returns
    -------
    str
        ``raw_text`` with every run of newline / carriage-return characters
        replaced by one space, each backslash-apostrophe pair reduced to a
        plain apostrophe, and every run of digits removed.
    """
    # Replace line breaks with a space rather than with nothing, so the words
    # on either side of the break are not glued into a single token.
    processed_text = re.sub(r'[\r\n]+', ' ', raw_text)
    # The former pattern "\\'" is the regex \' which matches a bare apostrophe,
    # so that substitution never changed anything; match the literal
    # backslash-apostrophe pair explicitly instead.
    processed_text = processed_text.replace("\\'", "'")
    # Digits are dropped entirely (e.g. "8oz" becomes "oz").
    processed_text = re.sub(r'\d+', '', processed_text)
    return processed_text

In [7]:
# Clean the text column directly: Series.apply hands each string to the cleaner
# without building a row object per record as DataFrame.apply(axis=1) does.
lab_data['text'] = lab_data['text'].apply(lambda text: remove_extra_characters(text.strip()))

In [8]:
# Clean the text column directly: Series.apply hands each string to the cleaner
# without building a row object per record as DataFrame.apply(axis=1) does.
unlabeled_data['text'] = unlabeled_data['text'].apply(lambda text: remove_extra_characters(text.strip()))

In [None]:
def lemmatization(token_list):
    """Return a list holding the WordNet lemma of every token in ``token_list``."""
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(each) for each in token_list]

In [9]:
# Raw string so that \w reaches the regex engine untouched instead of being
# parsed as an (invalid) Python string escape.
# A token is a run of word characters, optionally followed by an apostrophe
# and more word characters, so a contraction such as "don't" stays one token.
tokenizer = RegexpTokenizer(r"\w+(?:[']\w+)?")

In [10]:
def token(raw_data):
    """Lower-case ``raw_data``, tokenise it and return the tokens joined by spaces.

    Tokenisation is delegated to the module-level ``tokenizer``.
    """
    words = tokenizer.tokenize(raw_data.lower())
    return ' '.join(words)

In [11]:
# Normalise the text column directly: Series.apply passes each string to
# token() without building a row object per record as DataFrame.apply(axis=1) does.
lab_data['text'] = lab_data['text'].apply(lambda text: token(text.strip()))

In [12]:
# Normalise the text column directly: Series.apply passes each string to
# token() without building a row object per record as DataFrame.apply(axis=1) does.
unlabeled_data['text'] = unlabeled_data['text'].apply(lambda text: token(text.strip()))

## TF-IDF

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer 

# Word-level TF-IDF over unigrams and bigrams; terms found in fewer than
# 3 documents or in more than 99% of the documents are left out.
vectorizer = TfidfVectorizer(
    lowercase=True,
    analyzer='word',
    ngram_range=(1, 2),
    min_df=3,
    max_df=.99,
)

# Learn the vocabulary from the labelled text and turn it into a TF-IDF matrix.
train_review = vectorizer.fit_transform(lab_data['text'])

In [14]:
# Plain Python list of the processed text column.
# NOTE(review): X is not referenced by any later cell visible in this notebook.
X = lab_data['text'].tolist()

In [15]:
# Hold out 20% of the TF-IDF rows (and their labels) for evaluation;
# random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(train_review, lab_data['label'],test_size=0.20, random_state=1)

In [16]:
input_dim = X_train.shape[1]  # Number of features
# Width of the softmax layer and of the one-hot label vectors. Tying both to
# one value avoids a shape mismatch when a split happens to lack the highest
# label (to_categorical otherwise infers the width from max(y) + 1).
num_classes = 6

# Dense network with one hidden layer on top of the TF-IDF features.
model = Sequential()
model.add(layers.Dense(10, input_dim=input_dim, activation='relu'))
model.add(layers.Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.summary()

# One-hot encode the integer labels for categorical cross-entropy. The ndim
# guard keeps the cell safe to re-run: labels that are already one-hot (2-D)
# are not encoded a second time.
if np.ndim(y_train) == 1:
    y_train = to_categorical(y_train, num_classes=num_classes)
if np.ndim(y_test) == 1:
    y_test = to_categorical(y_test, num_classes=num_classes)

# NOTE: despite its name, cnn_model holds the value returned by fit() for this
# dense (non-convolutional) network.
cnn_model = model.fit(X_train, y_train,
                      epochs=2,
                      verbose=False,
                      validation_data=(X_test, y_test),
                      batch_size=30)






Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 10)                2374850   
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 66        
Total params: 2,374,916
Trainable params: 2,374,916
Non-trainable params: 0
_________________________________________________________________
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



In [17]:
def acc(y_true, y_pred):
    """Fraction of rows whose arg-max column agrees between ``y_true`` and ``y_pred``.

    Both arguments are indexed along their last axis, so one-hot targets can be
    compared directly with per-class score rows.
    """
    true_labels = np.argmax(y_true, axis=-1)
    predicted_labels = np.argmax(y_pred, axis=-1)
    return (true_labels == predicted_labels).mean()


In [18]:
# Per-class scores from the softmax output for every held-out row.
y_pred = model.predict(X_test)

In [19]:
# Share of held-out rows whose highest-scoring class matches the true label.
accuracy = acc(y_test, y_pred)
print("accuracy: " + str(accuracy))

accuracy: 0.6274


In [None]:
# Work on a copy so the columns added later do not end up on unlabeled_data.
remaining_unlabeled = unlabeled_data.copy()

In [None]:
# Project the unlabelled text into the feature space of the fitted vectoriser.
unlabeled_text = remaining_unlabeled['text']
unlabeled_test = vectorizer.transform(unlabeled_text)

In [None]:
# One row of softmax scores per unlabelled document (not yet reduced to a class index).
pred_class = model.predict(unlabeled_test)

In [None]:
# Index of the highest-scoring class for every prediction row.
p_pred = [argmax(class_scores) for class_scores in pred_class]

In [None]:
# predict() on this softmax-terminated model already returns the class
# probabilities; Sequential.predict_proba was only a wrapper around it and is
# no longer available in current Keras / tf.keras releases.
pred_probab = model.predict(unlabeled_test)

In [None]:
# Probability of the most likely class for every prediction row.
p_test = [max(class_probs) for class_probs in pred_probab]

In [None]:
# Attach the predicted class and its probability to each unlabelled row.
remaining_unlabeled['label'] = p_pred
remaining_unlabeled['probability'] = p_test

In [None]:
# Minimum top-class probability a pseudo-label needs to be promoted to training data.
confidence_threshold = 0.95

is_confident = remaining_unlabeled['probability'] > confidence_threshold
is_uncertain = remaining_unlabeled['probability'] <= confidence_threshold

new_train_data = remaining_unlabeled[is_confident]
print("length of obtained train data:", len(new_train_data))

remaining_unlabeled = remaining_unlabeled[is_uncertain]
print("length of remaining data:", len(remaining_unlabeled))

# Re-assign instead of dropping in place: both frames are boolean-mask
# selections of another frame, and mutating such selections in place triggers
# pandas' SettingWithCopyWarning.
new_train_data = new_train_data.drop(['probability'], axis=1)
remaining_unlabeled = remaining_unlabeled.drop(['probability', 'label'], axis=1)

In [None]:
# Stack the confidently pseudo-labelled rows under the labelled rows.
# The index is not reset, so index labels from the two frames may repeat.
train_data = pd.concat([lab_data, new_train_data])
print("length of train data:", len(train_data))

In [None]:
# Fresh word-level TF-IDF (unigrams + bigrams) fitted on the enlarged training text.
vectorizer = TfidfVectorizer(
    lowercase=True,
    analyzer='word',
    ngram_range=(1, 2),
    min_df=3,
    max_df=.99,
)
train = vectorizer.fit_transform(train_data['text'])

# 80/20 split of the new feature matrix with a fixed shuffle seed.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    train_data['label'],
    test_size=0.20,
    random_state=1,
)

In [None]:
input_dim = X_train.shape[1]  # Number of features
# Width of the softmax layer and of the one-hot label vectors. Tying both to
# one value avoids a shape mismatch when a split happens to lack the highest
# label (to_categorical otherwise infers the width from max(y) + 1).
num_classes = 6

# Dense network with one hidden layer on top of the TF-IDF features.
model = Sequential()
model.add(layers.Dense(10, input_dim=input_dim, activation='relu'))
model.add(layers.Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.summary()

# One-hot encode the integer labels for categorical cross-entropy. The ndim
# guard keeps the cell safe to re-run: labels that are already one-hot (2-D)
# are not encoded a second time.
if np.ndim(y_train) == 1:
    y_train = to_categorical(y_train, num_classes=num_classes)
if np.ndim(y_test) == 1:
    y_test = to_categorical(y_test, num_classes=num_classes)

model.fit(X_train, y_train,
          epochs=2,
          verbose=False,
          validation_data=(X_test, y_test),
          batch_size=30)

In [None]:
# Score the held-out rows and report the share whose top class matches the label.
y_pred = model.predict(X_test)
accuracy = acc(y_test, y_pred)
print("accuracy: " + str(accuracy))

In [20]:
# Start the pseudo-labelling pool from a copy so unlabeled_data itself is not modified.
remaining_unlabeled = unlabeled_data.copy()

In [21]:
# Self-training: pseudo-label the unlabelled pool with the current model,
# promote the confident predictions to training data, then refit the
# vectoriser and the classifier on the enlarged set.
n_rounds = 1                 # number of pseudo-labelling rounds
confidence_threshold = 0.95  # minimum top-class probability to accept a pseudo-label
num_classes = 6              # width of the softmax layer and of the one-hot labels

for j in range(n_rounds):
    unlabeled_test = vectorizer.transform(remaining_unlabeled['text'])

    # The final layer is a softmax, so a single predict() call yields the class
    # probabilities; a second pass through predict_proba() is not needed.
    pred_probab = model.predict(unlabeled_test)
    pred_class = pred_probab
    p_pred = pred_probab.argmax(axis=1)  # most likely class per row
    p_test = pred_probab.max(axis=1)     # probability of that class

    remaining_unlabeled['label'] = p_pred
    remaining_unlabeled['probability'] = p_test

    is_confident = remaining_unlabeled['probability'] > confidence_threshold
    is_uncertain = remaining_unlabeled['probability'] <= confidence_threshold

    # drop() returns new frames, which avoids mutating a mask selection in place.
    new_train_data = remaining_unlabeled[is_confident].drop(['probability'], axis=1)
    print("length of obtained train data:", len(new_train_data))

    remaining_unlabeled = remaining_unlabeled[is_uncertain].drop(['probability', 'label'], axis=1)
    print("length of remaining data:", len(remaining_unlabeled))

    # Round 0 starts from the labelled set; later rounds extend the accumulated
    # set. concat() already builds a new frame, so no explicit copy is required.
    base_data = lab_data if j == 0 else train_data
    train_data = pd.concat([base_data, new_train_data])
    print("length of train data:", len(train_data))

    vectorizer = TfidfVectorizer(lowercase=True, analyzer='word', ngram_range=(1, 2), min_df=3, max_df=.99)
    train = vectorizer.fit_transform(train_data['text'])
    # NOTE(review): this hold-out is drawn from train_data, which now contains
    # pseudo-labelled rows, so the accuracy printed below is partly measured
    # against labels that an earlier model predicted.
    X_train, X_test, y_train, y_test = train_test_split(train, train_data['label'], test_size=0.20, random_state=1)

    input_dim = X_train.shape[1]  # Number of features

    model = Sequential()
    model.add(layers.Dense(10, input_dim=input_dim, activation='relu'))
    model.add(layers.Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    model.summary()

    # Fix the one-hot width to the output layer's width instead of letting
    # to_categorical infer it from the largest label present in the split.
    y_train = to_categorical(y_train, num_classes=num_classes)
    y_test = to_categorical(y_test, num_classes=num_classes)
    model.fit(X_train, y_train,
              epochs=2,
              verbose=False,
              validation_data=(X_test, y_test),
              batch_size=30)

    y_pred = model.predict(X_test)

    print("accuracy: " + str(acc(y_test, y_pred)))


length of obtained train data: 9989
length of remaining data: 590011
length of train data: 59989
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 10)                2814850   
_________________________________________________________________
dense_4 (Dense)              (None, 6)                 66        
Total params: 2,814,916
Trainable params: 2,814,916
Non-trainable params: 0
_________________________________________________________________
accuracy: 0.6840306717786298


In [22]:
# Load the test set to be labelled by the trained model.
test_data = pd.read_csv("./test_data.csv")

In [23]:
# Preview the first rows of the test set.
test_data.head()

Unnamed: 0,test_id,text
0,test_1,trying to have a nice quiet dinner. the annou...
1,test_2,Been getting food to go from here for over 3yr...
2,test_3,Ugh. I've had to eat here a couple of times be...
3,test_4,The people here are so nice! I ordered on eat ...
4,test_5,Heard alot of good things about this place and...


In [24]:
# Clean the text column directly: Series.apply hands each string to the cleaner
# without building a row object per record as DataFrame.apply(axis=1) does.
test_data['text'] = test_data['text'].apply(lambda text: remove_extra_characters(text.strip()))

In [25]:
# Normalise the text column directly: Series.apply passes each string to
# token() without building a row object per record as DataFrame.apply(axis=1) does.
test_data['text'] = test_data['text'].apply(lambda text: token(text.strip()))

In [26]:
# Encode the test text with the already-fitted vectoriser (transform only, no refit).
test = vectorizer.transform(test_data['text'])

In [27]:
# One row of softmax scores per test document.
pred_class = model.predict(test)

In [28]:
# Index of the highest-scoring class for every test prediction row.
p_pred = [argmax(class_scores) for class_scores in pred_class]

In [29]:
# Pair every test_id with its predicted label.
pred_data = pd.DataFrame({'test_id':test_data['test_id'], 'label':p_pred})

In [30]:
# Write the predictions without the DataFrame index column.
# NOTE(review): the file name says "cnn", but the model built in this notebook
# consists of Dense layers only.
pred_data.to_csv("predict_label_cnn_1.csv", index=False)