In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input folder and echo the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import re
import pickle
import numpy as np
import pandas as pd

import seaborn as sns
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [3]:
import nltk
# Download the 'wordnet' and 'omw-1.4' NLTK resources.
# NOTE(review): presumably these back the WordNetLemmatizer used in preprocess() — confirm.
nltk.download('wordnet')
nltk.download('omw-1.4')

In [4]:
# The three IMDB splits share one Kaggle input folder; read them in one pass.
IMDB_DIR = '../input/imdb-dataset-sentiment-analysis-in-csv-format'
train, valid, test = (
    pd.read_csv(f'{IMDB_DIR}/{split}.csv') for split in ('Train', 'Valid', 'Test')
)

In [5]:
# Stack the three splits into a single frame. ignore_index=True renumbers the
# rows 0..n-1 so the per-file indices are not duplicated; DataFrame.set_index
# is not in-place, so calling it without assigning the result would leave
# `dataset` untouched, and a hard-coded row count would break on other sizes.
dataset = pd.concat((train, valid, test), ignore_index=True)
dataset

In [6]:
# Recode any label value 4 as 1.
dataset['label'] = dataset['label'].replace(4, 1)

# Bar chart of the number of rows per label.
label_counts = dataset.groupby('label').count()
ax = label_counts.plot(kind='bar', title='Distribution of data', legend=False)
ax.set_xticklabels(['Negative', 'Positive'], rotation=0)

# Keep the raw texts and their labels as plain Python lists.
text = list(dataset['text'])
sentiment = list(dataset['label'])

In [7]:
# Emoticon -> meaning lookup used by the text preprocessing step.
emojis = {
    ':)': 'smile',
    ':-)': 'smile',
    ';d': 'wink',
    ':-E': 'vampire',
    ':(': 'sad',
    ':-(': 'sad',
    ':-<': 'sad',
    ':P': 'raspberry',
    ':O': 'surprised',
    ':-@': 'shocked',
    ':@': 'shocked',
    ':-$': 'confused',
    ':\\': 'annoyed',
    ':#': 'mute',
    ':X': 'mute',
    ':^)': 'smile',
    ':-&': 'confused',
    '$_$': 'greedy',
    '@@': 'eyeroll',
    ':-!': 'confused',
    ':-D': 'smile',
    ':-0': 'yell',
    'O.o': 'confused',
    '<(-_-)>': 'robot',
    'd[-_-]b': 'dj',
    ":'-)": 'sadsmile',
    ';)': 'wink',
    ';-)': 'wink',
    'O:-)': 'angel',
    'O*-)': 'angel',
    '(:-D': 'gossip',
    '=^.^=': 'cat',
}

# English stop words, kept as an ordered list (one word per whitespace-separated token).
stopwordlist = (
    "a about above after again ain all am an "
    "and any are as at be because been before "
    "being below between both by can d did do "
    "does doing down during each few for from "
    "further had has have having he her here "
    "hers herself him himself his how i if in "
    "into is it its itself just ll m ma "
    "me more most my myself now o of on once "
    "only or other our ours ourselves out own re "
    "s same she shes should shouldve so some such "
    "t than that thatll the their theirs them "
    "themselves then there these they this those "
    "through to too under until up ve very was "
    "we were what when where which while who whom "
    "why will with won y you youd youll youre "
    "youve your yours yourself yourselves"
).split()

In [8]:
from nltk.stem import WordNetLemmatizer

In [9]:
def preprocess(textdata):
    """Normalise raw documents into space-separated, lemmatized tokens.

    Each document is lower-cased; URLs become ``URL``, emoticons listed in the
    module-level ``emojis`` table become ``EMOJI<meaning>``, ``@mentions``
    become ``USER``, every remaining non-alphanumeric character becomes a
    space, and any character repeated three or more times in a row is cut
    down to two. One-character tokens are dropped and the remaining tokens
    are lemmatized with WordNet.

    Parameters
    ----------
    textdata : iterable
        Raw documents. Entries that are not ``str`` (for example NaN coming
        from a CSV column with missing values) are treated as empty documents
        so the output stays aligned with the input.

    Returns
    -------
    list of str
        One cleaned string per input entry, tokens joined by single spaces
        (no trailing space).
    """
    wordLemm = WordNetLemmatizer()

    # Regex patterns (raw strings so that backslashes reach the regex engine untouched).
    urlPattern        = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
    userPattern       = r'@[^\s]+'
    alphaPattern      = "[^a-zA-Z0-9]"
    sequencePattern   = r"(.)\1\1+"
    seqReplacePattern = r"\1\1"

    # The text is lower-cased before emoticons are looked up, so the keys are
    # lower-cased too; otherwise keys containing capitals (':P', ':-D', ...)
    # could never match. Longest keys go first so that e.g. 'o:-)' is not
    # consumed by the shorter ':-)'.
    emojiReplacements = sorted(
        ((key.lower(), "EMOJI" + meaning) for key, meaning in emojis.items()),
        key=lambda pair: len(pair[0]),
        reverse=True,
    )

    processedText = []
    for tweet in textdata:
        if not isinstance(tweet, str):
            # Missing / non-text entry: keep the slot, emit an empty document.
            tweet = ''
        tweet = tweet.lower()

        # Replace all URLs with 'URL'.
        tweet = re.sub(urlPattern, ' URL', tweet)
        # Replace all emoticons.
        for emoji, replacement in emojiReplacements:
            tweet = tweet.replace(emoji, replacement)
        # Replace @USERNAME with 'USER'.
        tweet = re.sub(userPattern, ' USER', tweet)
        # Replace every non-alphanumeric character with a space.
        tweet = re.sub(alphaPattern, " ", tweet)
        # Collapse runs of 3 or more identical characters down to 2.
        tweet = re.sub(sequencePattern, seqReplacePattern, tweet)

        # Stop words are not filtered here; only one-character tokens are dropped.
        words = [wordLemm.lemmatize(word) for word in tweet.split() if len(word) > 1]
        processedText.append(' '.join(words))

    return processedText

In [10]:
import time
# Clean the whole corpus and report how long it took (wall-clock seconds).
started_at = time.time()
processedtext = preprocess(text)
print('Text Preprocessing complete.')
print(f'Time Taken: {round(time.time() - started_at)} seconds')

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the cleaned texts; random_state is fixed so the split is repeatable.
split_parts = train_test_split(processedtext, sentiment, test_size=0.1, random_state=0)
X_train, X_test, y_train, y_test = split_parts
print('Data Split done.')

In [12]:
# Show the shape of the training split.
np.shape(X_train)

In [13]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
from wordcloud import WordCloud
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

In [14]:
def prepare_data(X_train, X_test, maxlen=120, corpus=None):
    """Convert cleaned texts into fixed-length integer sequences.

    Parameters
    ----------
    X_train, X_test : sequence of str
        Cleaned documents to encode.
    maxlen : int, default 120
        Length every sequence is padded/truncated to by ``pad_sequences``.
    corpus : sequence of str, optional
        Texts the vocabulary is built from. When omitted, the notebook-level
        ``processedtext`` is used, which keeps the previous behaviour.

    Returns
    -------
    tuple
        ``(X_train, X_test, size_of_vocabulary)`` where the first two are the
        padded sequences and the last is ``len(word_index) + 1``.
    """
    if corpus is None:
        # Backward-compatible default: build the vocabulary from the global corpus.
        corpus = processedtext

    # Build the vocabulary.
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(corpus)

    # Text -> integer ids -> sequences of one common length.
    train_padded = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=maxlen)
    test_padded = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=maxlen)

    # +1 because Tokenizer word ids start at 1 and 0 is left for padding.
    size_of_vocabulary = len(tokenizer.word_index) + 1
    print("Vocabulary Size: " + str(size_of_vocabulary))

    return train_padded, test_padded, size_of_vocabulary

In [15]:
# Replace the cleaned-text splits with their padded integer sequences and keep the vocabulary size.
# NOTE(review): this rebinds X_train/X_test, so running the cell a second time would feed
# the already-encoded arrays back into prepare_data().
X_train, X_test, size_of_vocabulary = prepare_data(X_train, X_test)

In [16]:
from keras.layers import *
from keras.models import *
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [24]:
def build_BiLSTM(size_of_vocabulary, max_len=120, embedding_dim=128, lstm_units=64,
                 checkpoint_path='best_model.h5'):
    """Build and compile the bidirectional-LSTM binary classifier.

    Parameters
    ----------
    size_of_vocabulary : int
        Number of rows of the embedding table.
    max_len : int, default 120
        Input sequence length; must equal the length the inputs are padded to.
    embedding_dim : int, default 128
        Width of the embedding vectors.
    lstm_units : int, default 64
        Units of the LSTM in each direction.
    checkpoint_path : str, default 'best_model.h5'
        File the ModelCheckpoint callback writes the best model to.

    Returns
    -------
    tuple
        ``(model, es, mc)``: the compiled model, an EarlyStopping callback
        watching ``val_loss`` and a ModelCheckpoint callback watching
        ``val_accuracy``. The callbacks only take effect if passed to
        ``model.fit`` together with validation data.
    """
    model = Sequential()
    # Embedding layer
    model.add(Embedding(size_of_vocabulary, embedding_dim, input_length=max_len))
    # Bidirectional LSTM returning the full sequence so it can be max-pooled
    model.add(Bidirectional(LSTM(lstm_units, return_sequences=True, dropout=0.2)))
    # Global max pooling over the time axis
    model.add(GlobalMaxPooling1D())
    # Dense head with a single sigmoid output
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.05))
    model.add(Dense(1, activation='sigmoid'))
    # Loss function, metrics, optimizer
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # Callbacks
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
    mc = ModelCheckpoint(checkpoint_path, monitor='val_accuracy', mode='max',
                         save_best_only=True, verbose=1)
    return model, es, mc

def fit(model, X_train, X_test, y_train, y_test, es, mc, batch_size=128, epochs=4):
    """Train ``model`` on the training split, validating on the test split.

    Parameters
    ----------
    model : object exposing ``fit(X, y, **kwargs)``
        The model to train.
    X_train, y_train : training inputs and labels.
    X_test, y_test : inputs and labels passed as ``validation_data``.
    es, mc : callbacks forwarded to ``model.fit`` (in that order).
    batch_size : int, default 128
    epochs : int, default 4

    Returns
    -------
    tuple
        ``(model, history)`` where ``history`` is whatever ``model.fit`` returns.
    """
    history = model.fit(
        X_train,
        y_train,
        batch_size=batch_size,
        epochs=epochs,
        validation_data=(X_test, y_test),
        verbose=1,
        callbacks=[es, mc],
    )
    return model, history

In [18]:
# Glue the two splits back together; the cross-validation loop indexes into X and y.
X = np.concatenate([X_train, X_test], axis=0)
y = np.concatenate([y_train, y_test], axis=0)

In [25]:
# Build a model (plus its two callbacks) and print the layer summary.
model, es, mc = build_BiLSTM(size_of_vocabulary)
model.summary()

In [26]:
from sklearn.model_selection import KFold

n_split = 5
histories = []

# Train and score one fresh model per fold; each fold's model is saved to its own file.
for i, (train_index, test_index) in enumerate(KFold(n_split).split(X), start=1):
    print("Fold {}".format(i))
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

    # The callbacks come back with the model but are not passed to this fit call.
    model, es, mc = build_BiLSTM(size_of_vocabulary)
    history = model.fit(X_train, y_train, batch_size=128, epochs=4)
    model.save('model_fold_{}.h5'.format(i))
    histories.append(history)

    print('Model evaluation ', model.evaluate(X_test, y_test))
    print("")

In [27]:
# Load the food-desert dataset and show it.
FOOD_DESERT_CSV = '../input/food-insecurity-dataset/Food_Deserts_and_Insecurity.csv'
food_desert_data = pd.read_csv(FOOD_DESERT_CSV)
food_desert_data

In [29]:
# Independent copy of the user_location column as a NumPy array.
location = food_desert_data['user_location'].to_numpy(copy=True)

In [30]:
import time
# Run the same cleaning pipeline over the food-desert texts and time it.
t0 = time.time()
text = preprocess(food_desert_data['text'])
print('Text Preprocessing complete.')
print(f'Time Taken: {round(time.time() - t0)} seconds')

In [31]:
# The saved models were trained on word ids produced by a Tokenizer fitted on
# the cleaned training corpus held in `processedtext` (see prepare_data()).
# A Tokenizer fitted on the new texts would assign unrelated ids to the same
# words and make the predictions meaningless, so the training vocabulary is
# rebuilt here, before `processedtext` is rebound to the encoded new texts.
if not isinstance(processedtext, list):
    raise RuntimeError(
        "processedtext no longer holds the cleaned training corpus; "
        "re-run the training-corpus preprocessing cell before this one."
    )
tokenizer = Tokenizer()
#preparing vocabulary (same corpus as in training, hence the same word ids)
tokenizer.fit_on_texts(processedtext)
#converting text into integer sequences
food_sequences = tokenizer.texts_to_sequences(text)
#padding to the sequence length the model was trained with
processedtext = pad_sequences(food_sequences, maxlen=120)

In [32]:
from keras.models import load_model

In [38]:
# Restore the model saved for the first fold and score the encoded texts with it.
MODEL_PATH = './model_fold_1.h5'
model = load_model(MODEL_PATH)
sentiment = model.predict(processedtext)

In [39]:
# Flatten the predictions to 1-D and round each one to the nearest integer
# value in a single vectorized step (np.rint, like round(), rounds halves to
# the even neighbour). np.array() copies, so `sentiment` itself is unchanged.
sentiment_rounded = np.rint(np.array(sentiment).reshape(len(sentiment),))

In [58]:
# `model` is the first-fold model, which was fitted on every sample outside the
# first fold; the notebook-level X_test/y_test hold the *last* fold's split,
# i.e. data this model was trained on. Score it on its own held-out fold
# instead: KFold without shuffling is deterministic, so the first split
# reproduces the indices that were held out when this model was trained.
_, fold_1_test_index = next(KFold(n_split).split(X))
model.evaluate(X[fold_1_test_index], y[fold_1_test_index])

In [40]:
# One row per cleaned text with its rounded predicted label, indexed 0..n-1.
food = pd.DataFrame(
    {"text": np.array(text).reshape(len(text),), "label": sentiment_rounded},
    index=np.arange(len(text)),
)

In [41]:
from wordcloud import WordCloud,ImageColorGenerator
from PIL import Image
import urllib
import requests
# NOTE(review): stray expression — it only evaluates the bound method `str().find`
# (echoed as the cell output) and has no effect on the analysis; candidate for removal.
str().find

In [72]:
# All texts predicted positive (label 1), joined into one string.
all_words_positive = ' '.join(food.loc[food['label'] == 1, 'text'])

# Download the mask image. A timeout keeps the cell from hanging forever on an
# unresponsive server, and raise_for_status() surfaces HTTP errors directly
# instead of letting an error page reach the image decoder.
mask_response = requests.get('http://clipart-library.com/image_gallery2/Twitter-PNG-Image.png',
                             stream=True, timeout=30)
mask_response.raise_for_status()
Mask = np.array(Image.open(mask_response.raw))

# ImageColorGenerator takes its colours from the mask image; it is applied
# when the cloud is recolored for display.
image_colors = ImageColorGenerator(Mask)

# Build the word cloud inside the mask shape.
wc = WordCloud(background_color='black', height=1500, width=4000, mask=Mask).generate(all_words_positive)

In [73]:
# Size of the image generated 
plt.figure(figsize=(10,20))

# Here we recolor the words from the dataset to the image's color
# recolor just recolors the default colors to the image's blue color
# interpolation is used to smooth the image generated 
plt.imshow(wc.recolor(color_func=image_colors),interpolation="hamming")
plt.title('Positive Review Common Text')
plt.axis('off')
plt.savefig('Positive_Review_Common_Text.png')
plt.show()

In [74]:
# All texts predicted negative (label 0), joined into one string.
all_words_negative = ' '.join(food.loc[food['label'] == 0, 'text'])

# Download the mask image. A timeout keeps the cell from hanging forever on an
# unresponsive server, and raise_for_status() surfaces HTTP errors directly
# instead of letting an error page reach the image decoder.
mask_response = requests.get('http://clipart-library.com/image_gallery2/Twitter-PNG-Image.png',
                             stream=True, timeout=30)
mask_response.raise_for_status()
Mask = np.array(Image.open(mask_response.raw))

# ImageColorGenerator takes its colours from the mask image; it is applied
# when the cloud is recolored for display.
image_colors = ImageColorGenerator(Mask)

# Build the word cloud inside the mask shape.
wc = WordCloud(background_color='black', height=1500, width=4000, mask=Mask).generate(all_words_negative)

In [75]:
# Size of the image generated 
plt.figure(figsize=(10,20))

# Here we recolor the words from the dataset to the image's color
# recolor just recolors the default colors to the image's blue color
# interpolation is used to smooth the image generated 
plt.imshow(wc.recolor(color_func=image_colors),interpolation="hamming")
plt.title('Negative Review Common Text')
plt.axis('off')
plt.savefig('Negative_Review_Common_Text.png')
plt.show()

In [80]:
def view_common_words(train):
    """Plot and save the 50 most frequent tokens for each label.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame with a ``text`` column of space-separated tokens and a
        ``label`` column where 1 marks positive and 0 negative rows.

    Side effects
    ------------
    Shows two frequency plots and writes them to
    ``Positive_Common_Text_Graph.png`` and ``Negative_Common_Text_Graph.png``
    in the working directory.
    """
    panels = (
        (1, 'Positive Review Common Text', 'blue', 'Positive_Common_Text_Graph.png'),
        (0, 'Negative Review Common Text', 'red', 'Negative_Common_Text_Graph.png'),
    )
    for label, title, color, filename in panels:
        # split() with no argument skips runs of whitespace, so empty strings
        # are never counted as tokens.
        tokens = ' '.join(train[train['label'] == label].text).split()
        freq = FreqDist(tokens)
        plt.figure(figsize=(20, 6))
        # show=False: by default FreqDist.plot calls plt.show() itself, after
        # which savefig would write an empty figure.
        freq.plot(50, cumulative=False, title=title, color=color, show=False)
        plt.savefig(filename)
        plt.show()

In [81]:
# Plot and save the most frequent tokens of the labelled texts, per predicted label.
view_common_words(food)

In [49]:
# Attach the user locations as a new column.
food = food.assign(location=location)

In [50]:
# Display the frame of texts, predicted labels and locations.
food

In [51]:
# Split the rows by predicted label: 1.0 -> positive, 0.0 -> negative.
is_positive = food['label'].eq(1.0)
is_negative = food['label'].eq(0.0)
p_food = food[is_positive]
n_food = food[is_negative]

In [52]:
# Display the rows predicted positive.
p_food

In [53]:
# Display the rows predicted negative.
n_food

In [54]:
# Positive rows: keep only the entries whose location is a str
# (anything else, e.g. a missing value, is skipped together with its text).
p_locations = np.array(p_food['location'])
p_text = np.array(p_food['text'])
p_kept = [(loc, txt) for loc, txt in zip(p_locations, p_text) if type(loc) == str]
p_loc_clean = [loc for loc, _ in p_kept]
p_text_clean = [txt for _, txt in p_kept]

# Negative rows: same filter.
n_locations = np.array(n_food['location'])
n_text = np.array(n_food['text'])
n_kept = [(loc, txt) for loc, txt in zip(n_locations, n_text) if type(loc) == str]
n_loc_clean = [loc for loc, _ in n_kept]
n_text_clean = [txt for _, txt in n_kept]

In [55]:
# Rebuild the positive frame from the rows that have a usable location.
p_food = pd.DataFrame(dict(text=p_text_clean, location=p_loc_clean))
p_food

In [56]:
# Rebuild the negative frame from the rows that have a usable location.
n_food = pd.DataFrame(dict(text=n_text_clean, location=n_loc_clean))
n_food

In [86]:
# All locations attached to positive rows, joined into one string.
p_loc = ' '.join(p_food['location'])

# Download the mask image; the timeout avoids hanging on an unresponsive
# server and raise_for_status() reports HTTP errors before image decoding.
mask_response = requests.get('http://clipart-library.com/image_gallery2/Twitter-PNG-Image.png',
                             stream=True, timeout=30)
mask_response.raise_for_status()
Mask = np.array(Image.open(mask_response.raw))
image_colors = ImageColorGenerator(Mask)
wc = WordCloud(background_color='black', height=1500, width=4000, mask=Mask).generate(p_loc)

# Draw the recolored cloud, save it, then show it.
plt.figure(figsize=(10,20))
plt.imshow(wc.recolor(color_func=image_colors), interpolation="hamming")
plt.title('Positive Locations')
plt.axis('off')
plt.savefig('Positive_Locations.png')
plt.show()

In [87]:
# All locations attached to negative rows, joined into one string.
n_loc = ' '.join(n_food['location'])

# Download the mask image; the timeout avoids hanging on an unresponsive
# server and raise_for_status() reports HTTP errors before image decoding.
mask_response = requests.get('http://clipart-library.com/image_gallery2/Twitter-PNG-Image.png',
                             stream=True, timeout=30)
mask_response.raise_for_status()
Mask = np.array(Image.open(mask_response.raw))
image_colors = ImageColorGenerator(Mask)
wc = WordCloud(background_color='black', height=1500, width=4000, mask=Mask).generate(n_loc)

# Draw the recolored cloud, save it, then show it.
plt.figure(figsize=(10,20))
plt.imshow(wc.recolor(color_func=image_colors), interpolation="hamming")
plt.title('Negative Locations')
plt.axis('off')
plt.savefig('Negative_Locations.png')
plt.show()