In [17]:
#https://medium.com/@saad.arshad102/sentiment-analysis-text-classification-using-rnn-bi-lstm-recurrent-neural-network-81086dda8472

In [5]:
import pandas as pd
import re
import string
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

# Load the raw CSV and keep an independent copy of just the free-text
# description and its civic/non-civic label; `temp` still holds the full frame.
temp = pd.read_csv('finalData.csv')
data = temp[['description', 'civic_issue']].copy()

# Number of rows in the working frame (rendered as the cell's output).
len(data)

[nltk_data] Downloading package punkt to /home/shriya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


32120

In [6]:
# A notebook cell only renders its final expression, so the intermediate
# summaries are printed explicitly instead of being silently discarded.

# number of rows, columns
print(data.shape)

# number of positive, negative (positive -> civic, negative -> non civic)
print(data.civic_issue.value_counts())

data.head()

Unnamed: 0,description,civic_issue
0,Garbage behind the temple,1
1,Air pollution,1
2,Air pollution in hebbal,1
3,Garbage is dumped near BES,1
4,Leaf and garbage burning on Shakthi Ganapathi ...,1


In [7]:
#data cleaning
# 1) we remove punctuation marks

def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` removed.

    Parameters
    ----------
    text : str
        The string to clean. Non-string input makes ``re.sub`` raise
        ``TypeError``.

    Returns
    -------
    str
        ``text`` without ASCII punctuation; all other characters, including
        whitespace, are left untouched.
    """
    # string.punctuation contains '\', ']', '^' and '-', which are special
    # inside a character class. Unescaped, the sequence '\]' is read as an
    # escaped bracket and the backslash itself is never matched, so the set
    # must be built from the escaped characters.
    return re.sub('[' + re.escape(string.punctuation) + ']', '', text)
# Strip punctuation from every description and keep the result in a new column.
data['Text_Clean'] = data['description'].apply(remove_punctuation)

print(data['Text_Clean'])

0                                Garbage behind the temple
1                                            Air pollution
2                                  Air pollution in hebbal
3                               Garbage is dumped near BES
4        Leaf and garbage burning on Shakthi Ganapathi ...
                               ...                        
32115    gets new phone hopefully tomorrow but mail doe...
32116    LeslieG stack is injured  are you a rowdy fan ...
32117                        ryanbader  Bummerlove Chuck  
32118         Kinda sad that my show time is over for now 
32119              good night and its still only day four 
Name: Text_Clean, Length: 32120, dtype: object


In [8]:
# 2) tokenisation: turn each cleaned sentence into a list of word tokens.
# Every value is passed through str() before being handed to the tokenizer.
tokens = [word_tokenize(str(sentence)) for sentence in data['Text_Clean']]


In [9]:
# 3) convert these tokenised words into their lower cases
def lower_token(tokens):
    """Return a new list holding the lower-cased form of each token."""
    lowered = []
    for token in tokens:
        lowered.append(token.lower())
    return lowered
    
# Lower-case every tokenised sentence, keeping one token list per sentence.
lower_tokens = list(map(lower_token, tokens))

In [10]:
# 4) remove the stopwords using NLTK's stopwords
# The stop-word lists are a separate NLTK corpus. Fetch it the same way
# 'punkt' is fetched, otherwise stopwords.words() raises LookupError on an
# environment where the corpus has never been downloaded.
nltk.download('stopwords', quiet=True)
stoplist = stopwords.words('english')
def removeStopWords(tokens):
    """Return the tokens that do not appear in the module-level ``stoplist``.

    The order of the surviving tokens, and any duplicates among them, are
    preserved. ``stoplist`` is read on every call, so later changes to it are
    honoured.
    """
    # Membership in a list is a linear scan repeated for every token; a set
    # built once per call turns each check into a hash lookup.
    stopword_set = set(stoplist)
    return [word for word in tokens if word not in stopword_set]
# Drop stop words sentence by sentence, then store both the space-joined text
# and the raw token list for every row.
filtered_words = list(map(removeStopWords, lower_tokens))
data['Text_Final'] = [' '.join(words) for words in filtered_words]
data['tokens'] = filtered_words

In [51]:
# since we have a binary category model (civic issue or non civic issue), 
#we add two one hot encoded columns to our data frame

# Validate the labels up front: a value other than 0 or 1 has no one-hot
# encoding, and silently skipping it would leave the new columns shorter than
# the frame.
labels = data['civic_issue']
unexpected_labels = labels[~labels.isin([0, 1])]
if not unexpected_labels.empty:
    raise ValueError(
        "civic_issue must be 0 or 1; found unexpected values: %s"
        % sorted(set(map(repr, unexpected_labels)))
    )

# Vectorised one-hot encoding: Civic flags label 1, Non_Civic flags label 0.
civic = labels.eq(1).astype(int).tolist()
non_civic = labels.eq(0).astype(int).tolist()
data['Civic'] = civic
data['Non_Civic'] = non_civic

# NOTE: this rebinding drops 'description' and 'Text_Clean', so the cleaning
# cells that read those columns cannot be re-run afterwards without reloading
# the data first.
data = data[['Text_Final', 'tokens', 'civic_issue', 'Civic', 'Non_Civic']]
data.head()

Unnamed: 0,Text_Final,tokens,civic_issue,Civic,Non_Civic
0,garbage behind temple,"[garbage, behind, temple]",1,1,0
1,air pollution,"[air, pollution]",1,1,0
2,air pollution hebbal,"[air, pollution, hebbal]",1,1,0
3,garbage dumped near bes,"[garbage, dumped, near, bes]",1,1,0
4,leaf garbage burning shakthi ganapathi temple ...,"[leaf, garbage, burning, shakthi, ganapathi, t...",1,1,0


In [11]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# random_state pins the shuffle seed so that every run produces the same split.
TEST_FRACTION = 0.20
SPLIT_SEED = 42

data_train, data_test = train_test_split(
    data,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

In [12]:
# Summarise the training split: total number of tokens, the sorted set of
# distinct tokens (the vocabulary), and the length of the longest sentence.
all_training_words = []
training_sentence_lengths = []
for sentence_tokens in data_train["tokens"]:
    all_training_words.extend(sentence_tokens)
    training_sentence_lengths.append(len(sentence_tokens))

TRAINING_VOCAB = sorted(set(all_training_words))
print("%s words total, with a vocabulary size of %s" % (len(all_training_words), len(TRAINING_VOCAB)))
print("Max sentence length is %s" % max(training_sentence_lengths))

263691 words total, with a vocabulary size of 34562
Max sentence length is 47


In [14]:
# Summarise the test split: total number of tokens, the sorted set of distinct
# tokens (the vocabulary), and the length of the longest sentence.
all_test_words = []
test_sentence_lengths = []
for sentence_tokens in data_test["tokens"]:
    all_test_words.extend(sentence_tokens)
    test_sentence_lengths.append(len(sentence_tokens))

TEST_VOCAB = sorted(set(all_test_words))
print("%s words total, with a vocabulary size of %s" % (len(all_test_words), len(TEST_VOCAB)))
print("Max sentence length is %s" % max(test_sentence_lengths))


65832 words total, with a vocabulary size of 13859
Max sentence length is 44
