<a href="https://colab.research.google.com/github/SelinErcan/NLP/blob/main/sentiment_analyzer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Third-party dependencies: word2vec-keras provides Word2VecKeras (the model used
# by SentimentAnalyzer) and mlflow is used for run tracking.
# NOTE(review): versions are unpinned, so a fresh runtime may resolve to releases
# different from the ones that produced the recorded outputs — consider pinning.
!pip install word2vec-keras
!pip install mlflow



In [None]:
from sklearn.datasets import fetch_20newsgroups
from word2vec_keras import Word2VecKeras
from pprint import pprint
import pandas as pd
import matplotlib.pyplot as plt
import itertools
import numpy as np
import nltk
import string
import re
import ast # abstract syntax tree: https://docs.python.org/3/library/ast.html
from sklearn.model_selection import train_test_split
import mlflow
import mlflow.sklearn

import io
import nltk
# NLTK corpora required at runtime: 'stopwords' is read by
# Preprocessing.remove_stopwords and 'wordnet' by Preprocessing.lemmatizing.
nltk.download('stopwords')
nltk.download('wordnet')

# Mount Google Drive. SentimentAnalyzer.load_data opens the raw review files via
# the relative path 'gdrive/My Drive/...', which only resolves to this mount when
# the working directory is /content.
from google.colab import drive 
drive.mount('/content/gdrive')

%matplotlib inline

import tensorflow as tf 
# Silence TensorFlow INFO/WARNING chatter. `tf.logging` only exists in
# TensorFlow 1.x; `tf.compat.v1.logging` is available in both late 1.x releases
# and 2.x, so this line no longer raises AttributeError on a newer runtime.
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

Using TensorFlow backend.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Preprocessing

In [None]:
class Preprocessing(object):
    """Clean the raw ``body_text`` column of a DataFrame into a single text column.

    The pipeline (see :meth:`preprocessing`) strips punctuation, lower-cases and
    tokenizes, removes English stop words, stems and lemmatizes, round-trips the
    frame through a CSV file, and finally joins the lemmatized tokens back into
    one string stored under ``target_column_name``.

    Args:
        data: DataFrame holding at least a ``body_text`` column.
        target_column_name: Name of the column that receives the cleaned text.
    """

    # Translation table deleting every character of string.punctuation:
    # '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    _punctuation_table = str.maketrans('', '', string.punctuation)

    # English stop words, built once on first use and shared by all instances
    # (the original rebuilt the list for every row and searched it linearly).
    _english_stopwords = None

    def __init__(self, data, target_column_name='body_text_clean'):
        self.data = data
        self.feature_name = target_column_name

    def remove_punctuation(self, text):
        """Return ``text`` with every ``string.punctuation`` character deleted.

        Characters are removed, not replaced by a space, so words separated
        only by punctuation are joined (``"a.b"`` becomes ``"ab"``).
        """
        return text.translate(self._punctuation_table)

    def tokenize(self, text):
        """Split ``text`` into its maximal runs of word characters.

        Unlike ``re.split(r'\\W+', text)``, this never yields empty-string
        tokens when the text starts or ends with a non-word character.
        """
        return re.findall(r'\w+', text)

    def remove_stopwords(self, tokenized_list):
        """Return the tokens that are not NLTK English stop words.

        Requires the NLTK ``stopwords`` corpus (``nltk.download('stopwords')``).
        """
        if Preprocessing._english_stopwords is None:
            Preprocessing._english_stopwords = frozenset(
                nltk.corpus.stopwords.words('english'))
        stopwords = Preprocessing._english_stopwords
        return [word for word in tokenized_list if word not in stopwords]

    def stemming(self, tokenized_text):
        """Return the Porter stem of every token."""
        stemmer = nltk.PorterStemmer()
        return [stemmer.stem(word) for word in tokenized_text]

    def lemmatizing(self, tokenized_text):
        """Return the WordNet lemma of every token.

        Requires the NLTK ``wordnet`` corpus (``nltk.download('wordnet')``).
        """
        lemmatizer = nltk.WordNetLemmatizer()
        return [lemmatizer.lemmatize(word) for word in tokenized_text]

    def tokens_to_string(self, tokens_string):
        """Join a token list into one space-separated string.

        Args:
            tokens_string: Either a list/tuple of string tokens, or the string
                representation of one (what a token-list column looks like
                after a CSV round trip), e.g. ``"['a', 'b']"``.

        Returns:
            The joined text, or ``None`` when the value is missing, cannot be
            parsed, or does not describe a sequence of strings.
        """
        if isinstance(tokens_string, (list, tuple)):
            tokens = tokens_string
        elif isinstance(tokens_string, str):
            try:
                tokens = ast.literal_eval(tokens_string)
            except (ValueError, SyntaxError, TypeError):
                return None
        else:
            # e.g. NaN produced by read_csv for an empty cell
            return None

        if not isinstance(tokens, (list, tuple)):
            return None
        try:
            return " ".join(tokens)
        except TypeError:
            # the sequence contains non-string items
            return None

    def dropna(self):
        """Drop rows whose cleaned-text column is null and return the frame.

        The frame is always returned (the original returned ``None`` when there
        was nothing to drop).
        """
        self.data = self.data.dropna(subset=[self.feature_name])
        return self.data

    def preprocessing(self, feature):
        """Run the full cleaning pipeline and return a two-purpose frame.

        Args:
            feature: Name of the text column to keep. ``'body_text'`` keeps the
                raw text and drops the cleaned column; any other value keeps
                the cleaned column and drops ``'body_text'``.

        Returns:
            ``self.data`` with all intermediate columns and the text column
            that was not selected removed.
        """
        # Work on a private copy so the DataFrame passed in by the caller does
        # not silently gain the intermediate columns.
        self.data = self.data.copy()

        self.data['body_text_nopunc'] = self.data['body_text'].apply(self.remove_punctuation)
        self.data['body_text_tokenized'] = self.data['body_text_nopunc'].apply(
            lambda text: self.tokenize(text.lower()))
        self.data['body_text_nostop'] = self.data['body_text_tokenized'].apply(self.remove_stopwords)
        self.data['body_text_stemmed'] = self.data['body_text_nostop'].apply(self.stemming)
        self.data['body_text_lemmatized'] = self.data['body_text_nostop'].apply(self.lemmatizing)

        # Save the cleaned dataset to CSV and load it back. After the round
        # trip the token-list columns hold the string form of each list, which
        # tokens_to_string parses again below.
        self.save()
        self.load()

        self.data[self.feature_name] = self.data['body_text_lemmatized'].apply(self.tokens_to_string)

        self.dropna()

        # Drop whichever text column was not selected. The cleaned column is
        # referred to by self.feature_name so a non-default target name works.
        unused_text_column = self.feature_name if feature == 'body_text' else 'body_text'
        drop_columns = ['body_text_nopunc', 'body_text_tokenized', 'body_text_nostop',
                        'body_text_stemmed', 'body_text_lemmatized', unused_text_column]
        self.data = self.data.drop(columns=drop_columns)

        return self.data

    def save(self, filepath="sentiment_cleaned.csv"):
        """Write ``self.data`` to ``filepath`` as comma-separated values without the index."""
        self.data.to_csv(filepath, index=False, sep=',')

    def load(self, filepath="sentiment_cleaned.csv"):
        """Replace ``self.data`` with the contents of the CSV at ``filepath`` and return it."""
        self.data = pd.read_csv(filepath)
        return self.data

## Classification

In [None]:
class SentimentAnalyzer(object):
    """Positive/negative text classifier built on Word2VecKeras with MLflow tracking.

    Typical use is a single call to :meth:`mlFlow`, which loads the data,
    prepares the train/test split, trains, evaluates, prints one sample
    prediction and logs parameters, accuracy and the model to MLflow.
    """

    def __init__(self):
        self.model = Word2VecKeras()

    def load_data(self, encoding='utf-8'):
        """Read the positive and negative review files into a shuffled DataFrame.

        Each line of a file becomes one row with columns ``label``
        (``'positive'`` / ``'negative'``) and ``body_text`` (the line, including
        its trailing newline).

        Args:
            encoding: Text encoding of the two input files.
                NOTE(review): assumes the files are UTF-8 — confirm.

        Returns:
            The shuffled DataFrame, also stored as ``self.raw_data``.
        """
        file_positive = 'gdrive/My Drive/Positive_RAW.txt'
        file_negative = 'gdrive/My Drive/Negative_RAW.txt'

        frames = []
        for label, path in (("positive", file_positive), ("negative", file_negative)):
            # A separate name for the handle keeps the path variable intact.
            with open(path, encoding=encoding) as handle:
                rows = [[label, line] for line in handle]
            frames.append(pd.DataFrame(rows, columns=['label', 'body_text']))

        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        df = pd.concat(frames, ignore_index=True)
        # Shuffle all rows; with random_state unset this draws from NumPy's
        # global random state (seeded in mlFlow).
        self.raw_data = df.sample(frac=1.0)

        labels = self.raw_data['label']
        print('Rows: {}, Columns: {}'.format(self.raw_data.shape[0], self.raw_data.shape[1]))
        print("Total rows: {}, positive: {}, negative: {}".format(len(self.raw_data),
                                                       len(self.raw_data[labels == 'positive']),
                                                       len(self.raw_data[labels == 'negative'])))

        print("Total number of missing labels: {}".format(labels.isnull().sum()))
        print("Total number of missing text: {}".format(self.raw_data['body_text'].isnull().sum()))

        return self.raw_data

    def split_data(self):
        """Shuffle and split ``self.x`` / ``self.y`` into 75% train and 25% test."""
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(
            self.x, self.y, test_size=0.25, random_state=42)

    def numpy_to_list(self):
        """Convert the four split NumPy arrays to Python lists for the word2vec-keras API."""
        self.x_train = self.x_train.tolist()
        self.y_train = self.y_train.tolist()
        self.x_test = self.x_test.tolist()
        self.y_test = self.y_test.tolist()

    def prepare_data(self, feature, label='label'):
        """Load, clean and split the data.

        The Preprocessing pipeline always runs; ``feature`` only selects which
        text column is kept as the model input.

        Args:
            feature: ``'body_text'`` for the raw text or ``'body_text_clean'``
                for the cleaned text.
            label: Name of the target column.

        Returns:
            The prepared DataFrame, also stored as ``self.data``.
        """
        self.load_data()
        pp = Preprocessing(self.raw_data)
        self.data = pp.preprocessing(feature)

        print('self.data type: ', type(self.data))

        self.x = self.data[feature].values
        self.y = self.data[label].values
        self.split_data()
        self.numpy_to_list()

        print(self.data)
        return self.data

    def train_model(self):
        """Train the Word2Vec embedding and the Keras LSTM classifier.

        The hyperparameters are stored on the instance so that :meth:`mlFlow`
        can log them.

        Note on ``w2v_min_count``: if it is set higher than the frequency of
        every word in the corpus, the vocabulary is empty and gensim fails with
        "RuntimeError: you must first build vocabulary before training the
        model." Use a lower value in that case.
        """
        self.w2v_size = 300
        self.w2v_min_count = 1 # 5
        self.w2v_epochs = 100
        self.k_epochs = 5 # 32
        self.k_lstm_neurons = 512
        self.k_max_sequence_len = 1000

        self.model.train(self.x_train, self.y_train,
            w2v_size=self.w2v_size,
            w2v_min_count=self.w2v_min_count,
            w2v_epochs=self.w2v_epochs,
            k_epochs=self.k_epochs,
            k_lstm_neurons=self.k_lstm_neurons,
            k_max_sequence_len=self.k_max_sequence_len,
            k_hidden_layer_neurons=[])

    def evaluate(self):
        """Evaluate on the test split and store accuracy, report and confusion matrix.

        Returns:
            The result dictionary returned by the model's ``evaluate``.
        """
        self.result = self.model.evaluate(self.x_test, self.y_test)
        self.accuracy = self.result["ACCURACY"]
        self.clf_report_df = pd.DataFrame(self.result["CLASSIFICATION_REPORT"])
        self.cnf_matrix = self.result["CONFUSION_MATRIX"]
        print('Confusion Matrix: ', self.cnf_matrix)
        return self.result

    def predict(self, idx=1):
        """Print the label, text and model prediction for test sample ``idx``."""
        print("LABEL:", self.y_test[idx])
        print("TEXT :", self.x_test[idx])
        # "\n" (the original printed the two literal characters "/n")
        print("\n============================================")
        print("PREDICTION:", self.model.predict(self.x_test[idx]))

    def mlFlow(self, feature='body_text_clean'):
        """Run the whole experiment inside one MLflow run.

        Args:
            feature: Text column used as model input; ``'body_text'`` selects
                the raw text, the default selects the cleaned text.
        """
        np.random.seed(40)
        with mlflow.start_run():
            self.prepare_data(feature=feature) # feature should be 'body_text' if no need to preprocessing
            self.train_model()
            self.evaluate()
            self.predict()
            mlflow.log_param("feature", feature)
            mlflow.log_param("w2v_size", self.w2v_size)
            mlflow.log_param("w2v_min_count", self.w2v_min_count)
            mlflow.log_param("w2v_epochs", self.w2v_epochs)
            # k_epochs is a tuned hyperparameter too; log it with the others.
            mlflow.log_param("k_epochs", self.k_epochs)
            mlflow.log_param("k_lstm_neurons", self.k_lstm_neurons)
            mlflow.log_param("k_max_sequence_len", self.k_max_sequence_len)
            mlflow.log_metric("accuracy", self.accuracy)
            # NOTE(review): the model is a Word2VecKeras object logged through
            # the scikit-learn flavor — confirm it can be reloaded from the run.
            mlflow.sklearn.log_model(self.model, "Word2Vec-Keras")
        

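## No Preprocessing

Baseline run: the model is trained and evaluated on the raw `body_text` column (`feature='body_text'`). The cleaning pipeline still executes, but its cleaned column is discarded.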


In [None]:
# Baseline run: the raw 'body_text' column is used as the model input.
sentiment_clf_np = SentimentAnalyzer()
sentiment_clf_np.mlFlow(feature='body_text')

Rows: 10655, Columns: 2
Total rows: 10655, positive: 5327, negative: 5328
Total number of missing labels: 0
Total number of missging text: 0


2020-01-16 07:39:45,891 : INFO : Build & train Word2Vec model
2020-01-16 07:39:45,893 : INFO : collecting all words and their counts
2020-01-16 07:39:45,894 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types


self.data[feture] type:  <class 'pandas.core.frame.DataFrame'>
          label                                          body_text
0      positive    benim çok nadirdir bir filmi birden fazla iz...
1      positive    harika bir klasik.ayrica filmin sonlarinda k...
2      positive    sanilanin aksime türkçe düblajla da gayet gü...
3      negative    lütfen gitmeyin paraniz ve zamaniniz yaninda...
4      positive    siki bir film.meraklilarina tavsiye ederim.....
...         ...                                                ...
10650  negative    ne oyunlar döndügünü güzel anlatiyor. filme ...
10651  positive    kimseyi bilmem ama bnm psikolojim bozuldu ya...
10652  negative    hayatimda sinemada izledigim en igrenç film,...
10653  negative     yazik yazik,imdb top 100 de 1. sirada!sinem...
10654  negative    bir sinemada bukadar sikilip daraldigimi hat...

[10655 rows x 2 columns]


2020-01-16 07:39:45,928 : INFO : collected 24187 word types from a corpus of 158125 raw words and 7991 sentences
2020-01-16 07:39:45,929 : INFO : Loading a fresh vocabulary
2020-01-16 07:39:45,965 : INFO : effective_min_count=1 retains 24187 unique words (100% of original 24187, drops 0)
2020-01-16 07:39:45,965 : INFO : effective_min_count=1 leaves 158125 word corpus (100% of original 158125, drops 0)
2020-01-16 07:39:46,030 : INFO : deleting the raw counts dictionary of 24187 items
2020-01-16 07:39:46,031 : INFO : sample=0.001 downsamples 39 most-common words
2020-01-16 07:39:46,032 : INFO : downsampling leaves estimated 132432 word corpus (83.8% of prior 158125)
2020-01-16 07:39:46,071 : INFO : estimated required memory for 24187 words and 300 dimensions: 70142300 bytes
2020-01-16 07:39:46,072 : INFO : resetting layer weights
2020-01-16 07:39:50,256 : INFO : training model with 2 workers on 24187 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
2020-01-16

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1000, 300)         7256400   
_________________________________________________________________
lstm_1 (LSTM)                (None, 512)               1665024   
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 1026      
Total params: 8,922,450
Trainable params: 1,666,050
Non-trainable params: 7,256,400
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


2020-01-16 07:47:48,665 : INFO : Done


Confusion Matrix:  [[1173  174]
 [ 153 1164]]
LABEL: negative
TEXT :    ilk filmden daha kötu hayal kirikligina ugradim. 

PREDICTION: {'label': 'negative', 'confidence': 0.6220336556434631, 'elapsed_time': 0.568145751953125}


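## Preprocessed

The model is trained and evaluated on the cleaned `body_text_clean` column, which is the default `feature`.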



In [None]:
# Run with the default feature ('body_text_clean'), i.e. the cleaned text.
sentiment_clf = SentimentAnalyzer()
sentiment_clf.mlFlow()

Rows: 10655, Columns: 2
Total rows: 10655, positive: 5327, negative: 5328
Total number of missing labels: 0
Total number of missging text: 0


2020-01-16 07:48:44,959 : INFO : Build & train Word2Vec model
2020-01-16 07:48:44,960 : INFO : collecting all words and their counts
2020-01-16 07:48:44,961 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-01-16 07:48:44,992 : INFO : collected 27250 word types from a corpus of 146670 raw words and 7991 sentences
2020-01-16 07:48:44,992 : INFO : Loading a fresh vocabulary


self.data[feture] type:  <class 'pandas.core.frame.DataFrame'>
          label                                    body_text_clean
0      positive   benim çok nadirdir bir filmi birden fazla izl...
1      positive   harika bir klasikayrica filmin sonlarinda küç...
2      positive   sanilanin aksime türkçe düblajla da gayet güz...
3      negative   lütfen gitmeyin paraniz zamaniniz yaninda sin...
4      positive         siki bir filmmeraklilarina tavsiye ederim 
...         ...                                                ...
10650  negative   ne oyunlar döndügünü güzel anlatiyor filme gü...
10651  positive   kimseyi bilmem ama bnm psikolojim bozuldu ya ...
10652  negative   hayatimda sinemada izledigim en igrenç film h...
10653  negative   yazik yazikimdb top 100 de 1 siradasinema sev...
10654  negative   bir sinemada bukadar sikilip daraldigimi hati...

[10655 rows x 2 columns]


2020-01-16 07:48:45,032 : INFO : effective_min_count=1 retains 27250 unique words (100% of original 27250, drops 0)
2020-01-16 07:48:45,033 : INFO : effective_min_count=1 leaves 146670 word corpus (100% of original 146670, drops 0)
2020-01-16 07:48:45,098 : INFO : deleting the raw counts dictionary of 27250 items
2020-01-16 07:48:45,099 : INFO : sample=0.001 downsamples 36 most-common words
2020-01-16 07:48:45,100 : INFO : downsampling leaves estimated 124110 word corpus (84.6% of prior 146670)
2020-01-16 07:48:45,145 : INFO : estimated required memory for 27250 words and 300 dimensions: 79025000 bytes
2020-01-16 07:48:45,146 : INFO : resetting layer weights
2020-01-16 07:48:49,785 : INFO : training model with 2 workers on 27250 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
2020-01-16 07:48:50,044 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-01-16 07:48:50,059 : INFO : worker thread finished; awaiting finish of 0 more threads
2

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 1000, 300)         8175300   
_________________________________________________________________
lstm_2 (LSTM)                (None, 512)               1665024   
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 1026      
Total params: 9,841,350
Trainable params: 1,666,050
Non-trainable params: 8,175,300
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


2020-01-16 07:56:51,363 : INFO : Done


Confusion Matrix:  [[1194  153]
 [ 227 1090]]
LABEL: negative
TEXT :  ilk filmden daha kötu hayal kirikligina ugradim 
PREDICTION: {'label': 'negative', 'confidence': 0.7479504346847534, 'elapsed_time': 0.5456364154815674}
