*Update V3: Fixed the calculation of the log prior (`LOG_PRIOR`) in the function `create_naive_bayes_map`.*

In [None]:
import numpy as np
import pandas as pd 
import tqdm.notebook as tqdm
import re
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords         
from nltk.stem import PorterStemmer  

In [None]:
# Register the notebook progress bar with pandas; this is what makes
# `Series.progress_apply` (used by `process_train_data`) available.
tqdm.tqdm_notebook.pandas()
# Do not truncate long cell contents (e.g. full tweets) when displaying frames.
pd.set_option('display.max_colwidth', None)

In [None]:
# English stop words from the NLTK `stopwords` corpus (the corpus must be
# available locally for this call to succeed).
STOP_WORDS = stopwords.words('english') 
# Punctuation filtered out of tokenized tweets. This is a plain string, so
# `token in PUNCTUATIONS` is a substring test.
# NOTE(review): matches `string.punctuation` except that '!' is absent --
# presumably deliberate so exclamation marks survive as tokens; confirm.
PUNCTUATIONS = '"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

## Load Data
***

In [None]:
# Load the train and test splits. The code below reads the `id`, `text` and
# `target` columns of the training frame and `id` / `text` of the test frame.
train_data = pd.read_csv('../input/nlp-getting-started/train.csv')
test_data = pd.read_csv('../input/nlp-getting-started/test.csv')

In [None]:
# Preview the first rows of the training data.
train_data.head()

In [None]:
def clean_text(text):
    """Remove tweet-specific noise from a raw tweet string.

    Three substitutions are applied in order:

    1. a leading ``RT`` marker followed by whitespace,
    2. every ``http://`` / ``https://`` URL (up to the next whitespace),
    3. every ``#`` character (the hashtag word itself is kept).

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The text with the patterns above replaced by the empty string.
    """
    noise_patterns = (
        r'^RT[\s]+',
        r'https?://[^\s\n\r]+',
        r'#',
    )
    for pattern in noise_patterns:
        text = re.sub(pattern, '', text)
    return text

def remove_stop_words_and_puntuation(tokenized_text):
    """Drop stop words and punctuation tokens from a token sequence.

    A token is discarded when it is found in ``PUNCTUATIONS`` or in
    ``STOP_WORDS``; all other tokens are kept in their original order.
    ``PUNCTUATIONS`` is a string, so the punctuation check is a substring
    test rather than an exact single-character match.

    Parameters
    ----------
    tokenized_text : iterable of str
        Tokens of a single tweet.

    Returns
    -------
    list of str
        The surviving tokens.
    """
    return [
        token
        for token in tokenized_text
        if not (token in PUNCTUATIONS or token in STOP_WORDS)
    ]
    
def stemm_text(tokenized_text):
    """Stem every token with NLTK's Porter stemmer.

    Parameters
    ----------
    tokenized_text : iterable of str
        Tokens of a single tweet.

    Returns
    -------
    list of str
        One stem per input token, in the same order.
    """
    porter = PorterStemmer()
    return [porter.stem(token) for token in tokenized_text]

def process_text(text):
    """Turn a raw tweet into a list of cleaned, stemmed tokens.

    Pipeline: ``clean_text`` -> ``TweetTokenizer`` (lower-casing, handle
    stripping, shortening of repeated characters) ->
    ``remove_stop_words_and_puntuation`` -> ``stemm_text``.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    list of str
        Processed tokens of the tweet.
    """
    cleaned = clean_text(text)

    tweet_tokenizer = TweetTokenizer(
        preserve_case=False,
        strip_handles=True,
        reduce_len=True,
    )
    tokens = tweet_tokenizer.tokenize(cleaned)

    return stemm_text(remove_stop_words_and_puntuation(tokens))

def create_frequency_map(data: pd.DataFrame, process_text_enabled = False):
    """Count how often each word occurs together with each target label.

    Rows are read by iterating the ``text`` and ``target`` columns directly,
    so the frame may carry any index (e.g. after filtering or a train/test
    split), not only the default 0..n-1 range.

    Parameters
    ----------
    data : pd.DataFrame
        Frame with a ``text`` column and a ``target`` column.
    process_text_enabled : bool, default False
        When true, each text is passed through ``process_text`` first.
        When false, a string text is split on whitespace, while an already
        tokenized text (any non-string iterable of tokens) is used as is.

    Returns
    -------
    dict
        Maps ``(lower-cased word, target)`` to the number of occurrences.
    """
    frequency_map = {}

    rows = zip(data["text"], data["target"])
    # zip() has no length, so give the progress bar the row count explicitly.
    for text, target in tqdm.tqdm_notebook(rows, total=len(data)):
        if process_text_enabled:
            words = process_text(text)
        elif isinstance(text, str):
            # Iterating a str yields single characters, not words.
            words = text.split()
        else:
            words = text

        for word in words:
            key = (word.lower(), target)
            frequency_map[key] = frequency_map.get(key, 0) + 1

    return frequency_map
    
def process_train_data(train_data : pd.DataFrame):
    """Return a copy of the frame with a ``processed_text`` column added.

    The new column holds ``process_text`` applied to every entry of the
    ``text`` column; a progress bar is shown via ``progress_apply``. The
    input frame is left unmodified.

    Parameters
    ----------
    train_data : pd.DataFrame
        Frame with a ``text`` column.

    Returns
    -------
    pd.DataFrame
        Copy of ``train_data`` including ``processed_text``.
    """
    enriched = train_data.copy()
    tokenized = enriched['text'].progress_apply(process_text)
    enriched['processed_text'] = tokenized
    return enriched

def create_naive_bayes_map(data : pd.DataFrame, process_text_enabled):
    """Train the Naive Bayes word table and the log prior.

    Steps:

    1. Build the ``(word, target)`` frequency map with
       ``create_frequency_map``.
    2. Sum word occurrences per class (target ``1`` is the positive class,
       every other target value counts as negative) and collect the
       vocabulary.
    3. Count positive / negative rows for the prior.
    4. For every word store the raw class-conditional probabilities
       (``pos``, ``neg``), their Laplace-smoothed versions (``pos_smooth``,
       ``neg_smooth``) and ``lambda = log(pos_smooth / neg_smooth)``.
    5. Print summary diagnostics (totals and probability sums).

    Parameters
    ----------
    data : pd.DataFrame
        Training frame; must provide a ``target`` column plus whatever
        ``create_frequency_map`` reads.
    process_text_enabled : bool
        Forwarded to ``create_frequency_map``.

    Returns
    -------
    tuple
        ``(naive_bayes_map, log_prior)`` where ``naive_bayes_map`` maps each
        word to its statistics dict and ``log_prior`` is
        ``log(#positive rows / #negative rows)``.
    """
    columns = data.to_dict()
    word_counts = create_frequency_map(data, process_text_enabled)

    # --- class-wise occurrence totals and vocabulary size -----------------
    total_pos = 0.0
    total_neg = 0.0
    vocabulary = set()
    for (word, label), count in word_counts.items():
        if label == 1:
            total_pos += count
        else:
            total_neg += count
        vocabulary.add(word)
    total_unique_words = len(vocabulary)

    # --- row counts per class (for the prior) ------------------------------
    total_target_pos = 0
    total_target_neg = 0
    for label in columns['target'].values():
        if label == 1:
            total_target_pos += 1
        else:
            total_target_neg += 1

    # --- per-word probabilities --------------------------------------------
    naive_bayes_map = {}
    for (word, label), count in word_counts.items():
        entry = naive_bayes_map.get(word)
        if entry is None:
            # A class in which the word never occurs keeps a raw probability
            # of 0 and the smoothed probability of a zero count.
            entry = {
                'pos': 0.0,
                'neg': 0.0,
                'pos_smooth': 1 / (total_pos + total_unique_words),
                'neg_smooth': 1 / (total_neg + total_unique_words),
            }
            naive_bayes_map[word] = entry

        if label == 1:
            entry['pos'] = count / total_pos
            entry['pos_smooth'] = calculate_laplacian_smoothing(
                count, total_pos, total_unique_words)
        else:
            entry['neg'] = count / total_neg
            entry['neg_smooth'] = calculate_laplacian_smoothing(
                count, total_neg, total_unique_words)

    # --- log-likelihood ratio per word --------------------------------------
    for entry in naive_bayes_map.values():
        entry['lambda'] = np.log(entry['pos_smooth'] / entry['neg_smooth'])

    log_prior = np.log(total_target_pos / total_target_neg)

    # --- diagnostics: the probability columns summed over the vocabulary ----
    sum_pos = 0
    sum_neg = 0
    sum_pos_smooth = 0
    sum_neg_smooth = 0
    for entry in naive_bayes_map.values():
        sum_pos += entry['pos']
        sum_neg += entry['neg']
        sum_pos_smooth += entry['pos_smooth']
        sum_neg_smooth += entry['neg_smooth']

    print(f'POS : {total_pos}, NEG :{total_neg}, Unique_words : {total_unique_words}, LOG_PRIOR : {log_prior}')
    print(f'SUM_POS : {sum_pos}, SUM_NEG : {sum_neg}, SUM_POS_SMOOTH : {sum_pos_smooth}, SUM_NEG_SMOOTH : {sum_neg_smooth}')

    return naive_bayes_map, log_prior
 
def calculate_laplacian_smoothing(freq_value, total , total_unique_words):
    """Return the add-one (Laplace) smoothed probability of a word.

    Parameters
    ----------
    freq_value : number
        Occurrences of the word in the class.
    total : number
        Total word occurrences in the class.
    total_unique_words : number
        Vocabulary size.

    Returns
    -------
    float
        ``(freq_value + 1) / (total + total_unique_words)``.
    """
    smoothed_count = freq_value + 1
    smoothed_total = total + total_unique_words
    return smoothed_count / smoothed_total

def sigmoid(z):
    """Return the logistic function ``1 / (1 + exp(-z))`` of ``z``.

    Parameters
    ----------
    z : scalar or array-like accepted by ``np.exp``
        Input value(s), e.g. a log-odds score.

    Returns
    -------
    NumPy scalar or array
        Value(s) in the interval [0, 1].
    """
    denominator = 1 + np.exp(-z)
    return 1 / denominator

In [None]:
def predict(text, process_text_enabled = True):
    """Return the Naive Bayes log-odds score of a tweet.

    The score is ``LOG_PRIOR`` plus the sum of the ``lambda`` values of all
    tokens found in ``NAIVE_BAYES_MAP``; unknown tokens contribute nothing.
    Feed the result to ``sigmoid`` to obtain a value between 0 and 1.

    Parameters
    ----------
    text : str or iterable of str
        Raw tweet, or the tokens to score when preprocessing is disabled.
    process_text_enabled : bool, default True
        When equal to ``True`` the text is first run through
        ``process_text``; otherwise ``text`` is iterated as given.

    Returns
    -------
    number
        Log-odds score; positive values favour the positive class.
    """
    # Explicit comparison kept on purpose: only values equal to True
    # enable preprocessing.
    tokens = process_text(text) if process_text_enabled == True else text

    score = 0
    for token in tokens:
        if token not in NAIVE_BAYES_MAP:
            continue
        score += NAIVE_BAYES_MAP[token]['lambda']

    return score + LOG_PRIOR

## Create Naïve Bayes Map
***

In [None]:
# Train on the full training set; both globals are read by `predict`.
NAIVE_BAYES_MAP, LOG_PRIOR = create_naive_bayes_map(train_data, process_text_enabled=True)

In [None]:
# Dict-of-dicts -> frame with one column per word and one row per statistic;
# the cells below transpose it (`.T`) to get one row per word.
df_naive_bayes_map = pd.DataFrame(NAIVE_BAYES_MAP)

In [None]:
# First few words with their statistics.
df_naive_bayes_map.T.head()

In [None]:
# Summary statistics of each column across the vocabulary.
df_naive_bayes_map.T.describe()

## Words most likely to denote Disaster
***

In [None]:
# The 10 words with the highest lambda (log-likelihood ratio).
df_naive_bayes_map.T.sort_values(by=['lambda'], ascending=False).head(10)

## Words least likely to denote Disaster
***

In [None]:
# The 10 words with the lowest lambda (log-likelihood ratio).
df_naive_bayes_map.T.sort_values(by=['lambda'], ascending=True).head(10)

## Test
***

In [None]:
# Spot-check a single training example.
idx = 15
target = train_data['target'][idx]
text = train_data['text'][idx]
# NOTE: despite its name, `probability` holds the log-odds score returned by
# `predict`; it only becomes a probability after `sigmoid` in the print below.
probability = predict(text)

# Output format: "<true label> : <predicted probability> - <tweet text>".
print(f'{target} : {sigmoid(probability)} - {text}')

## Submission
***

In [None]:
def create_submission(data : pd.DataFrame):
    """Predict a 0/1 label for every row of ``data``.

    Each text is scored with ``predict``; the row is labelled ``1`` when
    ``sigmoid(score) >= 0.5`` and ``0`` otherwise. Rows are read by iterating
    the columns directly, so the frame may carry any index.

    Parameters
    ----------
    data : pd.DataFrame
        Frame with an ``id`` column and a ``text`` column.

    Returns
    -------
    dict
        ``{'id': [...], 'target': [...]}`` with one entry per row, in row
        order, ready to be wrapped in a ``pd.DataFrame``.
    """
    submission = {'id': [], 'target': []}

    # Read from the `data` argument, not from a module-level frame.
    rows = zip(data["id"], data["text"])
    # zip() has no length, so give the progress bar the row count explicitly.
    for tweet_id, tweet in tqdm.tqdm_notebook(rows, total=len(data)):
        submission['id'].append(tweet_id)

        pred = 1 if sigmoid(predict(tweet)) >= 0.5 else 0
        submission['target'].append(pred)

    return submission

In [None]:
# Predict labels for the test set.
submission = create_submission(test_data)

In [None]:
# Write the predictions as a two-column CSV (`id`, `target`) without the
# frame index.
df_submission = pd.DataFrame(submission)
df_submission.to_csv('submission.csv', index = False)

In [None]:
# Display the submission frame.
df_submission