## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

%matplotlib inline

## Getting the data

In [2]:
df = pd.read_json('content_baseline.json')
df

Unnamed: 0,Sentence,Presence
0,Help me!,1
1,Help me!,1
2,"""Front desk clerk Shawna Vela said she dialed ...",0
3,"I've been shot,' "" said Rosalinda Gonzalez, an...",0
4,Mid-Market and the Tenderloin are home to a th...,1
...,...,...
20181,"I'd flip open a cell phone, turn on its camera...",1
20182,.,0
20183,)The biggest challenge in documenting my dinin...,0
20184,"Ahumdinger TV season wrapped Wednesday night, ...",1


## Creating the training and test set

In [3]:
X = df['Sentence']
y = df['Presence']

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state=42)

In [5]:
print("X_train shape",X_train.shape)
print("X_test shape",X_test.shape)
print("y_train shape",y_train.shape)
print("y_test shape",y_test.shape)

X_train shape (16148,)
X_test shape (4038,)
y_train shape (16148,)
y_test shape (4038,)


# Since this is the main dataset, we shall take the X_test, y_test as the main test data to run on all the models we create. 

# We shall us the X_train, y_train as the main training data. We will split this into train and test/validation set to and create a TF-IDF based Naive Bayes Model, use the test as the validation set and the Original test data as the test set and calculate the accuracy. We shall do the same for the other models too. 

## 1. TF-IDF based Naive Bayes Classifier

### Using X_train and y_train. Assign them to temporary variables and use those temporary variables x_1, y_1 respectively. 1 indicates the model number we are trying. 

In [6]:
X_1 = X_train
y_1 = y_train

In [7]:
X_1_train, X_1_val, y_1_train, y_1_val = train_test_split(X_1, y_1, test_size=0.2, random_state=42)

In [8]:
print("X_1_train shape",X_1_train.shape)
print("X_1_val shape",X_1_val.shape)
print("y_1_train shape",y_1_train.shape)
print("y_1_val shape",y_1_val.shape)

X_1_train shape (12918,)
X_1_val shape (3230,)
y_1_train shape (12918,)
y_1_val shape (3230,)


In [9]:
#Applying tf-idf vectorizer to train data
tfidf_vectorizer = TfidfVectorizer()

X_1_train_tfidf = tfidf_vectorizer.fit_transform(X_1_train)

In [10]:
#Applying the tf-idf vectorizer to validation data

X_1_val_tfidf = tfidf_vectorizer.transform(X_1_val)

In [11]:
#Applying Naive Bayes to the train data.

naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X_1_train_tfidf, y_1_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [14]:
#Calculating accuracy and the classification report from the test data

y_pred = naive_bayes_classifier.predict(X_1_val_tfidf)

score = metrics.accuracy_score(y_1_val, y_pred)
print("Accuracy on the validation set: %0.3f" %score)

Accuracy on the validation set: 0.583


In [15]:
#Classification Report

print(metrics.classification_report(y_1_val, y_pred, target_names=['Positive', 'Negative']))

              precision    recall  f1-score   support

    Positive       0.55      0.83      0.66      1595
    Negative       0.68      0.34      0.45      1635

    accuracy                           0.58      3230
   macro avg       0.61      0.59      0.56      3230
weighted avg       0.61      0.58      0.56      3230



## We have got an accuracy of 58.3% on the validation set. Let us run this model on the original test set and see the accuracy. 

In [16]:
# We first have to convert the test set to a tfidf vector. 

X_1_test_tfidf = tfidf_vectorizer.transform(X_test)

In [18]:
# Predicting on the Naive Bayes classifier

y_1_test_pred = naive_bayes_classifier.predict(X_1_test_tfidf)

score_test = metrics.accuracy_score(y_test, y_1_test_pred)
print("Accuracy on the validation set: %0.3f" %score_test)

Accuracy on the validation set: 0.587


In [19]:
#Classification Report

print(metrics.classification_report(y_test, y_1_test_pred, target_names=['Positive', 'Negative']))

              precision    recall  f1-score   support

    Positive       0.56      0.83      0.67      2015
    Negative       0.67      0.35      0.46      2023

    accuracy                           0.59      4038
   macro avg       0.61      0.59      0.56      4038
weighted avg       0.61      0.59      0.56      4038



## We can see that on the main test set, the accuracy is 59%. So this is the baseline model.

https://iq.opengenus.org/naive-bayes-on-tf-idf-vectorized-matrix/

https://github.com/gabrieltseng/datascience-projects/tree/master/natural_language_processing/detecting_bullies
   
https://edumunozsala.github.io/BlogEms/jupyter/nlp/classification/embeddings/python/2020/08/15/Intro_NLP_WordEmbeddings_Classification.html