In [1]:
import pandas as pd

## 1. Create a simple text dataset

In [2]:
df_training_raw_data = pd.DataFrame({'sentence': [
                                        'A great game!', 
                                        'The election was over.', 
                                        'A very clean match.', 
                                        'A clean but forgettable game.', 
                                        'It was a close election.',
                                        'The match was exciting!',
                                        'Are you game?',
                                        'Every single vote counts.',
                                        'Have you voted yet?',
                                        'The election results will be out soon.'], 
                                     'tag': [
                                         'Sports', 
                                         'Non Sports', 
                                         'Sports', 
                                         'Sports', 
                                         'Non Sports',
                                         'Sports',
                                         'Sports',
                                         'Non Sports',
                                         'Non Sports',
                                         'Non Sports']})
df_training_raw_data

Unnamed: 0,sentence,tag
0,A great game!,Sports
1,The election was over.,Non Sports
2,A very clean match.,Sports
3,A clean but forgettable game.,Sports
4,It was a close election.,Non Sports
5,The match was exciting!,Sports
6,Are you game?,Sports
7,Every single vote counts.,Non Sports
8,Have you voted yet?,Non Sports
9,The election results will be out soon.,Non Sports


## 2. Preprocessing

### 2.1 Converting the tags to numbers

In [3]:
df_training_raw_data['num_tag'] = df_training_raw_data.tag.astype('category')
df_training_raw_data.num_tag = df_training_raw_data.num_tag.cat.codes

### 2.2 Cleaning the text

In [4]:
import spacy
import string
list_punctuation = [p for p in string.punctuation]
spacy_nlp = spacy.load('en')

def pre_process_text(str_text):
    sp_text = spacy_nlp(str_text)
    list_filtered_tokens = [token.lemma_ for token in sp_text if ((token.text.lower() not in spacy_nlp.Defaults.stop_words) and (token.text not in list_punctuation))]
    return ' '.join(list_filtered_tokens)

In [5]:
df_training_raw_data['sentence_cleaned'] = df_training_raw_data.sentence.apply(lambda x: pre_process_text(x))

In [6]:
df_training_raw_data

Unnamed: 0,sentence,tag,num_tag,sentence_cleaned
0,A great game!,Sports,1,great game
1,The election was over.,Non Sports,0,election
2,A very clean match.,Sports,1,clean match
3,A clean but forgettable game.,Sports,1,clean forgettable game
4,It was a close election.,Non Sports,0,close election
5,The match was exciting!,Sports,1,match exciting
6,Are you game?,Sports,1,game
7,Every single vote counts.,Non Sports,0,single vote count
8,Have you voted yet?,Non Sports,0,vote
9,The election results will be out soon.,Non Sports,0,election result soon


### 2.3 Text Representation using CountVectorizer() from sklearn

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer()
count_vectorizer.fit(df_training_raw_data.sentence_cleaned.tolist())
df_training_raw_data['sentence_vector'] = count_vectorizer.transform(df_training_raw_data.sentence_cleaned.tolist()).toarray().tolist()

In [8]:
df_training_raw_data

Unnamed: 0,sentence,tag,num_tag,sentence_cleaned,sentence_vector
0,A great game!,Sports,1,great game,"[0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0]"
1,The election was over.,Non Sports,0,election,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,A very clean match.,Sports,1,clean match,"[1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]"
3,A clean but forgettable game.,Sports,1,clean forgettable game,"[1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0]"
4,It was a close election.,Non Sports,0,close election,"[0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
5,The match was exciting!,Sports,1,match exciting,"[0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0]"
6,Are you game?,Sports,1,game,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]"
7,Every single vote counts.,Non Sports,0,single vote count,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1]"
8,Have you voted yet?,Non Sports,0,vote,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"
9,The election results will be out soon.,Non Sports,0,election result soon,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0]"


In [9]:
count_vectorizer.vocabulary_ # word: index number

{'great': 7,
 'game': 6,
 'election': 3,
 'clean': 0,
 'match': 8,
 'forgettable': 5,
 'close': 1,
 'exciting': 4,
 'single': 10,
 'vote': 12,
 'count': 2,
 'result': 9,
 'soon': 11}

## 3. Training and testing a GNB model

In [10]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
import numpy as np

In [11]:
X_train, X_test, y_train, y_test = train_test_split(df_training_raw_data, df_training_raw_data.num_tag, random_state=0)

In [12]:
X_train

Unnamed: 0,sentence,tag,num_tag,sentence_cleaned,sentence_vector
9,The election results will be out soon.,Non Sports,0,election result soon,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0]"
1,The election was over.,Non Sports,0,election,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
6,Are you game?,Sports,1,game,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]"
7,Every single vote counts.,Non Sports,0,single vote count,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1]"
3,A clean but forgettable game.,Sports,1,clean forgettable game,"[1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0]"
0,A great game!,Sports,1,great game,"[0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0]"
5,The match was exciting!,Sports,1,match exciting,"[0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0]"


In [13]:
X_test

Unnamed: 0,sentence,tag,num_tag,sentence_cleaned,sentence_vector
2,A very clean match.,Sports,1,clean match,"[1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]"
8,Have you voted yet?,Non Sports,0,vote,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"
4,It was a close election.,Non Sports,0,close election,"[0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


In [14]:
y_train

9    0
1    0
6    1
7    0
3    1
0    1
5    1
Name: num_tag, dtype: int8

In [15]:
y_test

2    1
8    0
4    0
Name: num_tag, dtype: int8

In [16]:
gnb_model_1 = GaussianNB()
gnb_model_1.fit(np.array(X_train.sentence_vector.tolist()), y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [17]:
y_hat = gnb_model_1.predict(np.array(X_test.sentence_vector.tolist()))
X_test['y_hat'] = y_hat
X_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,sentence,tag,num_tag,sentence_cleaned,sentence_vector,y_hat
2,A very clean match.,Sports,1,clean match,"[1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]",1
8,Have you voted yet?,Non Sports,0,vote,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]",0
4,It was a close election.,Non Sports,0,close election,"[0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0


In [18]:
new_sentences = ['A very close game.', 'A new paradigm in elections.', 'Nice election game!']
new_sentences_cleaned = [pre_process_text(ns) for ns in new_sentences]
new_sentences_vector = count_vectorizer.transform(new_sentences_cleaned).toarray()
new_sentences_vector

array([[0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0]])

In [19]:
gnb_model_1.predict(new_sentences_vector)

array([1, 0, 1], dtype=int8)