In [277]:
import os
import sys
import numpy as np
import pandas as pd
from os.path import join
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import sklearn.naive_bayes
sys.path.append(os.path.abspath('..'))
## Self Imports
from src.utils import *
from nltk.tokenize import word_tokenize, regexp_tokenize
from numpy import argmax
from gensim.models.fasttext import FastText

In [8]:
?sklearn.naive_bayes

[1;31mType:[0m        module
[1;31mString form:[0m <module 'sklearn.naive_bayes' from 'C:\\Users\\vuquy\\Anaconda3\\lib\\site-packages\\sklearn\\naive_bayes.py'>
[1;31mFile:[0m        c:\users\vuquy\anaconda3\lib\site-packages\sklearn\naive_bayes.py
[1;31mDocstring:[0m  
The :mod:`sklearn.naive_bayes` module implements Naive Bayes algorithms. These
are supervised learning methods based on applying Bayes' theorem with strong
(naive) feature independence assumptions.


In [9]:
train_path = join('..', 'Data', 'Raw', 'train.csv')
test_path = join('..', 'Data', 'Raw', 'test.csv')
test_labels_path = join('..', 'Data', 'Raw', 'test_labels.csv')

In [10]:
df = pd.read_csv(train_path)

In [11]:
df_test = pd.read_csv(test_path)

In [12]:
df_test.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [13]:
feats_test = df_test['comment_text']

In [14]:
feats_test.head()

0    Yo bitch Ja Rule is more succesful then you'll...
1    == From RfC == \n\n The title is fine as it is...
2    " \n\n == Sources == \n\n * Zawe Ashton on Lap...
3    :If you have a look back at the source, the in...
4            I don't anonymously edit articles at all.
Name: comment_text, dtype: object

In [15]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [16]:
feature = df['comment_text']

In [17]:
feature.head()

0    Explanation\nWhy the edits made under my usern...
1    D'aww! He matches this background colour I'm s...
2    Hey man, I'm really not trying to edit war. It...
3    "\nMore\nI can't make any real suggestions on ...
4    You, sir, are my hero. Any chance you remember...
Name: comment_text, dtype: object

In [14]:
labels = df.drop('comment_text', axis = 1)

In [15]:
labels.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,0,0,0,0,0,0
1,000103f0d9cfb60f,0,0,0,0,0,0
2,000113f07ec002fd,0,0,0,0,0,0
3,0001b41b1c6bb37e,0,0,0,0,0,0
4,0001d958c54c6e35,0,0,0,0,0,0


In [16]:
labels.columns

Index(['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

In [16]:
labels['toxic'].mean()

0.09584448302009764

In [17]:
df.shape

(159571, 8)

In [18]:
df_test.shape

(153164, 2)

# Using CountVectorizer

In [19]:
vectorizer1 = CountVectorizer(min_df = 0.001, stop_words='english')
X = vectorizer1.fit_transform(feature)

In [20]:
print(vectorizer1.get_feature_names()[:5])

['00', '000', '01', '02', '03']


In [21]:
X.shape

(159571, 3647)

In [22]:
X.data.nbytes/1024/1024

23.413787841796875

In [23]:
type(X)

scipy.sparse.csr.csr_matrix

In [24]:
print(X.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [25]:
vectorizer1.get_feature_names()[:10]

['00', '000', '01', '02', '03', '04', '05', '06', '07', '08']

# Multinomial NB with Count Vectorizer

In [26]:
xtr, xte, ytr, yte = train_test_split(X, labels['toxic'])

In [27]:
clf = MultinomialNB()
clf.fit(xtr, ytr)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [28]:
train_pred = clf.predict(xtr)
train_actual = ytr.values
train_error = np.mean(abs(train_pred - train_actual))
accuracy = 1 - train_error

In [29]:
accuracy

0.9433396280018048

In [30]:
sensitivity(train_actual, train_pred)

0.691886250978346

In [31]:
specificity(train_actual, train_pred)

0.9700681278251786

In [32]:
negative_predictive_value(train_actual, train_pred)

0.9673408060174772

In [33]:
precision(train_actual, train_pred)

0.7107378953010541

In [34]:
f1_score(train_actual, train_pred)

0.7011853875644473

In [35]:
test_pred = clf.predict(xte)
test_actual = yte.values
test_error = np.mean(abs(test_pred - test_actual))
t_accuracy = 1 - test_error

In [36]:
t_accuracy

0.9400897400546462

In [37]:
sensitivity(test_actual, test_pred)

0.6669301712779974

In [38]:
specificity(test_actual, test_pred)

0.9688071361294255

In [39]:
negative_predictive_value(test_actual, test_pred)

0.9651175626448836

In [40]:
precision(test_actual, test_pred)

0.6920973475526387

In [41]:
f1_score(test_actual, test_pred)

0.6792807300053677

# Using TfidfVectorizer

In [79]:
vectorizer2 = TfidfVectorizer(min_df =0.001, stop_words = 'english')
Y = vectorizer2.fit_transform(feature)

In [43]:
Y.shape

(159571, 3647)

In [44]:
print(vectorizer2.get_feature_names()[:9])

['00', '000', '01', '02', '03', '04', '05', '06', '07']


In [45]:
# print(Y.toarray())

# Multinomial NB with Tf-idf Vectorizer

In [46]:
y1, y2, z1, z2 = train_test_split(Y, labels['toxic'])

In [47]:
clf1 = MultinomialNB()
clf1.fit(y1, z1)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [48]:
train_pred1 = clf1.predict(y1)
train_actual1 = z1.values
train_error1 = np.mean(abs(train_pred1 - train_actual1))
accuracy1 = 1 - train_error1

In [49]:
accuracy1

0.9481107638830862

In [50]:
testing_df = pd.DataFrame(['You Suck', 'Hello Friends'])

In [51]:
testing_X = vectorizer2.transform(testing_df[0])

In [52]:
pred_result = clf1.predict_proba(testing_X)

In [53]:
pred_result

array([[0.04411346, 0.95588654],
       [0.85028361, 0.14971639]])

In [54]:
pred_result.max()

0.9558865368625948

In [55]:
for x in range(len(pred_result)):
    m = pred_result[x, 1]
    print(m)

0.9558865368625948
0.14971638503881088


In [56]:
test_pred1 = clf.predict(y2)
test_actual1 = z2.values
test_error1 = np.mean(abs(test_pred1 - test_actual1))
t_accuracy1 = 1 - test_error1
t_accuracy1

0.9474343869851853

In [57]:
sensitivity(test_actual1, test_pred1)

0.4885057471264368

In [58]:
specificity(test_actual1, test_pred1)

0.9961458477748509

In [59]:
negative_predictive_value(test_actual1, test_pred1)

0.9483159117305459

In [60]:
precision(test_actual1, test_pred1)

0.9308113489298159

In [61]:
f1_score(test_actual1, test_pred1)

0.6407401062189482

### Evaluation on different lables

In [84]:
for label in labels.drop('id', axis = 1).columns:
    xtr, xte, ytr, yte = train_test_split(Y, labels[label])
    mnb = MultinomialNB()
    mnb.fit(xtr, ytr)
    
    # Evaluation on training set
    train_pred = mnb.predict(xtr)
    train_actual = ytr.values
    train_error = np.mean(abs(train_pred - train_actual))
    accuracy = 1 - train_error
    print(f'*Evaluation metrics for {label}: ')
    print('**On training set')
    print(f'Accuracy: {accuracy}, Sensitivity: {sensitivity(train_actual, train_pred)}, Specificity: {specificity(train_actual, train_pred)}, NPV: {negative_predictive_value(train_actual, train_pred)}, Precison: {precision(train_actual, train_pred)}, F1 Score: {f1_score(train_actual, train_pred)}')
    
    #Evaluation on test set
    test_pred = mnb.predict(xte)
    test_actual = yte.values
    test_error = np.mean(abs(test_pred - test_actual))
    t_accuracy = 1 - test_error
    t_accuracy
    print('**On test set')
    print(f'Accuracy: {t_accuracy}, Sensitivity: {sensitivity(test_actual, test_pred)}, Specificity: {specificity(test_actual, test_pred)}, NPV: {negative_predictive_value(test_actual, test_pred)}, Precison: {precision(test_actual, test_pred)}, F1 Score: {f1_score(test_actual, test_pred)}')

*Evaluation metrics for toxic: 
**On training set
Accuracy: 0.9482695232206421, Sensitivity: 0.49773991655076494, Specificity: 0.9961820770240538, NPV: 0.9491099974458116, Precison: 0.9327251995438997, F1 Score: 0.6490959587371762
**On test set
Accuracy: 0.947810392800742, Sensitivity: 0.4857519788918206, Specificity: 0.9963160956153229, NPV: 0.9486009652153274, Precison: 0.9326241134751773, F1 Score: 0.6387925052047189
*Evaluation metrics for severe_toxic: 
**On training set
Accuracy: 0.9906081318203847, Sensitivity: 0.195162635529608, Specificity: 0.9986579900235485, NPV: 0.9919101311983904, Precison: 0.5954198473282443, F1 Score: 0.29396984924623115
**On test set
Accuracy: 0.9906499887198256, Sensitivity: 0.19444444444444445, Specificity: 0.9986328075550042, NPV: 0.9919772647251144, Precison: 0.5877862595419847, F1 Score: 0.2922201138519924
*Evaluation metrics for obscene: 
**On training set
Accuracy: 0.973587459683484, Sensitivity: 0.5480387486104494, Specificity: 0.997221756731727

ZeroDivisionError: division by zero

In [214]:
x

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,0,0,0,0,0,0
1,000103f0d9cfb60f,0,0,0,0,0,0
2,000113f07ec002fd,0,0,0,0,0,0
3,0001b41b1c6bb37e,0,0,0,0,0,0
4,0001d958c54c6e35,0,0,0,0,0,0


In [64]:
def _gen_feats(*args, **kwargs):
    """
    Generate features for training and testing
    
    Args:
        args[0]: features (comment_text) of training set
        args[1]: features of test set
        args[2]: chosen type of Vectorizer
        kwargs: hyper parameters used for Vectorizer
        
    Returns:
        Features for training (X) and testing (Xte)
        
    """

    
    vectorizer = args[2](kwargs)
    X = vectorizer.fit_transform(args[0])
    Xte = vectorizer.transform(args[1])
    return X, Xte

In [65]:
# Worked
# _gen_feats(feature, feats_test, TfidfVectorizer, vect_kwargs = {'min_df':0.001, 'stop_words':'english'})

In [66]:
def _get_y_dropcols(*args):
    """
    Extracts all label fields & ID.
    
    Args:
        arg[0]: Raw training dataframe with labels, ID, & comment text
        arg[1]: columns to drop 
        arg[2]: label to predict on
    
    Returns:
    
    """
  
    # Save dropped columns into new variable before dropping
    cmt_id = args[0][args[1]]
    
    # Create new dataframe without dropped columns

    for label in args[0].columns:
        y = args[0][args[2]]
    
    return y, cmt_id


In [67]:
# Worked
# _get_y_dropcols(labels, 'id' )

In [68]:
def preprocessing(*args, **kwargs):
    """
    Preprocessing the raw data to get data ready to be trained and tested
    
    Args:
        args[0]: features (comment_text) of training set
        args[1]: features of test set
        args[2]: chosen type of Vectorizer
        args[3]: labels of training set
        args[4]: columns to drop from label (id)
        args[5]: label to predict on
        kwargs: hyper parameters used for Vectorizer
        
    Returns:
        Training data (X, y), test data(Xte), dropped columns
        
    """
        
    X, Xte = _gen_feats(args[0], args[1], args[2], kwargs)
    y, cmt_id = _get_y_dropcols(args[3], args[4], args[5])
        
    return X, Xte, y, cmt_id

In [69]:
# Worked
# preprocessing(feature, feats_test, TfidfVectorizer, labels, 'id', 
#               vect_kwargs = {'min_df':0.001, 'stop_words':'english'})

In [70]:
def _gen_model(X, y, model_type):
    """
    
    Generate a fitted model ready to make prediction
    
    X: training features (comment_text)
    y: training lables
    
    Returns:
        A model ready for predicting test set
    
    """
    
    # Initialize model type with "model"
    model = model_type()
    # Fitting model with training data X, y
    model.fit(X, y)
    return model

In [71]:
# U better work man
# _gen_model(X, y, MultinomialNB)

In [72]:
def _gen_preds(Xte, model):
    """
    Make prediction on test set and return the predicted probability for the testing label
    
    Args: 
        Xte: test data set with only comment_text column
        model: fitted model generated in gen_model
        
    Returns
        (float) 
    
    """

    pred_result = model.predict_proba(Xte)
    prob = pred_result[:, 1]
    return prob       

In [73]:
def output_transform(raw_output, *args):

    """
    Transform raw output to appropriate form for submission 
    
    Args:
        raw_output:(dataframe) output from _gen_preds function
        args: columns' name
        
    Return:
        Transformed dataframe ready for converting to csv 
        
    """
    # Transpose the output so labels are columns and comments are rows
    trans_df = raw_output.T
    # Rename all columns
    trans_df.columns = args
    # Insert the id column from df_test at position 0 to the left 
    trans_df.insert(0, 'id', df_test['id'], True)
    # Reset index to start from 1 instead of 0
    trans_df.index += 1 
    
    return trans_df

In [74]:
# Creating a path to submission file
save_path01 = join('..', 'Results', 'simple_subm01.csv')

In [75]:
def _gen_kaggle_sub(feature, feats_test, lbs, save_path):
    """
    Generate a csv file to submit to Kaggle
    
    Args:
        lbs: labels of orginal df
        save_path: path to save location
        
    Returns:
        csv file
        
    """
    # Preprocessing the raw data to get data ready to be trained and tested
    lb_proba = []
    for label in lbs.columns:
        X, Xte, y, cmt_id = preprocessing(feature, feats_test, TfidfVectorizer, labels, 'id', label, 
                                          vect_kwargs = {'min_df':0.001, 'stop_words':'english'})
    
        # Generate a fitted model ready to make prediction
        model = _gen_model(X, y, MultinomialNB)
        # Make prediction on test set and return the predicted probability for the testing label
        prob = _gen_preds(Xte, model)
        lb_proba.append(prob)
    
    # Transform the output array to dataframe
    raw_pred_df = pd.DataFrame(lb_proba)
    
    # Transform to appropriate form for submission 
    sub_df = output_transform(raw_pred_df,
                              'toxic', 'severe_toxic', 'obscene', 'threat', 'insult','identity_hate')
    
    sub_df.to_csv(save_path, index = False)
    return sub_df

In [76]:
_gen_kaggle_sub(df['comment_text'], df_test['comment_text'], labels.drop('id', axis=1), save_path01)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
1,00001cee341fdb12,0.401214,4.801955e-07,2.293768e-02,3.035624e-09,6.842720e-03,4.848761e-07
2,0000247867823ef7,0.000854,1.126907e-06,1.285176e-04,1.265841e-07,1.037182e-04,1.525776e-06
3,00013b17ad220c46,0.110318,7.497661e-03,6.060326e-02,1.944498e-03,5.686125e-02,7.324074e-03
4,00017563c3f7919a,0.000071,2.055300e-09,4.193695e-06,7.595696e-11,3.228442e-06,1.559327e-09
5,00017695ad8997eb,0.005842,2.354547e-05,1.500915e-03,2.695010e-06,1.224030e-03,1.932940e-05
6,0001ea8717f6de06,0.000396,2.649962e-08,2.843496e-05,2.190758e-09,2.418763e-05,3.822227e-08
7,00024115d4cbde0f,0.000057,9.143020e-10,3.753479e-06,2.871155e-11,2.464680e-06,9.004787e-10
8,000247e83dcc1211,0.060142,3.967158e-05,8.073634e-03,5.068992e-06,7.758652e-03,4.161255e-05
9,00025358d4737918,0.000110,9.945701e-10,4.062479e-06,6.163175e-11,2.477330e-06,1.468281e-09
10,00026d1092fe71cc,0.000086,6.547442e-09,7.491110e-06,6.159895e-10,5.475115e-06,9.274395e-09


In [None]:
def _gen_all_predictions(ft, lbs, test_df):
    """
    Description of function
    
    Args:
        ft (type_of_ft): Description of ft
        lbs (type_of_lbs): Description of lbs
        
    Returns
        (Type of return) Description of return
    
    """
    vectorizer2 = TfidfVectorizer(min_df=0.001, stop_words = 'english')
    X = vectorizer2.fit_transform(ft)
    ft_test = test_df['comment_text']    
    Xte = vectorizer2.transform(ft_test)
    
    lb_proba = []
    for label in tqdm(lbs.columns):
        y = lbs[label]
#         Xtr, Xte, ytr, yte = train_test_split(X, y)
        clf = MultinomialNB()
        clf.fit(X, y)

        # Make prediction on test set and return the highest probability        
        pred_result = clf.predict_proba(Xte)
        prob = pred_result[:, 1]
        lb_proba.append(prob)

    return lb_proba

In [None]:
AAA = pd.DataFrame(_gen_all_predictions(feature, labels.drop('id', axis=1), df_test))

In [None]:
AAA.T.head()

In [None]:
AAA.shape

In [None]:
BBB = AAA.T

In [None]:
BBB.columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult','identity_hate']
BBB.insert(0, 'id', df_test['id'], True)
BBB.index += 1

In [None]:
BBB.head(10)

In [76]:
BBB.shape

(153164, 7)

In [77]:
# Creating a path to submission file
save_path = join('..', 'Results', 'simple_subm.csv')

In [80]:
def write_kaggle_submission(ft, lbs, test_df, save_path):
    raw_pred_df = pd.DataFrame(_gen_all_predictions(ft, lbs.drop('id', axis=1), test_df))
    sub_df = raw_pred_df.T
    sub_df.columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult','identity_hate']
    sub_df.insert(0, 'id', df_test['id'], True)
    sub_df.index += 1 
    
    sub_df.to_csv(save_path, index = False)
    return sub_df

In [81]:
write_kaggle_submission(feature, labels, df_test, save_path)

100%|██████████| 6/6 [00:00<00:00, 13.78it/s]


Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
1,00001cee341fdb12,0.997059,0.552809,0.993161,0.034322,0.978686,0.499119
2,0000247867823ef7,0.017104,0.002524,0.007965,0.001360,0.008103,0.002858
3,00013b17ad220c46,0.028479,0.000969,0.012154,0.000284,0.011852,0.001526
4,00017563c3f7919a,0.009361,0.000825,0.004365,0.000221,0.004474,0.000524
5,00017695ad8997eb,0.060022,0.001280,0.024679,0.000238,0.021797,0.000992
6,0001ea8717f6de06,0.020556,0.000749,0.007816,0.000270,0.008260,0.000792
7,00024115d4cbde0f,0.009431,0.000249,0.004904,0.000043,0.003987,0.000185
8,000247e83dcc1211,0.454314,0.015219,0.210251,0.006094,0.238466,0.016933
9,00025358d4737918,0.023433,0.000845,0.008921,0.000616,0.007025,0.001274
10,00026d1092fe71cc,0.010371,0.000864,0.006145,0.000554,0.005083,0.001214


# Tokenizing

In [18]:
bala = feature[:5]
bala

0    Explanation\nWhy the edits made under my usern...
1    D'aww! He matches this background colour I'm s...
2    Hey man, I'm really not trying to edit war. It...
3    "\nMore\nI can't make any real suggestions on ...
4    You, sir, are my hero. Any chance you remember...
Name: comment_text, dtype: object

In [263]:
class Tknz():
    
    def __init__(self, comments):
        self.comments = comments
        
    def get_tknized(comments):   
        """ 
        Create a nested list of tokenized comments
        Input: a series of text comments
        Returns: a nested list of words of each comment
        """
        com_tok = []
        for i in range(len(comments)):
            com = regexp_tokenize(comments[i], '[\S]+')
            com_tok.append(com)
        return com_tok

    def get_max_com_len(com_tok):   
        """ 
        Get list of comment's lenghths
        Input: a nested list of tokenized comments
        Returns: 
            - len_com: a list of respective lengths
            - max_len: lenghth of the longest comment
        """
        len_com = []
        for i in com_tok:
            l = len(i)
            len_com.append(l)
        max_com_len = max(len_com)
        return len_com, max_com_len
    
    def get_max_tok_len(com_tok):
        """
        Get the length of the longest tokens for empty string initilization
        Input: a nested list of tokenized comments
        Returns: (int) lenghth of longest tokens
        """
        q = []
        for i in range(len(com_tok)):
            for j in range(len(com_tok[i])):
                #List of every single element's length
                q += [len(com_tok[i][j])]
                max_tok_len = max(q)
        return max_tok_len
    
    def gen_com_array(comments, max_len, max_tok_len):
        """
        Create an array of shape number of comments x length of max comment
        Input: 
        - A series of comments
        - Lenghth of longest comment
        - Lenght of longest token
        Returns: an array with tokens of each comment as a dimension
        """
        
        #Initialize a zero array
        full_arr = np.zeros((len(comments), max_len), dtype = f'U{max_tok_len}')
        
        # Fill array with tokenized comments, one for each dimension
        for i in range(len(comments)):
            for j in range(max_len):
                max_com = com_tok[i]
                a = np.pad(max_com, (0, max_len-len(max_com)), 'constant', constant_values = np.nan)
            full_arr[i] = a
        return full_arr

In [269]:
com_tok = Tknz.get_tknized(bala)
len_com, max_com_len = Tknz.get_max_com_len(com_tok)
max_tok_len = Tknz.get_max_tok_len(com_tok)
ar_01 = Tknz.gen_com_array(bala, max_com_len, max_tok_len)
ar_01

array([['Explanation', 'Why', 'the', 'edits', 'made', 'under', 'my',
        'username', 'Hardcore', 'Metallica', 'Fan', 'were', 'reverted?',
        'They', "weren't", 'vandalisms,', 'just', 'closure', 'on',
        'some', 'GAs', 'after', 'I', 'voted', 'at', 'New', 'York',
        'Dolls', 'FAC.', 'And', 'please', "don't", 'remove', 'the',
        'template', 'from', 'the', 'talk', 'page', 'since', "I'm",
        'retired', 'now.89.205.38.27', 'nan', 'nan', 'nan', 'nan', 'nan',
        'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan',
        'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan',
        'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan',
        'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan',
        'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan',
        'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan',
        'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan',
        'nan', 'na

In [278]:
sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]

model = FastText(sentences, min_count=1)
say_vector = model.wv['say']  # get vector for word
of_vector = model.wv['of']  # get vector for out-of-vocab word


  "C extension not loaded, training will be slow. "
