In [9]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
import numpy as np
# import mglearn
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
import itertools
from nltk.corpus import treebank
from nltk.tag.sequential import ClassifierBasedPOSTagger

In [3]:
new_df = pd.read_csv('transcripts_final_y.csv')

In [5]:
import random
random.seed(0)
samples= list(new_df.person_id.unique())
num_samples = len(samples)
train_ids = random.sample(samples,round(0.9*num_samples))
test_ids = list(set(samples)-set(train_ids))

df_train=pd.DataFrame()
for i in train_ids:
    df_train=df_train.append(new_df[new_df['person_id']==i])

df_test=pd.DataFrame()
for i in test_ids:
    df_test=df_test.append(new_df[new_df['person_id']==i])

X_train = df_train.drop('y',axis=1)
X_test = df_test.drop('y',axis=1)
y_train = df_train['y']
y_test = df_test['y']

In [21]:
# -------------- Main code
train = X_train.copy()
train['y'] = y_train.values
nltk.download('treebank')
train_sents = treebank.tagged_sents()
tagger = ClassifierBasedPOSTagger(train=train_sents)
stemmer = SnowballStemmer('english')


[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\asim.tewari\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!


In [22]:
train.shape

(810, 11)

In [11]:
# Define tag sequences
SEQ_1 = "SEQ_1: {<DT|PP>?<JJ>*}"
SEQ_2 = "SEQ_2: {<NN><DT|PP\$>?<JJ>}"
SEQ_3 = "SEQ_3: {<NP>?<VERB>?<NP|JJ>}"
SEQ_4 = "SEQ_4: {<VB.*><NP|PP|CLAUSE>+$}"

cp1 = nltk.RegexpParser(SEQ_1)
cp2 = nltk.RegexpParser(SEQ_2)
cp3 = nltk.RegexpParser(SEQ_3)
cp4 = nltk.RegexpParser(SEQ_4)

lst_seq = list([cp1, cp2, cp3, cp4])

In [35]:
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


def get_number_of_spaces(sentence):
    return sentence.count(' ')

def get_number_of_fillers(sentence):
    sentence = sentence.lower()
    return sentence.count('um')+sentence.count('uh')+sentence.count('uhh')+sentence.count('umm')+sentence.count('uhm')

def get_number_of_other_fillers(sentence):
    sentence = sentence.lower()
    return sentence.count('so')+sentence.count('well')+sentence.count('oh')+sentence.count('in fact')+sentence.count('ok')

def get_number_of_capitals(sentence):
    n = sum(1 for c in sentence if c.isupper())
    return n


def get_number_of_nouns(taged_tokens):
    n = sum(1 for word, tag in taged_tokens if tag == 'NN' or tag == 'NNS' \
            or tag == 'NNP' or tag == 'NNP')
    return n


def get_number_of_adjectives(taged_tokens):
    n = sum(1 for word, tag in taged_tokens if tag == 'JJ' or tag == 'JJR' or tag == 'JJS')
    return n


def get_count_of_tagged(taged_tokens, tag_in):
    n = sum(1 for word, tag in taged_tokens if tag == tag_in)
    return n


def is_past_tense(taged_tokens):
    n = sum(1 for word, tag in taged_tokens if tag == 'VBD')
    return (n > 0)


def is_modal(taged_tokens):
    n = sum(1 for word, tag in taged_tokens if tag == 'MD')
    return (n > 0)


def vocab_richness(sentence):
    unique = set(sentence.split())
    count_uniques = len(unique)
    return count_uniques


def get_first_words(sentence, count):
    arr_words = sentence.split()
    ret_words = arr_words[:count]
    str_ret = ' '.join(ret_words)
    return str_ret


def get_one_word(sentence, position):
    arr_words = sentence.split()
    if len(arr_words) >= (position + 1):
        ret_word = arr_words[position]
        return ret_word
    else:
        return False


def exists_she(sentense):
    if 'she' in sentense.lower():
        return True
    else:
        return False


def exists_he(sentense):
    if 'he' in sentense.lower():
        return True
    else:
        return False



def first_tag(taged_tokens):
    return str(taged_tokens[0][1])


def second_tag(taged_tokens):
    if len(taged_tokens) > 1:
        return str(taged_tokens[1][1])
    else:
        return False


def third_tag(taged_tokens):
    if len(taged_tokens) > 2:
        return str(taged_tokens[2][1])
    else:
        return False

def get_consonant_letters(sentence):
    consonants = 0
    for word in sentence:
        for letter in word:
            if letter in 'bcdfghjklmnpqrstvwxyz':
                consonants += 1

    return consonants


def get_sonant_letters(sentence):
    sonants = 0
    for word in sentence:
        for letter in word:
            if letter in 'aieou':
                sonants += 1

    return sonants


def lexical_diversity(text):
    return len(set(text)) / len(text)


def get_sequence_tags(taged_tokens, n_sequence):
    countSequence = 0
    cp = lst_seq[n_sequence-1]
    result = cp.parse(taged_tokens)

    for tre in result:
        if isinstance(tre, nltk.tree.Tree):
            if tre.label() ==  cp._stages[0]._chunk_label:
                countSequence += 1

    return (countSequence > 0)

In [80]:
def get_sentence_features(sentens_in):
    stemmed_words = list()
    for w in sentens_in.split():
        stemmed_words.append(stemmer.stem(w))

    sentence = ' '.join(stemmed_words)
    word_tokens = nltk.wordpunct_tokenize(sentence)

    taged_tokens = tagger.tag(word_tokens)

    X_dict = {}

    X_dict['seq_01'] = get_sequence_tags(taged_tokens, 1)
    X_dict['seq_02'] = get_sequence_tags(taged_tokens, 2)
    X_dict['seq_03'] = get_sequence_tags(taged_tokens, 3)
    # X_dict['seq_04'] = get_sequence_tags(taged_tokens, 4)

    X_dict['lexical_diversity'] = lexical_diversity(sentence.lower())
    X_dict['get_consonant_letters'] = get_consonant_letters(sentence.lower())
    X_dict['get_sonant_letters'] = get_sonant_letters(sentence.lower())
    X_dict['count_of_fillers'] = get_number_of_fillers(sentence)
    X_dict['count_of_other_fillers'] = get_number_of_other_fillers(sentence)

    X_dict['count_of_spaces'] = get_number_of_spaces(sentence)
    X_dict['count_capitals'] = get_number_of_capitals(sentence)
    X_dict['count_nouns'] = get_number_of_nouns(taged_tokens)
    X_dict['count_adjectives'] = get_number_of_adjectives(taged_tokens)

    X_dict['count_numbers'] = get_count_of_tagged(taged_tokens, 'CD')
    X_dict['count_NNS'] = get_count_of_tagged(taged_tokens, 'NNS')
    X_dict['count_NNP'] = get_count_of_tagged(taged_tokens, 'NNP')
    X_dict['count_NNPS'] = get_count_of_tagged(taged_tokens, 'NNPS')
    X_dict['count_RBS'] = get_count_of_tagged(taged_tokens, 'RBS')
    X_dict['count_RBR'] = get_count_of_tagged(taged_tokens, 'RBR')
    X_dict['count_WP'] = get_count_of_tagged(taged_tokens, 'WP')
    X_dict['count_WP$'] = get_count_of_tagged(taged_tokens, 'WP$')
    X_dict['count_WRB'] = get_count_of_tagged(taged_tokens, 'WRB')
    X_dict['count_PRP'] = get_count_of_tagged(taged_tokens, 'PRP')
    X_dict['count_POS'] = get_count_of_tagged(taged_tokens, 'POS')
    X_dict['count_FW'] = get_count_of_tagged(taged_tokens, 'FW')
    X_dict['count_VB'] = get_count_of_tagged(taged_tokens, 'VB')
    X_dict['count_VBD'] = get_count_of_tagged(taged_tokens, 'VBD')
    X_dict['count_VBG'] = get_count_of_tagged(taged_tokens, 'VBG')
    X_dict['count_VBN'] = get_count_of_tagged(taged_tokens, 'VBN')
    X_dict['count_CC'] = get_count_of_tagged(taged_tokens, 'CC')

    X_dict['count_DT']         = get_count_of_tagged(taged_tokens, 'DT')
    X_dict['count_UH']         = get_count_of_tagged(taged_tokens, 'UH')
    X_dict['count_SYM']        = get_count_of_tagged(taged_tokens, 'SYM')
    X_dict['count_PDT']        = get_count_of_tagged(taged_tokens, 'PDT')
    X_dict['count_LS']         = get_count_of_tagged(taged_tokens, 'LS')

    X_dict['count_3rd person'] = get_count_of_tagged(taged_tokens, 'VBZ')
    X_dict['count_gerund'] = get_count_of_tagged(taged_tokens, 'VBG')

    X_dict['is_past_tense'] = is_past_tense(taged_tokens)
    X_dict['is_modal'] = is_modal(taged_tokens)
    X_dict['vocab_richness'] = vocab_richness(sentence)
    X_dict['first_tag'] = first_tag(taged_tokens)
    X_dict['second_tag'] = second_tag(taged_tokens)
    X_dict['third_tag'] = third_tag(taged_tokens)
    
#     X_dict['first_one_word'] = get_one_word(sentence, 0)
#     X_dict['second_one_word'] = get_one_word(sentence, 1)
#     X_dict['third_one_word'] = get_one_word(sentence, 2)
#     X_dict['forth_one_word'] = get_one_word(sentence, 3)
#     X_dict['fifth_one_word'] = get_one_word(sentence, 4)
#     X_dict['sixth_one_word'] = get_one_word(sentence, 5)
#     X_dict['seventh_one_word'] = get_one_word(sentence, 6)
#     X_dict['eith_one_word'] = get_one_word(sentence, 7)
#     X_dict['ninth_one_word'] = get_one_word(sentence, 8)
#     X_dict['tenth_one_word'] = get_one_word(sentence, 9)

#     X_dict['first_6_word'] = get_first_words(sentence, 6)
#     X_dict['first_5_word'] = get_first_words(sentence, 5)
#     X_dict['first_4_word'] = get_first_words(sentence, 4)
#     X_dict['first_3_word'] = get_first_words(sentence, 3)
#     X_dict['first_2_word'] = get_first_words(sentence, 2)

    X_dict['exists_she'] = exists_she(sentence)
    X_dict['exists_he']  = exists_he(sentence)

    X_dict['first_word_is_the'] = ('the' == get_first_words(sentence.lower(), 1))
    X_dict['first_word_is_she'] = ('she' == get_first_words(sentence.lower(), 1))
    X_dict['first_word_is_he']  = ('he' == get_first_words(sentence.lower(), 1))
    X_dict['first_word_is_it']  = ('it' == get_first_words(sentence.lower(), 1))
    X_dict['first_word_is_this'] = ('this' == get_first_words(sentence.lower(), 1))
    X_dict['first_word_is_you']  = ('you' == get_first_words(sentence.lower(), 1))
    X_dict['first_word_is_OK']  = ('OK' == get_first_words(sentence.lower(), 1))


    # X_dict['Raymond'] = ('raymond' in sentence.lower())
    # X_dict['Perdita'] = ('perdita' in sentence.lower())
    # X_dict['Idris']   = ('idris' in sentence.lower())
    # X_dict['Adrian']  = ('adrian' in sentence.lower())
    # X_dict['Chapter'] = ('chapter' in sentence.lower())
    # X_dict['sinister'] = ('sinister' in sentence.lower())
    # X_dict['weird']    = ('weird' in sentence.lower())
    # X_dict['horrible'] = ('horrible' in sentence.lower())

    return X_dict

In [81]:
from nltk.text import TextCollection

In [82]:
lbl_enc = preprocessing.LabelEncoder()
y = lbl_enc.fit_transform(train['y'].values)

In [39]:
# import random
# random.seed(0)
# samples= list(train.id.unique())
# num_samples = len(samples)
# train_ids = random.sample(samples,round(0.63*num_samples))
# val_ids = list(set(samples)-set(train_ids))

# df_train=pd.DataFrame()
# for i in train_ids:
#     df_train=df_train.append(train[train['id']==i])

# df_val=pd.DataFrame()
# for i in val_ids:
#     df_val=df_val.append(train[train['id']==i])

# xtrain = df_train.drop('y',axis=1)
# xval = df_val.drop('y',axis=1)
# ytrain = df_train['y']
# yval = df_val['y']

In [40]:
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stop_words.update( ['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}'])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\asim.tewari\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [41]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\asim.tewari\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [42]:
train['display'].shape

(810,)

In [83]:
def get_train_features(IN_x):
    Out_x= []
    index = 0

    for sentens_edna in IN_x:
        word_tokens1 = [i for i in word_tokenize(sentens_edna) if i not in stop_words]
        sentens_in = ' '.join(word_tokens1)
        X_feat_dict = get_sentence_features(sentens_in)
        Out_x.append(X_feat_dict)
        # Out_y.append(IN_y[index])
        index += 1

    return Out_x


# X_Train = get_train_features(train['display'])

In [84]:
all_features = get_train_features(new_df['display'])

In [85]:
all_features = pd.DataFrame(all_features)
all_features.head()

Unnamed: 0,seq_01,seq_02,seq_03,lexical_diversity,get_consonant_letters,get_sonant_letters,count_of_fillers,count_of_other_fillers,count_of_spaces,count_capitals,...,third_tag,exists_she,exists_he,first_word_is_the,first_word_is_she,first_word_is_he,first_word_is_it,first_word_is_this,first_word_is_you,first_word_is_OK
0,True,True,True,0.078078,163,109,0,6,58,0,...,RB,False,True,False,False,False,False,False,False,False
1,True,True,True,0.049407,249,169,0,8,88,0,...,RB,False,True,False,False,False,False,False,False,False
2,True,True,True,0.18018,56,33,0,2,22,0,...,NN,False,False,False,False,False,False,False,False,False
3,True,True,True,0.069164,164,116,0,2,66,0,...,VBP,False,True,False,False,False,False,False,False,False
4,True,True,True,0.204918,57,36,0,2,27,0,...,PRP,False,False,False,False,False,False,False,False,False


In [86]:
all_features.columns

Index(['seq_01', 'seq_02', 'seq_03', 'lexical_diversity',
       'get_consonant_letters', 'get_sonant_letters', 'count_of_fillers',
       'count_of_other_fillers', 'count_of_spaces', 'count_capitals',
       'count_nouns', 'count_adjectives', 'count_numbers', 'count_NNS',
       'count_NNP', 'count_NNPS', 'count_RBS', 'count_RBR', 'count_WP',
       'count_WP$', 'count_WRB', 'count_PRP', 'count_POS', 'count_FW',
       'count_VB', 'count_VBD', 'count_VBG', 'count_VBN', 'count_CC',
       'count_DT', 'count_UH', 'count_SYM', 'count_PDT', 'count_LS',
       'count_3rd person', 'count_gerund', 'is_past_tense', 'is_modal',
       'vocab_richness', 'first_tag', 'second_tag', 'third_tag', 'exists_she',
       'exists_he', 'first_word_is_the', 'first_word_is_she',
       'first_word_is_he', 'first_word_is_it', 'first_word_is_this',
       'first_word_is_you', 'first_word_is_OK'],
      dtype='object')

In [88]:
all_features_final = pd.concat([all_features.reset_index(),new_df[['id','y']].reset_index()],axis=1)
# X_d_test = pd.concat([X_data_test.reset_index(),X_test[['person_id']].reset_index(),y_test.reset_index()],axis=1)

In [90]:
all_features_final.drop(['index','index'],axis=1,inplace=True)

In [95]:
all_features_final

Unnamed: 0,seq_01,seq_02,seq_03,lexical_diversity,get_consonant_letters,get_sonant_letters,count_of_fillers,count_of_other_fillers,count_of_spaces,count_capitals,...,exists_he,first_word_is_the,first_word_is_she,first_word_is_he,first_word_is_it,first_word_is_this,first_word_is_you,first_word_is_OK,id,y
0,True,True,True,0.078078,163,109,0,6,58,0,...,True,False,False,False,False,False,False,False,1,1.0
1,True,True,True,0.049407,249,169,0,8,88,0,...,True,False,False,False,False,False,False,False,2,1.0
2,True,True,True,0.180180,56,33,0,2,22,0,...,False,False,False,False,False,False,False,False,3,1.0
3,True,True,True,0.069164,164,116,0,2,66,0,...,True,False,False,False,False,False,False,False,4,1.0
4,True,True,True,0.204918,57,36,0,2,27,0,...,False,False,False,False,False,False,False,False,5,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
892,False,False,False,0.789474,10,6,0,1,3,0,...,False,False,False,False,False,False,False,False,27,0.0
893,False,False,False,0.404762,21,12,0,0,8,0,...,False,False,False,False,False,False,False,False,28,0.0
894,False,False,False,1.000000,2,1,0,0,0,0,...,False,False,False,False,False,False,False,False,29,0.0
895,False,False,False,0.625000,8,5,0,0,3,0,...,False,False,False,False,False,False,False,False,30,0.0


In [94]:
all_features_final.to_csv('POSfeatures.csv',index = False)

In [55]:
xtrain = (xtrain.reset_index()).drop('index',axis=1)

In [56]:
xval = (xval.reset_index()).drop('index',axis=1)


In [59]:
def get_train_features(IN_x, IN_y):
    Out_x, Out_y = [], []
    index = 0

    for sentens_edna in IN_x:
        word_tokens1 = [i for i in word_tokenize(sentens_edna) if i not in stop_words]
        sentens_in = ' '.join(word_tokens1)
        X_feat_dict = get_sentence_features(sentens_in)
        Out_x.append(X_feat_dict)
        Out_y.append(IN_y[index])
        index += 1

    return Out_x, Out_y


X_Train, Y_train = get_train_features(xtrain['display'].reset_index(), ytrain.values)
X_valid, Y_valid = get_train_features(xval['display'], yval.values)

In [66]:
X_Train = pd.DataFrame(X_Train)
X_valid = pd.DataFrame(X_valid)


In [74]:
X_Train

Unnamed: 0,seq_01,seq_02,seq_03,lexical_diversity,get_consonant_letters,get_sonant_letters,count_of_fillers,count_of_other_fillers,count_of_spaces,count_capitals,...,first_2_word,exists_she,exists_he,first_word_is_the,first_word_is_she,first_word_is_he,first_word_is_it,first_word_is_this,first_word_is_you,first_word_is_OK
0,False,False,False,1.0,3,2,0,0,0,0,...,index,False,False,False,False,False,False,False,False,False
1,False,False,False,1.0,5,2,0,0,0,0,...,display,False,False,False,False,False,False,False,False,False


In [70]:
params = {'alpha': [0.01, 0.1, 0.5, 1.0, 10.0],
         }

grid = GridSearchCV(BernoulliNB(), param_grid=params, n_jobs=-1, cv=5, verbose=5)
grid.fit(X_Train, y_train.values)
# clf = grid.best_estimator_.named_steps['bernoullinb']
# coef = grid.best_estimator_.named_steps['bernoullinb'].coef_
# best_alpha = grid.best_estimator_.named_steps['bernoullinb'].alpha
# print("Best cross-validation alpha: {:.2f}".format(best_alpha))


ValueError: Found input variables with inconsistent numbers of samples: [2, 522]