In [1]:
import numpy as np
import pandas as pd
import pickle
from collections import defaultdict, Counter
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import arff
from sklearn.model_selection import train_test_split

from itertools import chain

import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.grid_search import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics



In [2]:
# Load our data and try
with open('modified_data/just_tags.txt', 'rb') as f:
    just_tags = pickle.load(f)
    
with open('modified_data/just_words.txt', 'rb') as f:
    just_words = pickle.load(f)

In [3]:
np.unique(just_tags)

array(['EMT', 'EN', 'HI', 'UN'], dtype='<U3')

In [50]:
just_tags = np.array(just_tags).reshape(len(just_words),1)

In [63]:
def gen_features(data):
# Generating features
# Capitalization, length, suffixes 
    lens = [len(w) for w in data]

    caps = [1 if w[0].isupper() else 0 for w in data]

    num_caps = [sum([True for a in w if a.isupper()]) for w in data]

    suffixes = [w[-3:] for w in data]

    isdigit = [1 if w.isdigit() else 0 for w in data]
    
    feat_names = ['length', 'caps', 'num_caps', 'suffixes', 'isdigit']
    features = [lens, caps, num_caps, suffixes, isdigit]

#     features = pd.DataFrame(dict(zip(feat_names, features)))
    
    return(list(zip(lens, caps, num_caps, suffixes, isdigit)))
    
#     return (features)

In [81]:
len(just_words)

201412

In [52]:
# Create train test split
words_train, words_test, tags_train, tags_test = train_test_split(just_words, just_tags, random_state = 42, test_size = 0.2)

In [77]:
features_train = [list(i) for i in gen_features(words_train)]
features_test = gen_features(words_test)

In [78]:
features_train[0]

[1, 0, 0, 'i', 0]

In [55]:
features_train.shape

(161129, 5)

In [34]:
tags_train.shape

(161129, 1)

## Test CRF fit

In [79]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='l2sgd', 
#     c1=0.1, 
    c2=0.1, 
    max_iterations=100, 
    all_possible_transitions=True
)

Wall time: 0 ns


In [80]:
crf.fit(features_train, tags_train)

TypeError: object of type 'int' has no len()

In [77]:
features['target'] = just_tags

In [83]:
features['suffixes'] = features.suffixes.astype('category')

In [89]:
features['suffixes'][0]

str

In [90]:
# Save as ARFF file
arff.dump('word_features.arff'
      , features.values
      , relation = 'TrainFeatures'
      , names=features.columns)

In [91]:
for f in features:
    print(f)

length
caps
num_caps
suffixes
isdigit
target


In [96]:
' '.join([w for w in features['length']])

TypeError: sequence item 0: expected str instance, int found

In [100]:
file_to_write = '@relation TrainFeature\n'
for f in features:
    print(f)
    file_to_write += '@attribute ' + f
    line = ' '.join([str(w) for w in features[f]]) + '\n'
    
    file_to_write += line

length
caps
num_caps
suffixes
isdigit
target


In [111]:
with open('temp.txt', 'w+', encoding='utf-8') as f:
    f.write(','.join(['\''+str(w)+'\'' for w in list(np.unique(features['suffixes']))]))

In [70]:
# Save as CSV
features.to_csv('word_features.csv')

In [56]:
ohe_features = pd.get_dummies(features)

In [59]:
ohe_features.shape

(201412, 6346)

In [82]:
np.sum(features.isna())

length      0
caps        0
num_caps    0
suffixes    0
isdigit     0
target      0
dtype: int64

In [61]:
logreg_model = LogisticRegression()
logreg_model.fit(features.drop(columns = ['suffixes']), just_tags)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [62]:
logreg_model.score(features.drop(columns = ['suffixes']), just_tags)

0.5740124719480468

In [None]:
svc_model = SVC()
svc_model.fit(ohe_features, just_tags)