In [1]:
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import numpy as np
from math import floor
stopword = set(stopwords.words('english'))
porter = PorterStemmer()
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from collections import defaultdict
import pickle


In [2]:
def tokenize(text):
    words = word_tokenize(text) #split words
    words = [w.lower() for w in words if w.isalpha()] #get rid of punctuation
    words =[w for w in words if  not w in stopword]
    stemmed = [porter.stem(w) for w in words]
    stemmed = ' '.join(w for w in stemmed)
    return stemmed

def create_dataset():
    data = {}
    train = pd.read_csv(r"train.csv")#19579
    test = pd.read_csv(r"test.csv")#8392
    data["submit"] = pd.read_csv(r"sample_submission.csv")
    
    train['Token'] = train.text.map(tokenize)
    test['Token'] = test.text.map(tokenize)
    
    data['valid'] = train.loc[:floor((train.shape[0]*1)/3)]
    data['train'] = train.loc[floor((train.shape[0]*1)/3):]
    data['test'] = test
    
    data['valid'] = data['valid'].set_index('id')
    data['test'] = data['test'].set_index('id')
    data['train'] = data['train'].set_index('id')
    
    return data
    

In [3]:
data = create_dataset()
train = data['train']
valid = data['valid']
test = data['test']

In [17]:
def length(text):
    sen = text.split(".")
    sum_len = 0
    num = len(sen)
    for i in sen: 
        sum_len+= len(i.split(" "))
    return float(sum_len/num)


def length_of_sentence(dataset):
    avg_len = defaultdict(float)
    ids = dataset.index
    for i in ids:
        text = dataset.loc[i,'text']
        avg_len[i] = list([length(text)])
    return avg_len


In [9]:
list([1])

[1]

In [22]:
train.head()

Unnamed: 0_level_0,text,author,Token
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
id23567,"He was the devil incarnate, Birch, and I belie...",HPL,devil incarn birch believ eye eye furi could b...
id26623,"We are ready to expose our breasts, exposed te...",MWS,readi expos breast expos ten thousand time bal...
id17046,"He continued, ""You must create a female for me...",MWS,continu must creat femal live interchang sympa...
id26351,And always the goal of my fancies was the migh...,HPL,alway goal fanci mighti vine grown wall littl ...
id13279,"While I remained motionless, and busied in end...",EAP,remain motionless busi endeavor collect though...


In [74]:
all_data = [train['Token'], test['Token'], valid['Token']]
df = pd.concat(all_data)


In [75]:
df = pd.DataFrame(df)

df['ind'] = list(range(0, df.shape[0]))
df['id'] = df.index
df = df.set_index('ind')
df.head()
pickle.dump(df, open('df.pkl', 'wb'))

In [84]:
def tfidf():
    df = pickle.load(open('df.pkl','rb'))
    vector= CountVectorizer(decode_error = 'ignore',stop_words = 'english', min_df = 25)
    # fit and transform the training set
    X_train = vector.fit_transform(list(df['Token']))
    
    # TFIDF model
    tfidf = TfidfTransformer()
    X_train_tfidf = tfidf.fit_transform(X_train)
    print (X_train_tfidf.shape)
    print(df.shape)
    df['tfidf'] = list(range(df.shape[0]))
    df['tfidf'] = df['tfidf'].astype(list)
    for i in range(df.shape[0]):
        i_row = X_train_tfidf[i,:].toarray().tolist()
        df.at[i,'tfidf'] = i_row
        
    df = df.set_index('id')
    print(df.head())
    pickle.dump(df,open("tfidf.pkl",'wb'))


In [85]:
tfidf()

(27972, 2592)
(27972, 2)


MemoryError: 

In [57]:
def tfidf_feature(dataset):
    tfidf()
    ids = dataset.index
    result = defaultdict(list)
    df = pickle.load(open("tfidf.pkl",'rb'))
    for i in ids:
        result[i] = df.loc[i,'tfidf']
    return result
    

In [40]:
def generate_feature_list(ids, feature_dicts):
    result = []
    feat_dict= defaultdict(list)
    for i in range(len(feature_dicts)):
        f_dict = feature_dicts[i]
        for j in ids:
            feat_dict[j]+= f_dict[j]
    
    for j in ids:
        #print (feat_dict[j])
        result.append(list([j , (feat_dict[j])]))
        
    return result

In [41]:
#features of a dataset, in format of [id, f1,f2....]
def generate_feature(dataset):
    ids = dataset.index
    f1 = length_of_sentence(dataset)
    f2 = tfidf_feature(dataset)
    features = [f1]
    result = generate_feature_list(ids, features)
    return result

In [42]:
def concat():
    
    hpl = train[train["author"] == "HPL"]
    mws = train[train["author"] == "MWS"]
    eap = train[train["author"] == "eap"]
    
    feature_hpl = generate_feature(hpl)
    feature_mws = generate_feature(mws)
    feature_eap = generate_feature(eap)
    
    feature_valid = generate_feature(valid)
    feature_test = generate_feature(test)
    
    pickle.dump(feature_hpl, open('feature_hpl.pickle', 'wb'))
    pickle.dump(feature_mws, open("feature_mws.pickle", 'wb'))
    pickle.dump(feature_eap, open('feature_eap.pickle', 'wb'))
    pickle.dump(feature_valid, open("feature_valid.pickle", 'wb'))
    pickle.dump(feature_test, open('feature_test.pickle','wb'))
    

In [58]:
concat()

1
[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0

ValueError: Must have equal len keys and value when setting with an ndarray

In [12]:
hpl= pickle.load(open('feature_valid.pickle', 'rb'))
print(valid.loc[hpl[0][0], 'author'])

EAP


In [20]:
def train_data():
    feature_hpl= pickle.load(open('feature_hpl.pickle', 'rb'))
    feature_mws = pickle.load(open('feature_mws.pickle', 'rb'))
    feature_eap = pickle.load(open('feature_eap.pickle', 'rb'))
    feature_valid = pickle.load(open('feature_valid.pickle', 'rb'))
    
    
    feature = [x[1] for x in feature_hpl + feature_mws + feature_eap]
    features_valid = [x[1] for x in feature_valid]
    target = ['HPL' for i in range(len(feature_hpl))] + ['MWS' for i in range(len(feature_mws))] +['EAP' for i in range(len(feature_eap))]
    
    feature = np.array(feature)
    target = np.array(target)
    
    target_valid = list([valid.loc[i[0], "author"] for i in feature_valid])
    
    '''
    classifier = xgb.XGBClassifier(max_depth=5, n_estimators=300, learning_rate=0.05, objective="binary:logistic").fit(
        features, target)

    print("Saving the classifier")
    data_io.save_model(classifier)
    '''
    clf = MultinomialNB().fit(feature, target)
    
    predicted = clf.predict(features_valid)
    print (float(np.mean(predicted == target_valid)))
    
    pickle.dump(clf, open("classifier.pkl", 'wb'))
    
    
    
    

In [21]:
train_data()

0.3088708441856902


TypeError: file must have a 'write' attribute