In [174]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_validate
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score
from multiprocessing import Pool
import scipy as sc
from collections import Counter
import pandas as pd
pd.options.display.max_rows = 100
pd.options.display.max_columns =100
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import gensim
import pickle
from gensim.models import Word2Vec, FastText, KeyedVectors
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import sys
sys.path.insert(0, '..')
from Common import preprocessing,evaluation,CosineClassifier as cos
# Integer target id assigned to each question class label (ids follow list order).
classes_map = {
    label: index
    for index, label in enumerate(['DOC', 'ENTER', 'ORG', 'PRIV', 'RANG', 'HOST'])
}

In [8]:
# Semicolon-delimited table; the `question` and `class` columns are read by later cells.
df = pd.read_csv(
    '..//Data//data.txt',
    delimiter=';',
    engine='python',
    encoding='utf8',
)

In [9]:
# Run the raw question column through the project's preprocessing helper.
questions = preprocessing.preprocess_list(np.array(df.question))

In [317]:
# Unigram TF-IDF over the preprocessed questions; min_df=3 drops terms that
# occur in fewer than three questions.
vectorizer = TfidfVectorizer(ngram_range=(1, 1), min_df=3)
X = vectorizer.fit_transform(questions)

In [75]:
# Translate each class label into its integer id from `classes_map`.
classes = np.array(df['class'])
y = [classes_map[label] for label in classes]

In [422]:
# Hold out 33% of the samples; the fixed seed keeps the shuffled split stable.
# `def_embeddings` is built in a cell further down this notebook and must
# already exist in the kernel when this cell runs.
X_train, X_test, y_train, y_test = train_test_split(
    def_embeddings,
    y,
    test_size=0.33,
    random_state=42,
    shuffle=True,
)

In [423]:
# Three one-vs-rest linear baselines, all fit on the same training split.
log_reg = OneVsRestClassifier(
    LogisticRegression(C=10, solver='lbfgs', random_state=0)
).fit(X_train, y_train)
ridge = OneVsRestClassifier(
    RidgeClassifier(random_state=0)
).fit(X_train, y_train)
svc = OneVsRestClassifier(
    LinearSVC(random_state=0)
).fit(X_train, y_train)
#clf = cos.CosineClassifier().fit(X_train,y_train)

In [424]:
# Print one mean 'f1_weighted' score per classifier, in the order
# log_reg, ridge, svc. Scores come from the project helper
# evaluation.get_CV_scores, which is handed the held-out split here.
for fitted_classifier in (log_reg, ridge, svc):
    print(evaluation.get_CV_scores(fitted_classifier, X_test, y_test, 'f1_weighted').mean())

0.7346750795468233
0.7325036348521543
0.7404943151159795


In [10]:
def predict(question, model):
    """Return ``model.predict_proba`` for a single raw question string.

    The question is passed through ``preprocessing.preprocess_list`` (the same
    helper applied to the corpus) and then through the module-level
    ``vectorizer`` before being handed to ``model``.

    NOTE(review): ``model`` has to expose ``predict_proba`` and accept the
    features produced by ``vectorizer`` -- confirm against the caller.
    """
    cleaned = preprocessing.preprocess_list([question])[0]
    features = vectorizer.transform([cleaned])
    return model.predict_proba(features)

In [392]:
# Raw unigram counts, with the same min_df / n-gram settings as the TF-IDF vectorizer.
count_model = CountVectorizer(min_df=3, ngram_range=(1, 1))
X_ = count_model.fit_transform(questions)
# Term-by-term co-occurrence: entry (u, v) sums count(u) * count(v) over all questions.
Xc = (X_.T * X_)
# Zero the diagonal so a term does not co-occur with itself.
Xc.setdiag(0)
# The sparse matrix already knows its shape; no need to densify it just to print it.
print(Xc.shape)

(752, 752)


In [39]:
# Labelled view of the co-occurrence matrix: vocabulary terms on both axes.
co_occur_labels = count_model.get_feature_names()
df_co_occur = pd.DataFrame(
    Xc.todense(),
    index=co_occur_labels,
    columns=co_occur_labels,
)

In [24]:
# Build one "pseudo-document" per vocabulary term: every co-occurring term is
# repeated as many times as its co-occurrence count (counts of 1 are ignored).
#
# The dense matrix and the vocabulary list are materialised once up front;
# rebuilding them for every (i, j) cell made this loop needlessly expensive.
co_counts = Xc.toarray()
count_vocab = count_model.get_feature_names()

pseudo_docs = {}
for i in range(co_counts.shape[0]):
    words = []
    # flatnonzero yields column indices in ascending order, so the word order
    # matches a plain left-to-right scan of the row.
    for j in np.flatnonzero(co_counts[i] > 1):
        words.extend([count_vocab[j]] * int(co_counts[i, j]))
    pseudo_docs[count_vocab[i]] = words

In [48]:
# Whitespace-tokenise the corpus once and reuse it for both calls below.
tokenized_questions = [q.split() for q in questions]
# 300-d vectors, context window of 10; `train` is then called explicitly for 200 epochs.
model = Word2Vec(tokenized_questions, size=300, window=10)
model.train(tokenized_questions, total_examples=model.corpus_count, epochs=200)

#model = KeyedVectors.load_word2vec_format("..//..//web_upos_cbow_300_20_2017.bin.gz", binary=True, unicode_errors='ignore')

(1582866, 2745800)

In [286]:
# Represent each term by the mean Word2Vec vector of its pseudo-document.
pseudo_docs_vec = {}
for word, context_words in pseudo_docs.items():
    known_vectors = [model.wv[w] for w in context_words if w in model.wv]
    vec = np.mean(known_vectors, axis=0)
    # A pseudo-document with no known words averages to NaN; leave that term out.
    if np.isnan(vec).any():
        continue
    pseudo_docs_vec[word] = vec

In [287]:
# Question embedding = mean of the pseudo-document vectors of its known words
# (unweighted; TF-IDF weighting was considered but is not applied).
embeddings = []
for q in questions:
    # Test membership on the dict itself: hashing instead of rebuilding and
    # scanning a list of keys for every token.
    vec = [pseudo_docs_vec[w] for w in q.split() if w in pseudo_docs_vec]
    if vec:
        embeddings.append(np.mean(vec, axis=0))
    else:
        # No known word: fall back to a zero vector (300 = Word2Vec vector size above).
        embeddings.append(np.zeros(300))

In [421]:
# Question embedding = mean of the per-term vectors stored in `enth`
# (each value is a one-element list, hence the `[0]`).
def_embeddings = []
for q in questions:
    # Earlier TF-IDF-weighted Word2Vec variant, kept for reference. It needs
    # the question index `cnt` (e.g. from enumerate) to be usable again:
    #vec = [model.wv[w]*tfidf[cnt][w] if w in list(tfidf[cnt]) else model.wv[w]*0 for w in q.split() if w in model.wv]
    # Dict membership is a hash lookup; no need to build a key list per token.
    vec = [enth[w][0] for w in q.split() if w in enth]
    if vec:
        def_embeddings.append(np.mean(vec, axis=0))
    else:
        # No known word: zero vector (200 = n_components of the SVD that feeds `enth`).
        def_embeddings.append(np.zeros(200))

In [17]:
# One {term: tf-idf weight} dict per question, covering every vocabulary term
# (terms absent from the question map to 0.0).
#
# The vocabulary is fetched once and each sparse row is densified once,
# instead of calling get_feature_names() and indexing X[i, j] for every cell.
tfidf_terms = vectorizer.get_feature_names()
tfidf = []
for i in range(X.shape[0]):
    row_weights = X[i].toarray().ravel()
    tfidf.append(dict(zip(tfidf_terms, row_weights)))

In [417]:
# Per-question, per-term entropy-style weights.
#   term in question:   mass(term) * p * log2(p), with p = count(term) / len(question)
#   term not in it:     -0.00001 * mass(term)
# where mass(term) is the sum of the term's row in `co_occur`.
#
# NOTE(review): rows of `co_occur` are indexed by position in
# vectorizer.get_feature_names(); this assumes the count and TF-IDF
# vocabularies are identical and in the same order -- confirm.

# Both the vocabulary and the row sums are independent of the question, so
# compute them once rather than for every (question, term) pair.
entropy_terms = vectorizer.get_feature_names()
row_mass = [sum(row) for row in co_occur]

enthropy = []
for q in questions:
    question = q.split()
    q_dic = Counter(question)
    q_len = len(question)
    vector = []
    for term_idx, w in enumerate(entropy_terms):
        mass = row_mass[term_idx]
        if w in q_dic:
            # Same left-to-right arithmetic as before, so values are unchanged.
            vector.append(mass*q_dic[w]/q_len*np.log2(q_dic[w]/q_len))
        else:
            vector.append(-0.00001*mass)
    enthropy.append(vector)
    

In [418]:
# Transpose so rows are vocabulary terms and columns are questions, then
# reduce every term to 200 components.
svd = TruncatedSVD(algorithm='arpack', n_components=200)
matr = svd.fit_transform(np.transpose(np.array(enthropy)))

In [419]:
# Rescale every row of `matr` to unit L2 norm (Normalizer's default norm).
matr = Normalizer(norm='l2').fit_transform(matr)

In [420]:
# Map each vocabulary term to its row of `matr`, wrapped in a one-element list
# (consumers read it back with `enth[term][0]`).
enth = {
    term: [term_vector]
    for term, term_vector in zip(vectorizer.get_feature_names(), matr)
}

In [397]:
# Row-normalise the term co-occurrence counts.
# TODO: replace this with PMI(u, v).
# toarray() hands scikit-learn a plain ndarray; todense() returns np.matrix,
# which newer scikit-learn releases deprecate and then reject as estimator input.
co_occur = Normalizer().fit_transform(Xc.toarray())