In [1]:
import nltk
import numpy as np
import pandas as pd
import time

from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from os import path

In [2]:
subdir = './datasets/twitter/'
filename = 'bank_train.csv'
df = pd.read_csv(path.join(subdir,filename))
df_test = pd.read_csv(path.join(subdir, 'bank_test_etalon.csv'))
sentiment_dict = pd.read_csv('./datasets/sentiment_dict.csv')

In [3]:
from nltk.corpus import stopwords
stop_words = stopwords.words('russian')
#we can add stop words

import pymorphy2
def tokenize(text):
    text = text.lower()
    text_list = nltk.word_tokenize(text)
    morph = pymorphy2.MorphAnalyzer()
    text_list = [word for word in text_list if word[0] != '@' and len(word) > 1 and word.isalpha()]
    text_list = [morph.parse(word)[0].normal_form for word in text_list]
    return text_list

In [7]:
#tf-idf vectorizer

j = 0
token_list = [i for i in df['text']]
tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words=stop_words)
tfs = tfidf.fit_transform(token_list)

In [4]:
X_train_all, y_train_all = None, None
X_test_all, y_test_all = None, None

In [25]:
import ngram_vectorize
token_list = [ngram_vectorize.get_ngrams(i,n=3,is_join=True) for i in df['text']]
tfidf = TfidfVectorizer(stop_words=[])
tfs = tfidf.fit_transform(token_list)

In [26]:
def df_filter(_df, _tfs, corporation):
    x_arr = []
    y_arr = []
    for i in range(len(_df)):
        label = _df[corporation][i]
        if label == '0' or label == '-1' or label == '1' or label == 0.0 or label == -1.0 or label == 1.0:
            x_arr.append(_tfs[i].toarray()[0])
            y_arr.append(int(label))
    X = np.array(x_arr)
    y = np.array(y_arr)
    print (len(X))
    print (len(y))
    return X, y

In [27]:
companies = ['alfabank', 'gazprom', 'raiffeisen', 'rshb', 'sberbank', 'uralsib', 'vtb']
X_train, y_train = df_filter(_df=df, _tfs=tfs, corporation=companies[0])
for companie in companies[1:]:
    X_curr, y_curr = df_filter(_df=df, _tfs=tfs, corporation=companie)
    X_train = np.append(X_train, X_curr, axis=0)
    y_train = np.append(y_train, y_curr)

562
562
372
372
278
278
929
929
2038
2038
82
82
1257
1257


In [28]:
companies = ['alfabank', 'gazprom', 'raiffeisen', 'rshb', 'sberbank', 'uralsib', 'vtb']
X_test, y_test = df_filter(_df=df_test, _tfs=tfs, corporation=companies[0])
for companie in companies[1:]:
    X_curr, y_curr = df_filter(_df=df_test, _tfs=tfs, corporation=companie)
    if len(X_curr) > 0:
        X_test = np.append(X_test, X_curr, axis=0)
        y_test = np.append(y_test, y_curr)

536
536
137
137
150
150
99
99
2833
2833
60
60
651
651


In [29]:
X_train_all = np.concatenate((X_train_all, X_train),axis=1)
print (X_train_all.shape)
X_test_all = np.concatenate((X_test_all, X_test), axis=1)
print (X_test_all.shape)
y_train_all = y_train
y_test_all = y_test

(5518, 30598)
(4466, 30598)


In [32]:
len(sentiment_dict)

14487

In [33]:
print ('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print ('SVM: ')
start_time = time.time()
model = SVC()
model.fit(X_train, y_train)
t = int(time.time() - start_time)
print ('fit time = {0}m{1}sec'.format(t // 60, t % 60))
start_time = time.time()
y_predict = model.predict(X_test)
t = int(time.time() - start_time)
print ('predict time = {0}m{1}sec'.format(t // 60, t % 60))
print ('f-macro: {0}'.format(f1_score(y_test, y_predict, average = 'macro')))
print ('f-micro: {0}'.format(f1_score(y_test, y_predict, average = 'micro')))

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
SVM: 
fit time = 31m47sec
predict time = 22m14sec


  'precision', 'predicted', average, warn_for)


f-macro: 0.2914040836904462
f-micro: 0.7765338110165696


In [34]:
import my_score
func= lambda x,y: my_score.f_macro(my_score.cound_diff(x,y))
print ('strange f score - {0}'.format(func(y_predict, y_test)))

strange f score - 0.45631578947368423


In [35]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_predict)

array([[   0,  664,    0],
       [   0, 3468,    0],
       [   0,  334,    0]])

In [14]:
print ('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print ('Decision Tree: ')
start_time = time.time()
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
t = int(time.time() - start_time)
print ('fit time = {0} {1}'.format(t / 60, t % 60))
start_time = time.time()
y_predict = model.predict(X_test)
t = int(time.time() - start_time)
print ('predict time = {0} {1}'.format(t / 60, t % 60))
print ('f-macro: {0}'.format(f1_score(y_test, y_predict, average = 'macro')))
print ('f-micro: {0}'.format(f1_score(y_test, y_predict, average = 'micro')))
print ('strange f score - {0}'.format(func(y_predict, y_test)))
print ('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print ('AdaBoost DecisionTree max_depth = 2, n = 100: ')
start_time = time.time()
model = AdaBoostClassifier(DecisionTreeClassifier(max_depth=5), n_estimators=100,
learning_rate=1)
model.fit(X_train, y_train)
t = int(time.time() - start_time)
print ('fit time = {0} {1}'.format(t / 60, t % 60))
start_time = time.time()
y_predict = model.predict(X_test)
t = int(time.time() - start_time)
print ('predict time = {0} {1}'.format(t / 60, t % 60))
print ('f-macro: {0}'.format(f1_score(y_test, y_predict, average = 'macro')))
print ('f-micro: {0}'.format(f1_score(y_test, y_predict, average = 'micro')))
print ('strange f score - {0}'.format(func(y_predict, y_test)))

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Decision Tree: 
fit time = 0.1 6
predict time = 0.0 0
f-macro: 0.33241627485286734
f-micro: 0.5853552859618717
strange f score - 0.4235376068860566
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
AdaBoost DecisionTree max_depth = 2, n = 100: 
fit time = 3.2666666666666666 16
predict time = 0.016666666666666666 1
f-macro: 0.3298327295503713
f-micro: 0.5899046793760832
strange f score - 0.4228240132659454


In [None]:
#countVectorizer
start_time = time.time()
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=1, tokenizer=tokenize, stop_words=stop_words)
tfs4vec = vectorizer.fit_transform(token_list)
t = int(time.time() - start_time)
print ('TIME: {0}m {1}sec'.format(t / 60, t % 60))

In [None]:
companies = ['raiffeisen']
for companie in companies:
    print('!!!!!!!!!!!!!!!!!')
    print(companie)
    print ('~~~~~~~~~~~~~~~~~')
    
    X, y = df_filter(_df = df, _tfs = tfs, corporation = companie)
    X4, y4 = df_filter(_df = df, _tfs = tfs4vec, corporation = companie)
    X_test, y_test = df_filter(_df = df_test, _tfs = tfs, corporation = companie)
    X4_test, y4_test = df_filter(_df = df_test, _tfs = tfs4vec, corporation = companie)
    
    index = int((X.shape[0] + X_test.shape[0]) * 0.2)
    if (index < X_test.shape[0]):
        X = np.vstack((X, X_test[index:]))
        X_test = X_test[:index]
        y = np.concatenate((y,y_test[index:]))
        y_test = y_test[:index]
        X4 = np.vstack((X4, X4_test[index:]))
        X4_test = X4_test[:index]
        y4 = np.concatenate((y4,y4_test[index:]))
        y4_test = y4_test[:index]
    
    print ('DATA SIZE: TRAIN = {0}, TEST= {1}'.format(X.shape[0], X_test.shape[0]))
    print ('Naive Bayes: ')
    start_time = time.time()
    model = MultinomialNB()
    model.fit(X, y)
    t = int(time.time() - start_time)
    print ('fit time = {0} {1}'.format(t / 60, t % 60))
    start_time = time.time()
    y_predict = model.predict(X_test)
    print ('predict time = {0} {1}'.format(t / 60, t % 60))
    print ('f-macro: {0}'.format(f1_score(y_test, y_predict, average = 'macro')))
    print ('f-micro: {0}'.format(f1_score(y_test, y_predict, average = 'micro')))
    t = int(time.time() - start_time)
    print('~~~~~count vectorizer~~~~~~')
    start_time = time.time()
    model = MultinomialNB()
    model.fit(X4, y4)
    t = int(time.time() - start_time)
    print ('fit time = {0} {1}'.format(t / 60, t % 60))
    start_time = time.time()
    y_predict = model.predict(X4_test)
    t = int(time.time() - start_time)
    print ('predict time = {0} {1}'.format(t / 60, t % 60)) 
    print ('f-macro: {0}'.format(f1_score(y4_test, y_predict, average = 'macro')))
    print ('f-micro: {0}'.format(f1_score(y4_test, y_predict, average = 'micro')))
    print ('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
    print ('SVM: ')
    start_time = time.time()
    model = SVC()
    model.fit(X, y)
    t = int(time.time() - start_time)
    print ('fit time = {0} {1}'.format(t / 60, t % 60))
    start_time = time.time()
    y_predict = model.predict(X_test)
    t = int(time.time() - start_time)
    print ('predict time = {0} {1}'.format(t / 60, t % 60))
    print ('f-macro: {0}'.format(f1_score(y_test, y_predict, average = 'macro')))
    print ('f-micro: {0}'.format(f1_score(y_test, y_predict, average = 'micro')))
    print('~~~~~count vectorizer~~~~~~')
    start_time = time.time()
    model = SVC()
    model.fit(X4, y4)
    t = int(time.time() - start_time)
    print ('fit time = {0} {1}'.format(t / 60, t % 60))
    start_time = time.time()
    y_predict = model.predict(X4_test)
    t = int(time.time() - start_time) 
    print ('predict time = {0} {1}'.format(t / 60, t % 60)) 
    print ('f-macro: {0}'.format(f1_score(y4_test, y_predict, average = 'macro')))
    print ('f-micro: {0}'.format(f1_score(y4_test, y_predict, average = 'micro')))  
    print ('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
    print ('SVM linear kernel: ')
    start_time = time.time()
    model = SVC(kernel = 'linear')
    model.fit(X, y)
    t = int(time.time() - start_time)
    print ('fit time = {0} {1}'.format(t / 60, t % 60))
    start_time = time.time()
    y_predict = model.predict(X_test)
    t = int(time.time() - start_time)
    print ('predict time = {0} {1}'.format(t / 60, t % 60))
    print ('f-macro: {0}'.format(f1_score(y_test, y_predict, average = 'macro')))
    print ('f-micro: {0}'.format(f1_score(y_test, y_predict, average = 'micro')))
    print('~~~~~count vectorizer~~~~~~')
    start_time = time.time()
    model = SVC(kernel = 'linear')
    model.fit(X4, y4)
    t = int(time.time() - start_time)
    print ('fit time = {0} {1}'.format(t / 60, t % 60))
    start_time = time.time()
    y_predict = model.predict(X4_test)
    t = int(time.time() - start_time) 
    print ('predict time = {0} {1}'.format(t / 60, t % 60)) 
    print ('f-macro: {0}'.format(f1_score(y4_test, y_predict, average = 'macro')))
    print ('f-micro: {0}'.format(f1_score(y4_test, y_predict, average = 'micro')))
    print ('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
    print ('kNN 10: ')
    start_time = time.time()
    model = KNeighborsClassifier(n_neighbors=10)
    model.fit(X, y)
    t = int(time.time() - start_time)
    print ('fit time = {0} {1}'.format(t / 60, t % 60))
    start_time = time.time()
    y_predict = model.predict(X_test)
    t = int(time.time() - start_time)
    print ('predict time = {0} {1}'.format(t / 60, t % 60))
    print ('f-macro: {0}'.format(f1_score(y_test, y_predict, average = 'macro')))
    print ('f-micro: {0}'.format(f1_score(y_test, y_predict, average = 'micro')))
    print('~~~~~count vectorizer~~~~~~')
    start_time = time.time()
    model =  KNeighborsClassifier(n_neighbors=10)
    model.fit(X4, y4)
    t = int(time.time() - start_time)
    print ('fit time = {0} {1}'.format(t / 60, t % 60))
    start_time = time.time()
    y_predict = model.predict(X4_test)
    t = int(time.time() - start_time) 
    print ('predict time = {0} {1}'.format(t / 60, t % 60)) 
    print ('f-macro: {0}'.format(f1_score(y4_test, y_predict, average = 'macro')))
    print ('f-micro: {0}'.format(f1_score(y4_test, y_predict, average = 'micro')))
    print ('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
    print ('kNN 5: ')
    start_time = time.time()
    model = KNeighborsClassifier(n_neighbors=5)
    model.fit(X, y)
    t = int(time.time() - start_time)
    print ('fit time = {0} {1}'.format(t / 60, t % 60))
    start_time = time.time()
    y_predict = model.predict(X_test)
    t = int(time.time() - start_time)
    print ('predict time = {0} {1}'.format(t / 60, t % 60))
    print ('f-macro: {0}'.format(f1_score(y_test, y_predict, average = 'macro')))
    print ('f-micro: {0}'.format(f1_score(y_test, y_predict, average = 'micro')))
    print('~~~~~count vectorizer~~~~~~')
    start_time = time.time()
    model =  KNeighborsClassifier(n_neighbors=5)
    model.fit(X4, y4)
    t = int(time.time() - start_time)
    print ('fit time = {0} {1}'.format(t / 60, t % 60))
    start_time = time.time()
    y_predict = model.predict(X4_test)
    t = int(time.time() - start_time) 
    print ('predict time = {0} {1}'.format(t / 60, t % 60)) 
    print ('f-macro: {0}'.format(f1_score(y4_test, y_predict, average = 'macro')))
    print ('f-micro: {0}'.format(f1_score(y4_test, y_predict, average = 'micro')))
    print ('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
    print ('kNN 3: ')
    start_time = time.time()
    model = KNeighborsClassifier(n_neighbors=3)
    model.fit(X, y)
    t = int(time.time() - start_time)
    print ('fit time = {0} {1}'.format(t / 60, t % 60))
    start_time = time.time()
    y_predict = model.predict(X_test)
    t = int(time.time() - start_time)
    print ('predict time = {0} {1}'.format(t / 60, t % 60))
    print ('f-macro: {0}'.format(f1_score(y_test, y_predict, average = 'macro')))
    print ('f-micro: {0}'.format(f1_score(y_test, y_predict, average = 'micro')))
    print('~~~~~count vectorizer~~~~~~')
    start_time = time.time()
    model =  KNeighborsClassifier(n_neighbors=3)
    model.fit(X4, y4)
    t = int(time.time() - start_time)
    print ('fit time = {0} {1}'.format(t / 60, t % 60))
    start_time = time.time()
    y_predict = model.predict(X4_test)
    t = int(time.time() - start_time) 
    print ('predict time = {0} {1}'.format(t / 60, t % 60)) 
    print ('f-macro: {0}'.format(f1_score(y4_test, y_predict, average = 'macro')))
    print ('f-micro: {0}'.format(f1_score(y4_test, y_predict, average = 'micro')))
    print ('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
    print ('kNN 7: ')
    start_time = time.time()
    model = KNeighborsClassifier(n_neighbors=7)
    model.fit(X, y)
    t = int(time.time() - start_time)
    print ('fit time = {0} {1}'.format(t / 60, t % 60))
    start_time = time.time()
    y_predict = model.predict(X_test)
    t = int(time.time() - start_time)
    print ('predict time = {0} {1}'.format(t / 60, t % 60))
    print ('f-macro: {0}'.format(f1_score(y_test, y_predict, average = 'macro')))
    print ('f-micro: {0}'.format(f1_score(y_test, y_predict, average = 'micro')))
    print('~~~~~count vectorizer~~~~~~')
    start_time = time.time()
    model =  KNeighborsClassifier(n_neighbors=7)
    model.fit(X4, y4)
    t = int(time.time() - start_time)
    print ('fit time = {0} {1}'.format(t / 60, t % 60))
    start_time = time.time()
    y_predict = model.predict(X4_test)
    t = int(time.time() - start_time) 
    print ('predict time = {0} {1}'.format(t / 60, t % 60)) 
    print ('f-macro: {0}'.format(f1_score(y4_test, y_predict, average = 'macro')))
    print ('f-micro: {0}'.format(f1_score(y4_test, y_predict, average = 'micro')))
    print ('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
    print ('kNN 30: ')
    start_time = time.time()
    model = KNeighborsClassifier(n_neighbors=30)
    model.fit(X, y)
    t = int(time.time() - start_time)
    print ('fit time = {0} {1}'.format(t / 60, t % 60))
    start_time = time.time()
    y_predict = model.predict(X_test)
    t = int(time.time() - start_time)
    print ('predict time = {0} {1}'.format(t / 60, t % 60))
    print ('f-macro: {0}'.format(f1_score(y_test, y_predict, average = 'macro')))
    print ('f-micro: {0}'.format(f1_score(y_test, y_predict, average = 'micro')))
    print('~~~~~count vectorizer~~~~~~')
    start_time = time.time()
    model =  KNeighborsClassifier(n_neighbors=30)
    model.fit(X4, y4)
    t = int(time.time() - start_time)
    print ('fit time = {0} {1}'.format(t / 60, t % 60))
    start_time = time.time()
    y_predict = model.predict(X4_test)
    t = int(time.time() - start_time) 
    print ('predict time = {0} {1}'.format(t / 60, t % 60)) 
    print ('f-macro: {0}'.format(f1_score(y4_test, y_predict, average = 'macro')))
    print ('f-micro: {0}'.format(f1_score(y4_test, y_predict, average = 'micro')))
    print ('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
    print ('Decision Tree: ')
    start_time = time.time()
    model = DecisionTreeClassifier()
    model.fit(X, y)
    t = int(time.time() - start_time)
    print ('fit time = {0} {1}'.format(t / 60, t % 60))
    start_time = time.time()
    y_predict = model.predict(X_test)
    t = int(time.time() - start_time)
    print ('predict time = {0} {1}'.format(t / 60, t % 60))
    print ('f-macro: {0}'.format(f1_score(y_test, y_predict, average = 'macro')))
    print ('f-micro: {0}'.format(f1_score(y_test, y_predict, average = 'micro')))
    print('~~~~~count vectorizer~~~~~~')
    start_time = time.time()
    model =  DecisionTreeClassifier()
    model.fit(X4, y4)
    t = int(time.time() - start_time)
    print ('fit time = {0} {1}'.format(t / 60, t % 60))
    start_time = time.time()
    y_predict = model.predict(X4_test)
    t = int(time.time() - start_time) 
    print ('predict time = {0} {1}'.format(t / 60, t % 60)) 
    print ('f-macro: {0}'.format(f1_score(y4_test, y_predict, average = 'macro')))
    print ('f-micro: {0}'.format(f1_score(y4_test, y_predict, average = 'micro')))
    print ('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
    print ('Decision Tree max depth = 5: ')
    start_time = time.time()
    model = DecisionTreeClassifier(max_depth = 5)
    model.fit(X, y)
    t = int(time.time() - start_time)
    print ('fit time = {0} {1}'.format(t / 60, t % 60))
    start_time = time.time()
    y_predict = model.predict(X_test)
    t = int(time.time() - start_time)
    print ('predict time = {0} {1}'.format(t / 60, t % 60))
    print ('f-macro: {0}'.format(f1_score(y_test, y_predict, average = 'macro')))
    print ('f-micro: {0}'.format(f1_score(y_test, y_predict, average = 'micro')))
    print('~~~~~count vectorizer~~~~~~')
    start_time = time.time()
    model =  DecisionTreeClassifier(max_depth = 5)
    model.fit(X4, y4)
    t = int(time.time() - start_time)
    print ('fit time = {0} {1}'.format(t / 60, t % 60))
    start_time = time.time()
    y_predict = model.predict(X4_test)
    t = int(time.time() - start_time) 
    print ('predict time = {0} {1}'.format(t / 60, t % 60)) 
    print ('f-macro: {0}'.format(f1_score(y4_test, y_predict, average = 'macro')))
    print ('f-micro: {0}'.format(f1_score(y4_test, y_predict, average = 'micro')))
    print ('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
    print ('AdaBoost DecisionTree max_depth = 2, n = 100: ')
    start_time = time.time()
    model = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=100,
    learning_rate=1)
    model.fit(X, y)
    t = int(time.time() - start_time)
    print ('fit time = {0} {1}'.format(t / 60, t % 60))
    start_time = time.time()
    y_predict = model.predict(X_test)
    t = int(time.time() - start_time)
    print ('predict time = {0} {1}'.format(t / 60, t % 60))
    print ('f-macro: {0}'.format(f1_score(y_test, y_predict, average = 'macro')))
    print ('f-micro: {0}'.format(f1_score(y_test, y_predict, average = 'micro')))
    print('~~~~~count vectorizer~~~~~~')
    start_time = time.time()
    model =  AdaBoostClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=100,
    learning_rate=1)
    model.fit(X4, y4)
    t = int(time.time() - start_time)
    print ('fit time = {0} {1}'.format(t / 60, t % 60))
    start_time = time.time()
    y_predict = model.predict(X4_test)
    t = int(time.time() - start_time) 
    print ('predict time = {0} {1}'.format(t / 60, t % 60)) 
    print ('f-macro: {0}'.format(f1_score(y4_test, y_predict, average = 'macro')))
    print ('f-micro: {0}'.format(f1_score(y4_test, y_predict, average = 'micro')))