### 1

In [1]:
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import heapq

from sklearn.pipeline import make_pipeline

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import dask_searchcv as dcv
from dask.diagnostics import ProgressBar

categories = [
    'sci.electronics',
    'sci.space',
    'sci.med'
]
train_data = fetch_20newsgroups(subset='train', categories=categories, remove=('headers', 'footers', 'quotes'))
test_data = fetch_20newsgroups(subset='test', categories=categories, remove=('headers', 'footers', 'quotes'))

In [2]:
pipeline = make_pipeline(CountVectorizer(min_df=5, ngram_range=(1, 2)), TfidfTransformer(), LogisticRegression())

In [3]:
pipeline.fit(train_data.data, train_data.target);

accuracy_score(pipeline.predict(train_data.data), train_data.target)

0.96962879640044997

In [50]:
accuracy_score(pipeline.predict(test_data.data), test_data.target)

0.82417582417582413

In [61]:
param_grid={"countvectorizer__min_df":[1],
            "countvectorizer__ngram_range":[[1,2]],
            "tfidftransformer__norm":["l2"],
            "logisticregression__C":[0.1],
            "countvectorizer__stop_words":[None,'english'],
            "countvectorizer__max_features":[None],
            "countvectorizer__min_df":[1],
            "countvectorizer__lowercase":['word'],
             "logisticregression__penalty":["l2"]
           }

In [62]:
grid=dcv.GridSearchCV(pipeline,param_grid,scoring='accuracy',cv=3,n_jobs=-1)

In [63]:
with ProgressBar():
    grid.fit(test_data.data,test_data.target)

[########################################] | 100% Completed | 13.2s


In [64]:
grid.best_score_

0.88503803888419275

In [66]:
grid.best_params_

{'countvectorizer__lowercase': 'word',
 'countvectorizer__max_features': None,
 'countvectorizer__min_df': 1,
 'countvectorizer__ngram_range': [1, 2],
 'countvectorizer__stop_words': 'english',
 'logisticregression__C': 0.1,
 'logisticregression__penalty': 'l2',
 'tfidftransformer__norm': 'l2'}

In [67]:
accuracy_score(grid.best_estimator_.predict(test_data.data), test_data.target)

0.9737954353338969

In [None]:
{
    "count_vectorizer_params": 
    {
        "min_df": 5,
        "ngram_range": [1, 2]
    }, 
    "tfidf_transformer_params": {
        "norm": "l1"    
    }, 
    "logistic_regression_params": {
        "C": 1
    }
}

In [3]:
%%time
#Чекер
# с вашими параметрами должен отработать за 1 минуту на машинке для проверки.
# Для сравнения на text_classification_params_example.json чекер работает 15 секунд.
from sklearn.model_selection import cross_val_score
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
import numpy as np
import signal
import os
import json
import sys
import traceback


# SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
SCRIPT_DIR="/Users/roman/DMIA/industry/hw03/"


def signal_handler(signum, frame):
    raise Exception("Timed out!")


class Checker(object):
    def __init__(self):
        self.data = fetch_20newsgroups(
            subset='all', 
            categories=[
                'rec.autos',
                'rec.motorcycles',
                'rec.sport.baseball',
                'rec.sport.hockey'
            ], 
            remove=('headers', 'footers', 'quotes')
        )

    def check(self, params_path):
        try:
            with open(params_path, 'r') as f:
                params = json.load(f)

            signal.signal(signal.SIGALRM, signal_handler)
            signal.alarm(60)
            pipeline = make_pipeline(
                CountVectorizer(**params['count_vectorizer_params']), 
                TfidfTransformer(**params['tfidf_transformer_params']), 
                LogisticRegression(**params['logistic_regression_params'])
            )
            score = np.mean(cross_val_score(
                pipeline, 
                self.data.data, 
                self.data.target,
                scoring='accuracy', 
                cv=3
            ))
        except:
            traceback.print_exception(*sys.exc_info())
            score = None
        
        return score


if __name__ == '__main__':
#     print(Checker().check(SCRIPT_DIR + '/text_classification_params_example.json'))
    print(Checker().check(SCRIPT_DIR + '/text_classification_params_matiiv.json'))

0.854486222831
CPU times: user 8.21 s, sys: 254 ms, total: 8.47 s
Wall time: 7.27 s


### 2

In [1]:
from sklearn.datasets import make_classification
import numpy as np

X_data, y_data = make_classification(
            n_samples=10000, n_features=20, 
            n_classes=2, n_informative=20, 
            n_redundant=0,
            random_state=42
        )

size, dim = X_data.shape

random_gen = np.random.RandomState(777)
w = random_gen.rand(dim)
w0 = random_gen.randn()

In [4]:
1/(1+np.exp(np.dot(w,X_data[0])))

0.021156520492997814

In [5]:
for i in X_data[0]:
    print(np.sum(y_data-1/(1+np.exp(np.sum(X_data*w,axis=1))))*i)

-1410.75174871
529.994857836
294.940626138
-1004.37684016
88.8285496361
271.269035961
886.945728685
-515.747785874
-121.316550995
-563.385556806
88.1041489549
542.690421005
-503.034997813
-464.841847945
608.475646674
648.502572684
-635.588257107
-760.684508686
-441.949373465
-39.0141730084


In [26]:
[np.sum(y_data-1/(1+np.exp(np.sum(X_data*w,axis=1))))*i for i in X_data[0]]

[-1410.7517487083196,
 529.99485783627438,
 294.94062613847609,
 -1004.3768401646312,
 88.828549636126809,
 271.26903596055411,
 886.94572868515763,
 -515.74778587418291,
 -121.31655099507678,
 -563.38555680636284,
 88.104148954863959,
 542.69042100522711,
 -503.03499781269386,
 -464.84184794546883,
 608.47564667385939,
 648.50257268422104,
 -635.58825710742838,
 -760.68450868630907,
 -441.94937346500558,
 -39.014173008365233]

In [6]:
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
import numpy as np
import os
import imp
import signal
import traceback
import sys


# SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
SCRIPT_DIR="/Users/roman/DMIA/industry/hw03/"




def signal_handler(signum, frame):
    raise Exception("Timed out!")


class Checker(object):
    def __init__(self):
        self.X_data, self.y_data = make_classification(
            n_samples=10000, n_features=20, 
            n_classes=2, n_informative=20, 
            n_redundant=0,
            random_state=42
        )
        self.applications = 0

    def check(self, script_path):
        try:
            signal.signal(signal.SIGALRM, signal_handler)
            signal.alarm(200)
            
            algo_impl = imp.load_source('logistic_regression_{}'.format(self.applications), script_path)
            self.applications += 1
            algo = algo_impl.MyLogisticRegression(**algo_impl.LR_PARAMS_DICT)
            return np.mean(cross_val_score(algo, self.X_data, self.y_data, cv=2, scoring='accuracy'))
        except:
            traceback.print_exception(*sys.exc_info())
            return None


if __name__ == '__main__':
#     print(Checker().check(SCRIPT_DIR + '/logistic_regression_example.py'))
#      print(Checker().check(SCRIPT_DIR + '/logistic_regression_matiiv.py'))
#      print(Checker().check(SCRIPT_DIR + '/logistic_regression_matiiv2.py'))
     print(Checker().check(SCRIPT_DIR + '/logistic_regression_likehood.py'))

  ders_w+=[np.sum(-y+1/(1+np.exp(np.sum(x*self.w,axis=1))))*i for i in x_i] # можно np.mean
  ders_w+=[np.sum(-y+1/(1+np.exp(np.sum(x*self.w,axis=1))))*i for i in x_i] # можно np.mean


0.4289


### 3

In [1]:
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
import numpy as np
import os
import imp
import signal
import traceback
import sys
import json


# SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
SCRIPT_DIR="/Users/roman/DMIA/industry/hw03/"




def signal_handler(signum, frame):
    raise Exception("Timed out!")


class Checker(object):
    def __init__(self):
        # ВНИМАНИЕ !!!
        # При тестировании seed будет изменён
        # Не переобучитесь!
        random_gen = np.random.RandomState(42)
        
        weights = (0.05 + random_gen.exponential(0.75, size=15)) * 2
        X_data = random_gen.uniform(0., 4, size=(40, 15))
        errors = random_gen.normal(0., 2., size=40)

        split_pos = 25
        self.X_train = X_data[:split_pos]
        self.errors_train = errors[:split_pos]
        self.X_test = X_data[split_pos:]
        self.errors_test = errors[split_pos:]
        self.weights = weights

        self.applications = 0

    def check(self, script_path):
        try:
            signal.signal(signal.SIGALRM, signal_handler)
            signal.alarm(120)
            algo_impl = imp.load_source('algo_impl_{}'.format(self.applications), script_path)
            self.applications += 1
            algo = algo_impl.Optimizer()
            algo.fit(np.array(self.X_train), np.dot(self.X_train, self.weights) + self.errors_train)
            
            saved_moneys = 0.
            for budget, target_error in zip(self.X_test, self.errors_test):
                origin_budget = np.array(budget)
                optimized_budget = np.array(algo.optimize(origin_budget))

                if ((origin_budget * 0.95 <= optimized_budget) & (optimized_budget <= origin_budget * 1.05)).all():
                    if np.dot(optimized_budget, self.weights) >=  np.dot(origin_budget, self.weights):
                        saved_moneys += np.sum(origin_budget) - np.sum(optimized_budget)

            return saved_moneys
        except:
            traceback.prin
            t_exception(*sys.exc_info())
            return None


if __name__ == '__main__':
#     print(Checker().check(SCRIPT_DIR + '/ad_budget_example.py'))
    print(Checker().check(SCRIPT_DIR + '/ad_budget_matiiv.py'))

1.50448887039
