# Функции потерь и оптимизация

##  1. Импорт необходимых библиотек

In [588]:
!pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip
!pip install xlwt

Collecting https://github.com/pandas-profiling/pandas-profiling/archive/master.zip
  Using cached https://github.com/pandas-profiling/pandas-profiling/archive/master.zip
  Preparing metadata (setup.py) ... [?25ldone


In [604]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import math
from sklearn.metrics import f1_score
import copy

## 2. Обработка данных

In [590]:
# Download the iris CSV (an Rdatasets copy hosted on forge.scilab.org) into a
# DataFrame and preview its first rows.
df=pd.read_csv('https://forge.scilab.org/index.php/p/rdataset/source/file/master/csv/datasets/iris.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,1,5.1,3.5,1.4,0.2,setosa
1,2,4.9,3.0,1.4,0.2,setosa
2,3,4.7,3.2,1.3,0.2,setosa
3,4,4.6,3.1,1.5,0.2,setosa
4,5,5.0,3.6,1.4,0.2,setosa


In [591]:
# Number of rows and columns in the raw dataset.
df.shape

(150, 6)

In [592]:
# Column dtypes: check which columns are numeric and which hold the label.
df.dtypes

Unnamed: 0        int64
Sepal.Length    float64
Sepal.Width     float64
Petal.Length    float64
Petal.Width     float64
Species          object
dtype: object

In [593]:
# Distinct class labels present in the Species column.
df.Species.unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [595]:
# Reduce the task to two classes by removing every 'setosa' row, and discard
# the 'Unnamed: 0' column that came with the CSV.
df = df.loc[df['Species'] != 'setosa'].drop('Unnamed: 0', axis=1)
df.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
50,7.0,3.2,4.7,1.4,versicolor
51,6.4,3.2,4.5,1.5,versicolor
52,6.9,3.1,4.9,1.5,versicolor
53,5.5,2.3,4.0,1.3,versicolor
54,6.5,2.8,4.6,1.5,versicolor


In [596]:
# Feature columns to standardise and the name of the label column.
numeric_cols = ['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width']
y = 'Species'

# Encode the species names as integers; `le` is kept so the labels can be
# decoded again later.
le = LabelEncoder()
df[y] = le.fit_transform(df[y])

# Standardise the numeric features.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split below, so test-set statistics leak into training -- consider fitting
# it on the training part only.
sc = StandardScaler()
df[numeric_cols] = sc.fit_transform(df[numeric_cols])
df.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
50,1.119009,0.990688,-0.250779,-0.653039,0
51,0.209246,0.990688,-0.494254,-0.416431,0
52,0.967382,0.688649,-0.007304,-0.416431,0
53,-1.1554,-1.727663,-1.102941,-0.889647,0
54,0.360873,-0.217468,-0.372516,-0.416431,0


In [597]:
# Separate the feature matrix from the target column, then hold out 20% of
# the rows for testing (fixed seed for reproducibility).
y = df['Species']
X = df.drop(columns='Species')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## 3. Создание модели логистической регрессии

In [600]:
import pandas as pd
import numpy as np

class LogisticRegression:
    """Binary logistic regression fitted with a selectable optimizer.

    An intercept column of ones is prepended to the features, the
    coefficients start at zero and the mean binary cross-entropy is
    minimised. Training runs for at most ``epoches`` iterations and stops
    early as soon as the cost drops below ``_COST_TOLERANCE``.

    Parameters
    ----------
    learnig_rate : float
        Step size used by every optimizer (the spelling is kept so existing
        keyword callers keep working).
    epoches : int
        Maximum number of optimizer iterations.
    method : str
        ``'gradient'`` (batch gradient descent), ``'nesterov'`` (Nesterov
        momentum) or ``'RMSprop'``.

    Attributes
    ----------
    _coef : numpy.ndarray
        ``[intercept, w_1, ..., w_k]`` after :meth:`train`; empty before.
    """

    # Names accepted for ``method``.
    _METHODS = ('gradient', 'nesterov', 'RMSprop')
    # Early-stopping threshold on the mean cross-entropy.
    _COST_TOLERANCE = 0.1
    # Probability at or above which class 1 is predicted.
    _THRESHOLD = 0.5
    # Momentum coefficient for the Nesterov optimizer.
    _MOMENTUM = 0.9
    # Decay rate of the squared-gradient average and the denominator guard
    # used by RMSprop.
    _RMS_DECAY = 0.99
    _RMS_EPS = 0.000001
    # Logits are clipped to +/- this value so exp() cannot overflow.
    _LOGIT_CLIP = 500.0
    # Probabilities are clipped away from 0 and 1 before taking logarithms.
    _PROB_EPS = 1e-15

    def __init__(self, learnig_rate=0.01, epoches=1000, method='gradient'):
        self._lr = learnig_rate
        self._epoches = epoches
        self._method = method
        # Per-instance state; nothing mutable is shared through the class.
        self._coef = np.zeros(0)
        self.__X = None
        self.__y = None
        self.__features = 0

    @staticmethod
    def __with_intercept(X):
        """Return ``X`` as a float array with a leading column of ones.

        ``X`` may be a DataFrame or any 2-D array-like of numbers.
        """
        features = np.asarray(X, dtype=float)
        return np.column_stack((np.ones(features.shape[0]), features))

    def __fit(self, X, y):
        """Store the training data and reset the coefficients to zero.

        Raises
        ------
        ValueError
            If the training set is empty or ``X`` and ``y`` differ in length.
        """
        design = self.__with_intercept(X)
        targets = np.asarray(y, dtype=float).ravel()
        if design.shape[0] == 0:
            raise ValueError("Cannot train on an empty dataset")
        if design.shape[0] != targets.shape[0]:
            raise ValueError(
                "X and y have different numbers of rows: "
                f"{design.shape[0]} != {targets.shape[0]}"
            )
        self.__X = design
        self.__y = targets
        self.__features = design.shape[1]
        self._coef = np.zeros(self.__features)

    def __calc_y(self, design=None, coef=None):
        """Return sigmoid probabilities for ``design`` under ``coef``.

        Both arguments default to the stored training matrix and the current
        coefficients. ``design`` must already contain the intercept column.
        """
        if design is None:
            design = self.__X
        if coef is None:
            coef = self._coef
        # Clipping keeps exp() finite for extreme inputs; the sigmoid is
        # already saturated well inside this range.
        logits = np.clip(design @ coef, -self._LOGIT_CLIP, self._LOGIT_CLIP)
        return 1.0 / (1.0 + np.exp(-logits))

    def __calc_cost(self):
        """Return the mean binary cross-entropy on the training data."""
        # Keep probabilities strictly inside (0, 1) so log() never yields
        # -inf/NaN, which would silently disable the early-stopping test.
        h = np.clip(self.__calc_y(), self._PROB_EPS, 1.0 - self._PROB_EPS)
        y = self.__y
        return -np.mean(y * np.log(h) + (1.0 - y) * np.log(1.0 - h))

    def __calc_gradient(self, coef=None):
        """Return the cost gradient at ``coef`` (current coefficients by default)."""
        residual = self.__calc_y(coef=coef) - self.__y
        return self.__X.T @ residual / self.__X.shape[0]

    def __gradient_descent(self):
        """Plain batch gradient descent with early stopping."""
        for _ in range(self._epoches):
            self._coef = self._coef - self._lr * self.__calc_gradient()
            if self.__calc_cost() < self._COST_TOLERANCE:
                break

    def __nesterov_momentum(self):
        """Gradient descent with Nesterov momentum and early stopping.

        Each step evaluates the gradient at the look-ahead point
        ``coef + mu * v`` and then moves the *current* coefficients by the
        updated velocity.
        """
        mu = self._MOMENTUM
        velocity = np.zeros(self.__features)
        for _ in range(self._epoches):
            lookahead = self._coef + mu * velocity
            velocity = mu * velocity - self._lr * self.__calc_gradient(lookahead)
            # Step from the current position, not from the initial zeros.
            self._coef = self._coef + velocity
            if self.__calc_cost() < self._COST_TOLERANCE:
                break

    def __RMSprop(self):
        """RMSprop: scale each step by a running RMS of past gradients."""
        cache = np.zeros(self.__features)
        for _ in range(self._epoches):
            # One gradient evaluation per step, shared by both updates.
            gradient = self.__calc_gradient()
            cache = (self._RMS_DECAY * cache
                     + (1 - self._RMS_DECAY) * gradient ** 2)
            self._coef = self._coef - self._lr * gradient / (
                np.sqrt(cache) + self._RMS_EPS
            )
            if self.__calc_cost() < self._COST_TOLERANCE:
                break

    def predict(self, X):
        """Return a list of 0/1 class labels, one per row of ``X``.

        A row is labelled 1 when its predicted probability is at least
        ``_THRESHOLD``.
        """
        probabilities = self.__calc_y(self.__with_intercept(X))
        return (probabilities >= self._THRESHOLD).astype(int).tolist()

    def train(self, X, y):
        """Fit the model to features ``X`` and 0/1 targets ``y``.

        Raises
        ------
        ValueError
            If ``method`` is not one of the supported optimizers, or the
            training data is empty or inconsistent in length.
        """
        if self._method not in self._METHODS:
            # Fail loudly instead of leaving an untrained all-zero model.
            raise ValueError(
                f"Установлен неправильный оптимизатор: {self._method!r}"
            )
        self.__fit(X, y)
        if self._method == 'gradient':
            self.__gradient_descent()
        elif self._method == 'nesterov':
            self.__nesterov_momentum()
        else:
            self.__RMSprop()

## 4. Оценка модели с градиентным спуском

In [617]:
# Train with plain gradient descent (learning rate 0.05, at most 100 epochs)
# and report the F1 score on the held-out test split.
gradient_model = LogisticRegression(learnig_rate=0.05, epoches=100, method='gradient')
gradient_model.train(X_train, y_train)
y_pred = gradient_model.predict(X_test)
print('F1 score on testing data: {:.2f}%'.format(f1_score(y_test, y_pred) * 100))

F1 score on testing data: 93.33%


##  5. Оценка модели с nesterov momentum

In [628]:
# Train with Nesterov momentum (learning rate 0.015, at most 100 epochs)
# and report the F1 score on the held-out test split.
nesterov_model = LogisticRegression(learnig_rate=0.015, epoches=100, method='nesterov')
nesterov_model.train(X_train, y_train)
y_pred = nesterov_model.predict(X_test)
print('F1 score on testing data: {:.2f}%'.format(f1_score(y_test, y_pred) * 100))

F1 score on testing data: 93.33%


## 6. Оценка модели с RMSprop

In [743]:
# Train with RMSprop (learning rate 0.001, at most 100 epochs) and report
# the F1 score on the held-out test split.
rms_model = LogisticRegression(learnig_rate=0.001, epoches=100, method='RMSprop')
rms_model.train(X_train, y_train)
y_pred = rms_model.predict(X_test)
print('F1 score on testing data: {:.2f}%'.format(f1_score(y_test, y_pred) * 100))

F1 score on testing data: 94.12%


## 7. Проверка лучших параметров модели на всем датасете

In [744]:
# Build a comparison table (features, true label, RMSprop prediction) for
# every row and score the model on the whole dataset.
# NOTE(review): X includes the rows the model was trained on, so this score
# is not an out-of-sample estimate.
result = copy.deepcopy(X)
result['Species'] = y
# Predict once and reuse the stored column for the metric.
result['predict'] = rms_model.predict(X)
# The label now matches what is measured (previously said "testing data").
print('F1 score on the whole dataset: {:.2f}%'.format(100 * f1_score(y, result['predict'])))

F1 score on testing data: 85.71%


In [714]:
# Map the integer codes in both label columns back to the species names.
for column in ('Species', 'predict'):
    result[column] = le.inverse_transform(result[column])

In [715]:
# Display up to the first 100 rows of the true-vs-predicted comparison table.
result.head(100)


Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species,predict
50,1.119009,0.990688,-0.250779,-0.653039,versicolor,versicolor
51,0.209246,0.990688,-0.494254,-0.416431,versicolor,versicolor
52,0.967382,0.688649,-0.007304,-0.416431,versicolor,versicolor
53,-1.155400,-1.727663,-1.102941,-0.889647,versicolor,versicolor
54,0.360873,-0.217468,-0.372516,-0.416431,versicolor,versicolor
...,...,...,...,...,...,...
145,0.664127,0.386610,0.357908,1.476436,virginica,virginica
146,0.057618,-1.123585,0.114433,0.530003,virginica,virginica
147,0.360873,0.386610,0.357908,0.766611,virginica,virginica
148,-0.094009,1.594766,0.601383,1.476436,virginica,virginica
