In [109]:
from copy import copy
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from abc import ABCMeta, abstractmethod

%matplotlib inline
import seaborn as sns

In [2]:
df_train = pd.read_csv('dataset_train.csv', index_col='Index')

##### transoform birthdate and add column year (year of study)

In [9]:
df_train['year'] = df_train['Birthday'].apply(lambda x: int(x.split('-')[0]))
df_train['year'] = df_train['year'] - df_train['year'].min()
df_train.head()

Unnamed: 0_level_0,Hogwarts House,First Name,Last Name,Birthday,Best Hand,Arithmancy,Astronomy,Herbology,Defense Against the Dark Arts,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Care of Magical Creatures,Charms,Flying,year
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,Ravenclaw,Tamara,Hsu,2000-03-30,Left,58384.0,-487.886086,5.72718,4.878861,4.722,272.035831,532.484226,5.231058,1039.788281,3.790369,0.715939,-232.79405,-26.89,4
1,Slytherin,Erich,Paredes,1999-10-14,Right,67239.0,-552.060507,-5.987446,5.520605,-5.612,-487.340557,367.760303,4.10717,1058.944592,7.248742,0.091674,-252.18425,-113.45,3
2,Ravenclaw,Stephany,Braun,1999-11-03,Left,23702.0,-366.076117,7.725017,3.660761,6.14,664.893521,602.585284,3.555579,1088.088348,8.728531,-0.515327,-227.34265,30.42,3
3,Gryffindor,Vesta,Mcmichael,2000-08-19,Left,32667.0,697.742809,-6.497214,-6.977428,4.026,-537.001128,523.982133,-4.809637,920.391449,0.821911,-0.01404,-256.84675,200.64,4
4,Gryffindor,Gaston,Gibbs,1998-09-27,Left,60158.0,436.775204,-7.820623,,2.236,-444.262537,599.324514,-3.444377,937.434724,4.311066,-0.26407,-256.3873,157.98,2


###### FrameHandler class has functionality of dataframe data preparation

In [218]:
class FrameHandler:
    
    @classmethod
    def cut_features(cls, df: pd.DataFrame, features=[]) -> pd.DataFrame:
        return df.drop(columns=features, inplace=False)
    
    @classmethod
    def normalize_data(cls, df: pd.DataFrame, columns=[]) -> pd.DataFrame:
        """
        Normalize all columns data if colums argument doesn't set.
        Otherwice - normizes only defined columns.
        """
        normalized_df = df.copy()    
        if columns:
            normalized_df[columns] = cls.__normalize(normalized_df[columns])
        else:
            normalized_df = cls.__normalize(normalized_df)
        return normalized_df
    
    @classmethod
    def __normalize(cls, df):
        return (df - df.min()) / (df.max() - df.min())
    
    @classmethod
    def filter_numeric(cls, df: pd.DataFrame) -> pd.DataFrame:
        return df._get_numeric_data()
    
    @classmethod
    def prepend_ones(cls, df: pd.DataFrame, column_name: str='bias') -> pd.DataFrame:
        rows, _ = df.shape
        ones = np.ones(rows)
        bias_df = pd.DataFrame({column_name: ones}, index=df.index)
        concatenated_df = pd.concat([bias_df, df], axis=1)
        return concatenated_df
    

In [219]:
def prepare_dataframe(df: pd.DataFrame, drop_features=[], normalize=True) -> pd.DataFrame:
    """
    Prepare dataframe for model:
        - drop features were received
        - create dummies
        - filter only numeric values
        - clean datafrate from NaN values
        - normalize data if "normalize" is True
    """
    prepared_df = FrameHandler.cut_features(df_train, drop_features)
    
    # create dummie variables for hands and houses
    hand_dummies_df = pd.get_dummies(prepared_df['Best Hand'])
    house_dummies_df = pd.get_dummies(prepared_df['Hogwarts House'])
    prepared_df = pd.concat([prepared_df, hand_dummies_df, house_dummies_df], axis=1)
    
    # filter only numeric columns and drop NaN
    prepared_df = FrameHandler.filter_numeric(prepared_df)
    prepared_df.dropna(how='any', inplace=True)

    # normalize data
    if normalize:
        prepared_df = FrameHandler.normalize_data(prepared_df)
    
    prepared_df = FrameHandler.prepend_ones(prepared_df)
    return prepared_df


In [220]:
# Main functionaliity
UNNESSESARY_FEATURES = ['Defense Against the Dark Arts', 'Care of Magical Creatures', 'Arithmancy', 'year']


# import data
df_train = pd.read_csv('dataset_train.csv', index_col='Index')
# craete year columns that shouws us cources of students
df_train['year'] = df_train['Birthday'].apply(lambda x: int(x.split('-')[0]))
df_train['year'] = df_train['year'] - df_train['year'].min()

prepared_df = prepare_dataframe(df=df_train, drop_features=UNNESSESARY_FEATURES)
prepared_df.head(12)
# prepared_df.shape

Unnamed: 0_level_0,bias,Astronomy,Herbology,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Charms,Flying,Left,Right,Gryffindor,Hufflepuff,Ravenclaw,Slytherin
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,1.0,0.241486,0.778094,0.716936,0.616003,0.538679,0.672324,0.708932,0.431946,0.793213,0.335649,1.0,0.0,0.0,0.0,1.0,0.0
1,1.0,0.209123,0.209214,0.166054,0.260548,0.181768,0.617016,0.810917,0.633512,0.248862,0.147696,0.0,1.0,0.0,0.0,0.0,1.0
2,1.0,0.302914,0.875112,0.792526,0.799895,0.690568,0.589872,0.966075,0.71976,0.946253,0.46009,1.0,0.0,0.0,0.0,1.0,0.0
3,1.0,0.839396,0.184459,0.679834,0.237302,0.520257,0.178215,0.073278,0.258934,0.11797,0.8297,1.0,0.0,1.0,0.0,0.0,0.0
4,1.0,0.707791,0.120192,0.584413,0.280712,0.683503,0.2454,0.164015,0.462294,0.130868,0.73707,1.0,0.0,1.0,0.0,0.0,0.0
5,1.0,0.178044,0.291683,0.113812,0.28224,0.243393,0.679667,0.778445,0.895932,0.367859,0.318713,0.0,1.0,0.0,0.0,0.0,1.0
8,1.0,0.737857,0.246682,0.777334,0.195384,0.61529,0.10986,0.099174,0.274357,0.246298,0.924089,1.0,0.0,1.0,0.0,0.0,0.0
9,1.0,0.753389,0.884688,0.644117,0.302319,0.124816,0.659901,0.71759,0.331609,0.462473,0.393842,0.0,1.0,0.0,1.0,0.0,0.0
10,1.0,0.792593,0.766294,0.75084,0.240207,0.435518,0.695384,0.690283,0.752667,0.515352,0.534329,0.0,1.0,0.0,1.0,0.0,0.0
11,1.0,0.708689,0.758346,0.625886,0.245934,0.288595,0.651182,0.746059,0.582702,0.452979,0.438898,1.0,0.0,0.0,1.0,0.0,0.0


##### Logistic regression class

In [352]:
class LogisticRegression(metaclass=ABCMeta):
    
    def __init__(self, df: pd.DataFrame=None):
        self.frame = df
        
    @property
    def frame(self):
        if self.__frame is None:
            return NotImplemented
        return self.__frame
    
    @frame.setter
    def frame(self, df: pd.DataFrame):
        if df is not None and not isinstance(df, pd.DataFrame):
            raise Exception('Wrong format. Should be pd.DataFrame object')
        self.__frame = None
        if df is not None:
            self.__frame = df.copy()
    
    @property
    def y(self):
        if self.__y is None:
            return NotImplemented
        return self.__y
    
    @y.setter
    def y(self, y):
        self.__y = y
        
    @property
    def X(self):
        if self.__X is None:
            return NotImplemented
        return self.__X
    
    @X.setter
    def X(self, X):
        if X is not None and not isinstance(X, pd.DataFrame):
            raise Exception('Wrong format. Should be pd.DataFrame object')
        self.__X = X
        
    @property
    def theta(self):
        if self.__theta is None:
            return NotImplemented
        return self.__theta
        
    @theta.setter
    def theta(self, theta):
        if theta is not None and not isinstance(theta, pd.DataFrame):
            raise Exception('Wrong format. Should be pd.DataFrame object')
        self.__theta = theta

    def set_target_column(self, column_name: str):
        """
        According target column and set X, y, theta values.
        """
        columns = list(self.frame.columns.values)
        if column_name not in columns:
            raise Exception(f'No <{column_name}> column in dataframe')
        indx = columns.index(column_name)
        columns.pop(indx)
        self.y = pd.DataFrame(self.frame[column_name])
        self.X = pd.DataFrame(self.frame[columns])
        
        theta_names = [f'theta_{i}' for i in range(len(columns))]
        theta_shape = (1, len(theta_names))
        self.theta = pd.DataFrame(data=np.zeros(theta_shape), columns=theta_names)

    @staticmethod
    def sigmoid(z):
        return 1 / (1 + np.exp(-z))
    
#     def gradient_step(self, learning_rate: float, loss: np.ndarray, rows: int, X: np.ndarray) -> np.ndarray:
#         s = X.T.dot(loss)
#         delta_W = 2 * (learning_rate * s / rows)
#         return self.W - delta_W
    
    def cost_gradient(theta, X, y):
        predictions = self.sigmoid(X @ theta)
        return X.T @ (predictions - y) / len(y)
    
    # TODO: Investigate why predicted values set to NaN
    def cost(self, theta, X, y):
        predictions = self.sigmoid(X @ theta)
        predictions[predictions == 1] = 0.999
        print(predictions[predictions != 0.5])
#         print(type(-y.values[0][0]))
#         print(np.log(predictions).values[0][0])
        error = -y * np.log(predictions) - (1 - y) * np.log(1 - predictions)
        return sum(error) / len(y)
    
    @staticmethod
    @abstractmethod
    def gradient_algorithm(cost: callable, initial_theta, cost_gradient: callable, X, y):
        """
        Minimize a function using a gradient algorithm.
        return: Vector of result weights for the model 
        """
        # TODO: Gradient algorithm should be implemented
        pass

    

In [353]:
class LogisticRegressionBinary(LogisticRegression):
    
    @staticmethod
    def gradient_algorithm(cost: callable, initial_theta, cost_gradient, X, y):
        """
        Minimize a function using a gradient algorithm.
        return: Vector of result weights for the model 
        """
        prev_cost = cost(initial_theta, X, y)
        prev_theta = initial_theta
        while True:
            theta = prev_theta - 0.01 * cost_gradient(theta, X, y)
            current_cost = cost(theta, X, y)
            
            print('cost_diff=', abs(current_cost - prev_cost))
            if current_cost > prev_cost:
                print('current_cost > prev_cost')
                break
            # add conditions if nessesary
            prev_cost = current_cost
            prev_theta = theta

###### test for binary regression

In [354]:
binary_df = FrameHandler.cut_features(prepared_df, ['Hufflepuff', 'Ravenclaw', 'Slytherin'])
model = LogisticRegressionBinary(binary_df)
model.set_target_column('Gryffindor')
model.frame.head(12)
model.theta
# model.X.dot(model.theta.values.T)
# model.X.dot(model.theta.T)
# print(model.X.shape)
# print(model.theta.T.shape)
LogisticRegressionBinary().gradient_algorithm(model.cost, model.theta.values.T, model.cost_gradient, model.X, model.y)

        0
Index    
0     NaN
1     NaN
2     NaN
3     NaN
4     NaN
5     NaN
8     NaN
9     NaN
10    NaN
11    NaN
12    NaN
13    NaN
14    NaN
15    NaN
16    NaN
17    NaN
18    NaN
19    NaN
20    NaN
22    NaN
23    NaN
24    NaN
25    NaN
26    NaN
27    NaN
28    NaN
29    NaN
30    NaN
31    NaN
32    NaN
...    ..
1563  NaN
1564  NaN
1566  NaN
1567  NaN
1568  NaN
1571  NaN
1572  NaN
1573  NaN
1574  NaN
1576  NaN
1577  NaN
1579  NaN
1581  NaN
1582  NaN
1583  NaN
1584  NaN
1585  NaN
1586  NaN
1587  NaN
1589  NaN
1590  NaN
1591  NaN
1592  NaN
1593  NaN
1594  NaN
1595  NaN
1596  NaN
1597  NaN
1598  NaN
1599  NaN

[1333 rows x 1 columns]


TypeError: unsupported operand type(s) for +: 'int' and 'str'

In [294]:
- 0.0 * -0.023232


0.0

In [None]:
(1333, 13) * (1, 13)

In [67]:
from sklearn.linear_model import LogisticRegression

df_ = df_train.dropna(how='any', inplace=False)
model = LogisticRegression()
model.fit(df_.drop(columns='Defense Against the Dark Arts')._get_numeric_data(), df_['house_int'])



ModuleNotFoundError: No module named 'sklearn'

In [None]:
model.score(df_.drop(columns='Defense Against the Dark Arts')._get_numeric_data(), df_['house_int'])