In [None]:
import pandas as pd
import numpy as np
import math
from sklearn.pipeline import Pipeline
from sklearn.base import ClassifierMixin, BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer

In [None]:
# Load the training table, clean it, and split it into train / hold-out parts.
# A fixed seed keeps the 70/30 split (and therefore the reported score)
# identical across "Restart Kernel -> Run All".
RANDOM_STATE = 42

columns_to_drop = ["Name"]  # this column is not used as a feature
string_columns = ['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'VIP', 'Destination']

data = (
    pd.read_csv('train.csv')
    .fillna(value=0)                    # missing entries become 0 in every column
    .drop(columns=columns_to_drop)
    # After the cast, a filled-in 0 shows up as the category "0" in these columns.
    .astype({column: "string" for column in string_columns})
)

# 70% of the rows for training; the remaining rows form the hold-out set.
train = data.sample(frac=0.70, random_state=RANDOM_STATE)
test = data.drop(train.index)
assert len(train) + len(test) == len(data)

train_xs = train.drop(columns="Transported")
train_ys = train['Transported']
test_xs = test.drop(columns="Transported")
test_ys = test['Transported']


#train_xs.dtypes


In [None]:
class myClassifier(ClassifierMixin, BaseEstimator):
    """Naive Bayes classifier that treats every feature value as a category.

    For each class c and each feature f, the conditional probability of an
    observed value v is estimated with additive smoothing as

        P(f = v | c) = (count(f == v and class == c) + alpha) / (count(c) + alpha * D)

    and a sample is assigned to the class with the largest
    log2 P(c) + sum_f log2 P(f = v_f | c).

    Values are matched by exact equality, so a value never seen in training
    for a class contributes a count of 0.

    Parameters
    ----------
    alpha : float, default=0.01
        Additive smoothing constant; must be positive.
    D : int, default=2
        Multiplier of ``alpha`` in the smoothing denominator (the assumed
        number of distinct values per feature).

    Attributes
    ----------
    train_xs : pandas.DataFrame or None
        Training features with a 0..n-1 row index (set by ``fit``).
    train_ys : pandas.DataFrame or None
        Single-column frame ``'Transported'`` holding the training labels,
        row-aligned with ``train_xs`` (set by ``fit``).
    class_count : pandas.Series or dict
        Number of training rows per class label (empty dict before fitting).
    classes_ : numpy.ndarray
        Class labels in the order of ``class_count`` (set by ``fit``).
    """

    def __init__(self, alpha=0.01, D=2):
        self.alpha = alpha
        self.D = D
        self.train_xs = None
        self.train_ys = None
        self.class_count = {}

    @staticmethod
    def _to_frame(X):
        """Return X as a dense DataFrame whose rows are numbered 0..n-1."""
        if hasattr(X, "toarray"):
            # Sparse matrices (e.g. one-hot encoder output) must be densified
            # before pandas can hold them as a regular numeric frame.
            X = X.toarray()
        return pd.DataFrame(X).reset_index(drop=True)

    def _build_value_counts(self):
        """Count, for every class and feature, how often each value occurs."""
        labels = self.train_ys['Transported'].to_numpy()
        tables = {}
        for target, _ in self.class_count.items():
            # Positional boolean mask: rows of train_xs whose label is `target`.
            class_rows = self.train_xs[labels == target]
            tables[target] = {
                feature: class_rows[feature].value_counts()
                for feature in class_rows.columns
            }
        return tables

    def _tables(self):
        """Return the per-class value-count tables, building them if needed."""
        if getattr(self, "_value_counts", None) is None:
            self._value_counts = self._build_value_counts()
        return self._value_counts

    def countClass(self):
        """Store the number of training rows per class in ``class_count``."""
        self.class_count = self.train_ys['Transported'].value_counts()
        # Cached count tables belong to the previous training data.
        self._value_counts = None

    def p_conditional(self, feature, value, target):
        """Return the smoothed estimate of P(feature == value | class == target)."""
        count = self.class_count[target]
        count_feature_given_class = self._tables()[target][feature].get(value, 0)
        return (count_feature_given_class + self.alpha) / (count + (self.alpha * self.D))

    def p_class(self, target):
        """Return the prior P(class == target) as a fraction of training rows."""
        numerator = self.class_count[target]
        denominator = len(self.train_xs)
        return numerator / denominator

    def fit(self, X, y):
        """Store the training data and pre-compute all counts.

        Parameters
        ----------
        X : array-like or sparse matrix of shape (n_samples, n_features)
        y : array-like of length n_samples

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``alpha`` is not positive or X and y differ in length.
        """
        if self.alpha <= 0:
            raise ValueError("alpha must be positive, got %r" % (self.alpha,))
        train_xs = self._to_frame(X)
        # Work on the raw label values: keeping y's own index (or relying on
        # its Series name) would misalign labels with the re-indexed features.
        labels = np.asarray(y).ravel()
        if len(labels) != len(train_xs):
            raise ValueError(
                "X and y have inconsistent lengths: %d != %d" % (len(train_xs), len(labels))
            )
        self.train_xs = train_xs
        self.train_ys = pd.DataFrame({'Transported': labels})
        self.countClass()
        self._value_counts = self._build_value_counts()
        self.classes_ = np.asarray([target for target, _ in self.class_count.items()])
        return self

    def predict(self, X):
        """Predict the most probable class for every row of X.

        Classes are compared by their log2 joint score directly; the scores
        are never exponentiated, so long products of small probabilities
        cannot underflow to zero. Ties go to the class listed first in
        ``class_count``.

        Parameters
        ----------
        X : array-like or sparse matrix with the same columns as the training data

        Returns
        -------
        numpy.ndarray
            One predicted label per row of X.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If called before ``fit``.
        """
        if self.train_xs is None or len(self.class_count) == 0:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError("This myClassifier instance is not fitted yet; call fit first.")

        test_xs = self._to_frame(X)
        tables = self._tables()
        class_totals = list(self.class_count.items())
        n_train = len(self.train_xs)

        log_scores = np.empty((len(test_xs), len(class_totals)))
        for position, (target, class_total) in enumerate(class_totals):
            smoothed_total = class_total + (self.alpha * self.D)
            # Start from the log prior, then add one log-likelihood per feature.
            scores = np.full(len(test_xs), math.log2(class_total / n_train))
            for feature in self.train_xs.columns:
                # Look up the training count of each test value; unseen values count 0.
                matches = (
                    tables[target][feature]
                    .reindex(test_xs[feature].to_numpy(), fill_value=0)
                    .to_numpy()
                )
                scores += np.log2((matches + self.alpha) / smoothed_total)
            log_scores[:, position] = scores

        labels = np.asarray([target for target, _ in class_totals])
        # argmax returns the first maximum, matching a strict ">" comparison loop.
        return labels[log_scores.argmax(axis=1)]
# Columns that are one-hot encoded vs. columns that are scaled to [0, 1].
# NOTE(review): 'PassengerId' and 'Cabin' look like near-unique identifiers;
# one-hot encoding them adds one column per distinct value - confirm they
# are meant to be features.
categorical_columns = ['PassengerId', 'HomePlanet', 'Cabin', 'Destination', 'CryoSleep', 'VIP']

numerical_columns = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

scaler = MinMaxScaler()  # not used by the pipeline below, which builds its own scaler
classifier = myClassifier()

numeric_transformer = Pipeline(steps=[
    ('scaler', MinMaxScaler())
])

categorical_transformer = Pipeline(steps=[
    # Categories not seen during fit are encoded as all zeros instead of raising.
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_columns),
        ('cat', categorical_transformer, categorical_columns),
    ],
    # The classifier wraps its input in pd.DataFrame(...), which needs a dense
    # array; with the default threshold the one-hot output would be stacked
    # into a scipy sparse matrix. 0 forces a dense result (memory-hungry when
    # there are many one-hot columns).
    sparse_threshold=0,
)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classify', classifier)
])

pipeline.fit(train_xs, train_ys)

In [None]:
# Hold-out accuracy: fraction of test rows whose predicted label equals the
# true label (what the classifier's inherited score() reports).
accuracy_score(test_ys, pipeline.predict(test_xs))