# In [1]:

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
import math
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')


# Labels applied, in file order, to the columns of the CSV files. The last
# entry is the target label column.
col_names = [
    "ID",
    "Gender", "Age", "Height", "Weight",
    "Family History",
    "FAVC", "FCVC", "NCP", "CAEC",
    "SMOKE", "CH2O", "SCC", "FAF", "TUE",
    "CALC", "MTRANS",
    "Verdict",
]


class data:
    """Load the train/test CSV files and prepare them for model fitting.

    Attributes:
        train_df: Training frame as read from disk (features plus "Verdict").
        test_df: Test features. A DataFrame until ``scaling()`` replaces it
            with the scaler's output.
        trainX: Training features (``train_df`` without "Verdict"). Replaced
            by the scaler's output in ``scaling()``.
        trainY: Training labels; integer codes once ``encode()`` has run.
        LE: LabelEncoder used for the target labels only, so that
            ``decode()`` can map predictions back to the original labels.
    """

    # String-valued feature columns that encode() converts to integer codes.
    CATEGORICAL_COLUMNS = (
        "Gender", "Family History", "FAVC", "CAEC",
        "SMOKE", "SCC", "CALC", "MTRANS",
    )

    train_df = None
    test_df = None
    trainX = None
    trainY = None
    LE = None

    def __init__(self,
                 train_path="/kaggle/input/playground-series-s4e2/train.csv",
                 test_path="/kaggle/input/playground-series-s4e2/test.csv"):
        """Read both CSV files and split the training frame into X and y.

        Args:
            train_path: CSV file whose columns correspond to ``col_names``.
            test_path: CSV file whose columns correspond to ``col_names``
                without the trailing "Verdict" column.
        """
        # header=0 replaces each file's own header row with our column labels.
        self.train_df = pd.read_csv(train_path, names=col_names, header=0)
        self.test_df = pd.read_csv(test_path, names=col_names[:-1], header=0)
        self.trainY = self.train_df["Verdict"]
        # NOTE(review): only "Verdict" is removed, so the "ID" column stays in
        # trainX/test_df and is passed to the model as a feature - confirm
        # this is intended.
        self.trainX = self.train_df.drop("Verdict", axis=1)
        # Per-instance encoder so separate instances never share fitted state.
        self.LE = LabelEncoder()

    def encode(self):
        """Replace categorical strings and target labels with integer codes.

        Each categorical feature gets its own encoder, fitted on the values
        of the train and test frames together. That guarantees a given string
        maps to the same code in both frames, and that a category occurring
        in only one of the files still receives a code. ``self.LE`` is fitted
        on the target alone.
        """
        for column in self.CATEGORICAL_COLUMNS:
            encoder = LabelEncoder()
            encoder.fit(
                self.trainX[column].tolist() + self.test_df[column].tolist())
            self.trainX[column] = encoder.transform(self.trainX[column])
            self.test_df[column] = encoder.transform(self.test_df[column])
        self.trainY = self.LE.fit_transform(self.trainY)

    def decode(self, y):
        """Map integer class codes back to the original target labels.

        Args:
            y: Array-like of codes produced by the target encoder.

        Returns:
            The corresponding original labels.
        """
        return self.LE.inverse_transform(y)

    def scaling(self):
        """Standardize the features with statistics from the training set.

        The scaler is fitted on ``trainX`` only and then applied unchanged to
        ``test_df``; refitting on the test data would put the two sets on
        different scales. Both attributes are replaced by the scaler's output.
        """
        scaler = StandardScaler()
        self.trainX = scaler.fit_transform(self.trainX)
        self.test_df = scaler.transform(self.test_df)
# Build the dataset at module level: read both CSV files, convert the string
# columns to integer codes, then standardize the features. encode() must run
# before scaling() because the scaler is applied to every feature column.
train = data()
train.encode()
train.scaling()

# Hyper-parameter grids, one per candidate model. Each maps a constructor
# argument name to the list of values to try.

KNN_PARAMS = {
    "n_neighbors": [3, 5, 7, 9, 11],
    "weights": ["uniform", "distance"],
    "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
    "leaf_size": [10, 20, 30, 40],
    "n_jobs": [-1],
    # 'l1' and 'l2' are scikit-learn aliases for 'manhattan' and 'euclidean';
    # listing them as well would only repeat identical fits.
    "metric": ["euclidean", "manhattan"],
}

SVM_PARAMS = {
    "C": [0.1, 1, 10, 100],
    "gamma": [1, 0.1, 0.01, 0.001],
    "kernel": ["rbf", "sigmoid"],
}

# Empty grid: the naive Bayes model is evaluated with its default settings.
BAYES_PARAMS = {}

LOGIT_PARAMS = {
    "C": [0.1, 1, 10, 100],
    "solver": ["lbfgs", "liblinear"],
    "max_iter": [100, 200, 300],
}

FOREST_PARAMS = {
    "n_estimators": [100, 200, 300],
    "max_depth": [2, 5, 10, 15],
    "min_samples_split": [2, 5, 10, 15],
    "min_samples_leaf": [1, 2, 5, 10],
}
# Candidate estimators, display names and parameter grids are kept as three
# parallel lists: position i in each list refers to the same candidate.
MODELS = [
    KNeighborsClassifier(),
    SVC(),
    GaussianNB(),
    LogisticRegression(),
    RandomForestClassifier(),
]
# Individual handles to the same estimator objects held in MODELS.
KNN, SVM, BAYES, LOGIT, FOREST = MODELS
MODEL_NAMES = [
    'K Nearest Neighbors',
    'Support Vector Machine',
    'Naive Bayes',
    'Logistic Regression',
    'Random Forest',
]
PARAMS = [
    KNN_PARAMS,
    SVM_PARAMS,
    BAYES_PARAMS,
    LOGIT_PARAMS,
    FOREST_PARAMS,
]
# Starting values for tracking the best score / estimator found so far.
BEST_SCORES, BEST_MODEL = 0, None

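# Model selection by exhaustive grid search - currently disabled; BEST_MODEL
# is set explicitly further down instead.
# for i in range(len(MODELS)):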
#     grid = GridSearchCV(MODELS[i], PARAMS[i], cv=StratifiedKFold(
#         n_splits=5, shuffle=True, random_state=42), scoring='accuracy', n_jobs=-1)
#     grid.fit(train.trainX, train.trainY)
#     print(MODEL_NAMES[i])
#     print('Best Score: {:.2f}'.format(grid.best_score_))
#     print('Best Parameters: {}'.format(grid.best_params_))
#     print('Best Estimator: {}'.format(grid.best_estimator_))
#     print('-'*50)
#     if grid.best_score_ > BEST_SCORES:
#         BEST_SCORES = grid.best_score_
#         BEST_MODEL = grid.best_estimator_

# Final model: a random forest with explicitly chosen hyper-parameters,
# fitted on the full prepared training set.
BEST_MODEL = RandomForestClassifier(max_depth=20, n_estimators=500, min_samples_leaf=1, min_samples_split=5)
BEST_MODEL.fit(train.trainX, train.trainY)

# Predictions: the model outputs integer class codes, which are mapped back
# to the original label strings.
y_pred = train.decode(BEST_MODEL.predict(train.test_df))

# Write to file: one "id,label" line per prediction.
# NOTE(review): ids are generated as start_index, start_index + 1, ...; this
# assumes the test rows carry consecutive ids beginning at 20758 - confirm
# against test.csv.
# NOTE(review): confirm the header's label column name against the
# competition's sample submission (the target may be spelled 'NObeyesdad').
start_index = 20758
# The context manager closes the file even if a write fails part-way.
with open('/kaggle/working/predictions.csv', 'w') as file:
    file.write('id,Obeyesdad\n')
    file.writelines(
        f'{row_id},{label}\n'
        for row_id, label in enumerate(y_pred, start=start_index)
    )