In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, StackingClassifier

import os, sys

In [2]:
cwd = os.getcwd()
train_data_path = os.path.join(cwd, 'data', 'train.csv')
test_data_path = os.path.join(cwd, 'data', 'test.csv')
sample_submission_path = os.path.join(cwd, 'data', 'sample_submission.csv')

train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)
sample_submission = pd.read_csv(sample_submission_path)

In [3]:
train_data.head()

Unnamed: 0,x_0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,...,x_17,x_18,x_19,x_20,x_21,x_22,x_23,x_24,x_25,labels
0,-0.789364,-1.434296,0.324867,-1.08957,-3.186751,-1.915492,-1.985052,-5.109614,-1.776943,-10.228432,...,-1.14535,-1.157258,-4.935825,1.567903,1.691138,-2.914742,0.713525,1.703421,-0.222027,0.0
1,1.698561,-0.530175,0.756504,-0.849795,0.738859,4.32168,3.322877,-4.769473,-1.148654,-0.623213,...,1.077191,-3.360013,-7.324134,1.074675,-0.984185,-1.361525,2.444832,-1.497029,1.09692,0.0
2,2.480805,-2.933747,1.407295,-0.356059,1.179147,-4.181063,-4.177118,-2.854105,-0.22362,-2.034928,...,-3.219309,-0.026445,-8.659095,-0.152213,0.685907,0.442014,1.818607,2.793273,0.072918,2.0
3,1.457755,-0.106902,-0.852411,1.175998,6.619029,2.156072,-0.777952,5.03149,0.476906,3.009128,...,4.263037,-0.784668,5.11543,1.010681,-0.641215,-6.322318,-0.806044,0.69957,0.260674,2.0
4,1.427555,0.649282,0.254497,-1.064585,0.303576,-4.54524,4.577316,-5.233015,-1.007334,0.51103,...,3.363232,-1.767302,-3.108375,-1.916984,-0.423236,0.631079,-3.378547,-4.165684,0.967436,2.0


In [4]:
train_data.shape

(4800, 27)

In [5]:
test_data.head()

Unnamed: 0,ID,x_0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,...,x_16,x_17,x_18,x_19,x_20,x_21,x_22,x_23,x_24,x_25
0,1,-0.230293,-3.466028,1.511166,0.740295,3.696918,-2.578689,2.263205,-0.126368,1.207075,...,-0.717038,-2.280132,-4.019121,4.175089,-2.939001,0.690836,-1.537785,0.523352,-0.287075,-0.033105
1,2,-0.58931,2.695952,-0.447133,1.742419,-3.912262,7.050236,-2.624268,2.29261,-0.640342,...,-3.313892,-7.084135,-0.161589,-3.913306,-3.592095,-2.974472,2.576795,-1.702104,2.209905,0.618079
2,3,2.070704,-1.921016,1.352349,1.948624,-1.549088,-0.623295,-0.013214,4.281549,-0.569961,...,1.295106,0.363587,2.20761,4.304411,-1.301508,-3.051108,1.138168,-2.822654,-1.628571,-0.441178
3,4,0.130017,2.2257,-0.504748,-0.401777,2.244243,4.770526,1.78905,-1.553924,1.11607,...,8.556711,-5.356854,2.574727,6.959246,0.220325,0.344151,3.047017,1.398412,-0.284969,-0.424696
4,5,-1.718615,0.253217,-0.539986,0.261817,0.246253,-0.502865,-1.19027,-1.416252,-1.735776,...,-4.632323,-1.219645,2.092873,-2.675771,-4.998719,1.639839,-1.2309,2.967112,0.752419,-0.589382


In [6]:
X_train = train_data.iloc[:, :-1]
y_train = train_data.iloc[:, -1]

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

X_test = test_data.iloc[:,1:]
X_test.head()



Unnamed: 0,x_0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,...,x_16,x_17,x_18,x_19,x_20,x_21,x_22,x_23,x_24,x_25
0,-0.230293,-3.466028,1.511166,0.740295,3.696918,-2.578689,2.263205,-0.126368,1.207075,3.171999,...,-0.717038,-2.280132,-4.019121,4.175089,-2.939001,0.690836,-1.537785,0.523352,-0.287075,-0.033105
1,-0.58931,2.695952,-0.447133,1.742419,-3.912262,7.050236,-2.624268,2.29261,-0.640342,-15.37838,...,-3.313892,-7.084135,-0.161589,-3.913306,-3.592095,-2.974472,2.576795,-1.702104,2.209905,0.618079
2,2.070704,-1.921016,1.352349,1.948624,-1.549088,-0.623295,-0.013214,4.281549,-0.569961,1.071801,...,1.295106,0.363587,2.20761,4.304411,-1.301508,-3.051108,1.138168,-2.822654,-1.628571,-0.441178
3,0.130017,2.2257,-0.504748,-0.401777,2.244243,4.770526,1.78905,-1.553924,1.11607,-5.73919,...,8.556711,-5.356854,2.574727,6.959246,0.220325,0.344151,3.047017,1.398412,-0.284969,-0.424696
4,-1.718615,0.253217,-0.539986,0.261817,0.246253,-0.502865,-1.19027,-1.416252,-1.735776,-7.30025,...,-4.632323,-1.219645,2.092873,-2.675771,-4.998719,1.639839,-1.2309,2.967112,0.752419,-0.589382


In [7]:
# Define a function to train the logistic regression model
def train_gaussian_nb(X_train, y_train):
    clf = GaussianNB()
    clf.fit(X_train, y_train)
    return clf

def train_random_forest(X_train, y_train, n_j, pg):
    clf = RandomForestClassifier(random_state=42)
    grid_search = GridSearchCV(clf, pg, cv=3, scoring='accuracy', n_jobs=n_j)
    grid_search.fit(X_train, y_train)

    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    return best_params, best_score

param_grid = {
    'n_estimators': [100,200,300,400,500],
    'max_depth': [10,20,30],
    'min_samples_split': [2,5,10],
    'min_samples_leaf': [1,2,4]
}

best_rf_param, best_rf_score = train_random_forest(X_train=X_train_scaled, y_train=y_train, n_j=4, pg=param_grid)
print(f"Best RF score for given dataset: {best_rf_score}")

Best RF score for given dataset: 0.8066666666666666


In [8]:
def logistic_regression_tuning(X, y):
    # Define the parameter grid for hyperparameter tuning
    param_grid = {
        'solver': ['sag', 'saga', 'newton-cg', 'newton-cholesky'],
        'max_iter': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
        'C': [0.001, 0.01, 0.1, 1, 10]  # Inverse of regularization strength
    }

    # Create a Logistic Regression model
    log_reg = LogisticRegression(random_state=42, max_iter=1000)

    # Create a GridSearchCV object to perform hyperparameter tuning
    grid_search = GridSearchCV(log_reg, param_grid, cv=3, scoring='accuracy', n_jobs=-1)

    # Fit the grid search to the data
    grid_search.fit(X, y)

    # Get the best hyperparameters
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    return best_params, best_score

# Example usage:
best_lr_params, best_lr_score = logistic_regression_tuning(X_train_scaled, y_train)
print(f"Best Logistic Regression Score: {best_lr_score}")


Best Logistic Regression Score: 0.595


In [9]:
def svc_tuning(X, y):
    # Define the parameter grid for hyperparameter tuning
    param_grid = {
        'C': [0.1, 1, 10],  # Regularization parameter
        'kernel': ['linear', 'rbf', 'poly'],  # Kernel type
        'degree': [2, 3, 4],  # Degree of the polynomial kernel
        'gamma': ['scale', 'auto', 0.001, 0.01, 0.1]  # Kernel coefficient for 'rbf' and 'poly'
    }

    # Create an SVC model
    svc_classifier = SVC()

    # Create a GridSearchCV object to perform hyperparameter tuning
    grid_search = GridSearchCV(svc_classifier, param_grid, cv=3, scoring='accuracy', n_jobs=-1)

    # Fit the grid search to the data
    grid_search.fit(X, y)

    # Get the best hyperparameters
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    return best_params, best_score

# Example usage:
best_svc_params, best_svc_score = svc_tuning(X_train_scaled, y_train)
print(f"Best SVC score: {best_svc_score}")

Best SVC score: 0.8860416666666667


In [10]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

gnb = train_gaussian_nb(X_train_scaled, y_train)

# Calculate cross-validated scores for Gaussian Naive Bayes
gnb_scores = cross_val_score(gnb, X_train_scaled, y_train, cv=kf, scoring='accuracy')

# Standardize the test features
X_test_scaled = scaler.transform(X_test)

# Make predictions using the Gaussian Naive Bayes model
gnb_predictions = gnb.predict(X_test_scaled)

print("Mean accuracy of Gaussian Naive Bayes:", gnb_scores.mean())

# Create other models for ensemble
logistic_regression = LogisticRegression(penalty='l2', **best_lr_params, n_jobs=-1)
svc = SVC(probability=True)
random_forest = RandomForestClassifier(n_jobs=-1, class_weight='balanced', **best_rf_param)

# Build a Voting Classifier with the models
# ensemble_model = VotingClassifier(estimators=[
    # ('GaussianNB', gnb),
    # ('LogisticRegression', logistic_regression),
    # ('SVC', svc),
    # ('RandomForest', random_forest)
# ], voting='soft')  # 'soft' for probability-based voting

stack_model = StackingClassifier(
    estimators=[
        ('GaussianNB', gnb),
        ('LogisticRegression', logistic_regression),
        ('SVC', svc),
        ('RandomForest', random_forest)
    ],
    final_estimator=random_forest
)

ensemble_scores = cross_val_score(stack_model, X_train_scaled, y_train, cv=kf, scoring='accuracy', n_jobs=-1)

print("Mean accuracy of the ensemble model:", ensemble_scores.mean())

# Fit the ensemble model
# ensemble_model.fit(X_train_scaled, y_train)
stack_model.fit(X_train_scaled, y_train)

# Make predictions using the ensemble model
# ensemble_predictions = ensemble_model.predict(X_test_scaled)
stack_predictions = stack_model.predict(X_test_scaled)

# Update the predicted labels in the sample submission file for both models
sample_submission['label'] = gnb_predictions

# Save the modified sample submission file for the Gaussian Naive Bayes model
sample_submission.to_csv(os.path.join('output', 'gnb_sample_submission.csv'), index=False)

# Update the predicted labels in the sample submission file for the ensemble model
sample_submission['label'] = stack_predictions# ensemble_predictions

# Save the modified sample submission file for the ensemble model
sample_submission.to_csv(os.path.join('output', 'stack_sample_submission.csv'), index=False)



Mean accuracy of Gaussian Naive Bayes: 0.5897916666666667
Mean accuracy of the ensemble model: 0.8845833333333333
