In [1]:
# Import pandas to create DataFrame 
import pandas as pd

#Ignoring warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Make DataFrame of the given data 
train = pd.read_csv('train.csv')
valid = pd.read_csv('valid.csv')

In [4]:
train.head()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_763,feature_764,feature_765,feature_766,feature_767,feature_768,label_1,label_2,label_3,label_4
0,0.031138,0.079892,0.157382,-0.014636,-0.051778,-0.021332,-0.073593,-0.005386,-0.212557,0.099683,...,-0.085248,-0.096007,-0.000766,0.021399,-0.041432,0.094806,45,,1,6
1,0.11304,0.175731,0.217741,-0.196254,-0.010129,-0.030586,0.067114,-0.072412,-0.239192,0.104741,...,-0.090283,-0.053885,-0.010967,0.062209,-0.122958,0.192949,45,,1,6
2,0.04857,0.091281,0.160776,-0.150937,0.020115,0.044117,-0.050092,-0.045661,-0.155332,0.117206,...,-0.021524,-0.008411,-0.006248,0.031468,-0.056915,0.154731,45,,1,6
3,0.039212,0.118388,0.173831,-0.096659,-0.008702,0.061298,0.008974,-0.003277,-0.065046,0.09548,...,-0.071936,-0.02312,-0.007812,0.0576,-0.121892,0.072796,45,,1,6
4,0.056019,0.170639,0.157917,-0.228605,-0.065965,-0.088732,-0.082243,-0.080568,-0.3415,0.14243,...,-0.155621,-0.079447,0.015316,0.127726,-0.151966,0.169634,45,,1,6


In [5]:
#Checking provided data
train.isnull().sum()

feature_1        0
feature_2        0
feature_3        0
feature_4        0
feature_5        0
              ... 
feature_768      0
label_1          0
label_2        480
label_3          0
label_4          0
Length: 772, dtype: int64

In [3]:
# splitting dependent & Independent features
y = train[["label_1","label_2","label_3","label_4"]]
y_valid = valid[["label_1","label_2","label_3","label_4"]] 
x = train.drop(y, axis=1)
x_valid = valid.drop(y_valid, axis=1)

# Model Checking & Hyper parameter tuning

For KNN, I used Grid Search to do hyper parameter tuning

In [4]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn.metrics import accuracy_score

def try_knn(train_x, train_y, vaid_x, valid_y):
    grid_params = { 'n_neighbors' : [3,5,7,15],
               'weights' : ['uniform','distance']}
    gs = GridSearchCV(KNeighborsClassifier(), grid_params, verbose = 1, cv=3, n_jobs = -1)
    g_res = gs.fit(train_x, train_y)
    print(f"Best Hyperparameters:",gs.best_params_)
    # Get the best k-NN model with the optimal hyperparameters
    best_knn = gs.best_estimator_
    # Evaluate the best model on the test data
    accuracy = best_knn.score(vaid_x, valid_y)
    print(f"Accuracy for KNN {accuracy}")      
        


Due to performance restriction of my machine, I do hyperparameter tuning manually for SVM.

In [5]:
from sklearn import svm

def try_svm(train_x, train_y, vaid_x, valid_y):
    clf = svm.SVC()
    clf.fit(train_x, train_y)
    X_valid_contiguous = np.ascontiguousarray(vaid_x)
    y_pred = clf.predict(X_valid_contiguous)
    print(f"Accuracy Score of SVM with default settings (RBF - exponential kernal) ", accuracy_score(valid_y, y_pred))
    
    clf = svm.LinearSVC(dual="auto")
    clf.fit(train_x, train_y)
    X_valid_contiguous = np.ascontiguousarray(vaid_x)
    y_pred = clf.predict(X_valid_contiguous)
    print(f"Accuracy Score of Linear SVM (one-vs-the-rest) ", accuracy_score(valid_y, y_pred))
    
    clf = svm.SVC(kernel='linear')
    clf.fit(train_x, train_y)
    X_valid_contiguous = np.ascontiguousarray(vaid_x)
    y_pred = clf.predict(X_valid_contiguous)
    print(f"Accuracy Score of SVM with linear kernal function ", accuracy_score(valid_y, y_pred))
    
    clf = svm.SVC(kernel='sigmoid')
    clf.fit(train_x, train_y)
    X_valid_contiguous = np.ascontiguousarray(vaid_x)
    y_pred = clf.predict(X_valid_contiguous)
    print(f"Accuracy Score of SVM with sigmoid kernal function ", accuracy_score(valid_y, y_pred))
    
    clf = svm.SVC(kernel='poly',degree =2)
    clf.fit(train_x, train_y)
    X_valid_contiguous = np.ascontiguousarray(vaid_x)
    y_pred = clf.predict(X_valid_contiguous)
    print(f"Accuracy Score of SVM with polynomial kernal function with degree 2 ", accuracy_score(valid_y, y_pred))
    
    clf = svm.SVC(kernel='poly',degree =2)
    clf.fit(train_x, train_y)
    X_valid_contiguous = np.ascontiguousarray(vaid_x)
    y_pred = clf.predict(X_valid_contiguous)
    print(f"Accuracy Score of SVM with polynomial kernal function with degree 3 ", accuracy_score(valid_y, y_pred))  

Logistic Regression Model
Checking Hyper Parameter tuning for Logistic Regression

In [6]:
from sklearn.linear_model import LogisticRegression

def try_logistic(train_x, train_y, vaid_x, valid_y):
    for i in ["lbfgs","newton-cg","sag","saga"]:
        logistic_regression = LogisticRegression(multi_class='multinomial', solver='lbfgs')
        logistic_regression.fit(train_x,train_y)
        y_pred = logistic_regression.predict(vaid_x)
        print(f"Accuracy Score of Logistic Regression with solver {i}", accuracy_score(valid_y, y_pred))

Naive Bayes 
Trying out different distributions

In [7]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import ComplementNB
from sklearn.naive_bayes import CategoricalNB
from sklearn.naive_bayes import BernoulliNB

def try_naive(train_x, train_y, vaid_x, valid_y):
    
    min_value = np.min(train_x)
    scaled_train_x = train_x - min_value
    scaled_vaid_x = vaid_x - min_value
    
    clf = GaussianNB()
    clf.fit(scaled_train_x, train_y)
    y_pred = clf.predict(scaled_vaid_x)
    print(f"Accuracy Score of Naive Bayes with Gaussian Naive Bayes", accuracy_score(valid_y, y_pred))
    
    clf = MultinomialNB()
    clf.fit(scaled_train_x, train_y)
    y_pred = clf.predict(vaid_x)
    print(f"Accuracy Score of Naive Bayes with MultinomialNB", accuracy_score(valid_y, y_pred))
    
    clf = ComplementNB()
    clf.fit(scaled_train_x, train_y)
    y_pred = clf.predict(scaled_vaid_x)
    print(f"Accuracy Score of Naive Bayes with ComplementNB", accuracy_score(valid_y, y_pred))
    
    clf = CategoricalNB()
    clf.fit(scaled_train_x, train_y)
    y_pred = clf.predict(scaled_vaid_x)
    print(f"Accuracy Score of Naive Bayes with CategoricalNB", accuracy_score(valid_y, y_pred))
    
    clf = BernoulliNB()
    clf.fit(scaled_train_x, train_y)
    y_pred = clf.predict(scaled_vaid_x)
    print(f"Accuracy Score of Naive Bayes with BernoulliNB", accuracy_score(valid_y, y_pred))  
    



DecisionTree Classifier

In [8]:
from sklearn.tree import DecisionTreeClassifier

def try_decisionTree(train_x, train_y, vaid_x, valid_y):
    # Initialize and train the Decision Tree classifier
    clf = DecisionTreeClassifier()
    clf.fit(train_x, train_y)
    y_pred = clf.predict(vaid_x)
    print(f"Accuracy Score of Decision Tree", accuracy_score(valid_y, y_pred))

Random Forest

In [9]:
from sklearn.ensemble import RandomForestClassifier

def try_random_forest(train_x, train_y, vaid_x, valid_y):
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(train_x, train_y)
    y_pred = clf.predict(vaid_x)
    print(f"Accuracy Score of Random Forest", accuracy_score(valid_y, y_pred))

XG Boost

In [184]:
#installing XG Boost
pip install xgboost

Collecting xgboost
  Downloading xgboost-2.0.0-py3-none-win_amd64.whl (99.7 MB)
Installing collected packages: xgboost
Successfully installed xgboost-2.0.0
Note: you may need to restart the kernel to use updated packages.




In [10]:
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder


def try_xgBoost(train_x, train_y, vaid_x, valid_y):
    # Create an XGBoost classifier for multi-class classification
    clf = xgb.XGBClassifier(
    objective='multi:softmax',  # Set the objective for multi-class classification
    num_class=len(np.unique(train_y)),  # Number of classes
    random_state=42
    )
    le = LabelEncoder()
    train_y = le.fit_transform(train_y)
    clf.fit(train_x, train_y)
    y_pred = clf.predict(vaid_x)
    y_pred = le.inverse_transform(y_pred)
    print(f"Accuracy Score of XG Boost", accuracy_score(valid_y, y_pred))

# Feature Engineering Using Mutual Info Classification

In [36]:
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest

def select_cols_using_mutual_info_regression(x,y,n) :
    selected_columns = SelectKBest(mutual_info_classif, k=n)
    selected_columns.fit(x, y)
    return x.columns[selected_columns.get_support()]

In [53]:
selected_cols = select_cols_using_mutual_info_regression(x,y['label_1'],300)

In [54]:
#After reducing rows, I got poor validation accuracy, So I decided to not to use this method.
#x = x[selected_cols]
#x_valid = x_valid[selected_cols]

# Label 1

In [13]:
#Doing Hyper parameter tuning for KNN & checking the best accuracy for label 1
try_knn(x,y["label_1"],x_valid,y_valid["label_1"])

Fitting 3 folds for each of 8 candidates, totalling 24 fits
Best Hyperparameters: {'n_neighbors': 5, 'weights': 'distance'}
Accuracy for KNN 0.764


In [14]:
#Checking SVM Values for Label 1
try_svm(x,y["label_1"],x_valid,y_valid["label_1"])

Accuracy Score of SVM with default settings (RBF - exponential kernal)  0.7733333333333333
Accuracy Score of Linear SVM (one-vs-the-rest)  0.9453333333333334
Accuracy Score of SVM with linear kernal function  0.8946666666666667
Accuracy Score of SVM with sigmoid kernal function  0.18533333333333332
Accuracy Score of SVM with polynomial kernal function with degree 2  0.696
Accuracy Score of SVM with polynomial kernal function with degree 3  0.696


In [15]:
#Checking Logistic Regression Values for Label1. It provided very good results. 
try_logistic(x,y["label_1"],x_valid,y_valid["label_1"])

Accuracy Score of Logistic Regression with solver lbfgs 0.8413333333333334
Accuracy Score of Logistic Regression with solver newton-cg 0.8413333333333334
Accuracy Score of Logistic Regression with solver sag 0.8413333333333334
Accuracy Score of Logistic Regression with solver saga 0.8413333333333334


In [16]:
#It gave bad results
try_naive(x,y["label_1"],x_valid,y_valid["label_1"])

Accuracy Score of Naive Bayes with Gaussian Naive Bayes 0.136
Accuracy Score of Naive Bayes with MultinomialNB 0.016
Accuracy Score of Naive Bayes with ComplementNB 0.1
Accuracy Score of Naive Bayes with CategoricalNB 0.03866666666666667
Accuracy Score of Naive Bayes with BernoulliNB 0.013333333333333334


In [11]:
try_decisionTree(x,y["label_1"],x_valid,y_valid["label_1"])

Accuracy Score of Decision Tree 0.29733333333333334


In [12]:
try_random_forest(x,y["label_1"],x_valid,y_valid["label_1"])

Accuracy Score of Random Forest 0.76


In [13]:
try_xgBoost(x,y["label_1"],x_valid,y_valid["label_1"])

Accuracy Score of XG Boost 0.832


When considering validation accuracy results of label 1, Linear SVM overperforms all. So it is better to use that.

# Label 3

In [17]:
try_knn(x,y["label_3"],x_valid,y_valid["label_3"])

Fitting 3 folds for each of 8 candidates, totalling 24 fits
Best Hyperparameters: {'n_neighbors': 7, 'weights': 'distance'}
Accuracy for KNN 0.984


In [18]:
try_logistic(x,y["label_3"],x_valid,y_valid["label_3"])

Accuracy Score of Logistic Regression with solver lbfgs 0.9933333333333333
Accuracy Score of Logistic Regression with solver newton-cg 0.9933333333333333
Accuracy Score of Logistic Regression with solver sag 0.9933333333333333
Accuracy Score of Logistic Regression with solver saga 0.9933333333333333


In [19]:
try_svm(x,y["label_3"],x_valid,y_valid["label_3"])

Accuracy Score of SVM with default settings (RBF - exponential kernal)  0.992
Accuracy Score of Linear SVM (one-vs-the-rest)  0.9946666666666667
Accuracy Score of SVM with linear kernal function  0.9933333333333333
Accuracy Score of SVM with sigmoid kernal function  0.7133333333333334
Accuracy Score of SVM with polynomial kernal function with degree 2  0.9893333333333333
Accuracy Score of SVM with polynomial kernal function with degree 3  0.9893333333333333


In [25]:
try_naive(x,y["label_3"],x_valid,y_valid["label_3"])

Accuracy Score of Naive Bayes with Gaussian Naive Bayes 0.46266666666666667
Accuracy Score of Naive Bayes with MultinomialNB 0.8106666666666666
Accuracy Score of Naive Bayes with ComplementNB 0.5906666666666667
Accuracy Score of Naive Bayes with CategoricalNB 0.8106666666666666
Accuracy Score of Naive Bayes with BernoulliNB 0.808


In [26]:
try_decisionTree(x,y["label_3"],x_valid,y_valid["label_3"])

Accuracy Score of Decision Tree 0.884


In [27]:
try_random_forest(x,y["label_3"],x_valid,y_valid["label_3"])

Accuracy Score of Random Forest 0.964


In [14]:
try_xgBoost(x,y["label_3"],x_valid,y_valid["label_3"])

Accuracy Score of XG Boost 0.988


When considering validation accuracy results of label 4, KNN, Logistic Regression and SVM  provided good accuracy. Since accuracies are close to 0. We can use Linear SVM

# Label 4

# Handling Class Imbalance using SMOTE 

In [20]:
y["label_4"].value_counts()

6     19938
2      1449
0       955
12      954
7       938
13      482
1       481
11      480
10      480
3       479
5       478
9       472
4       469
8       465
Name: label_4, dtype: int64

In [None]:
pip install imbalanced-learn


In [93]:
#Since 6 value affects class balance property. We decided to undersample it.
from imblearn.over_sampling  import SMOTE
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(x, y["label_4"])

In [94]:
y_resampled.value_counts()
#However resampled values gave poor performance. So I avoided using them.

6     19938
13    19938
4     19938
5     19938
1     19938
2     19938
7     19938
3     19938
0     19938
12    19938
9     19938
8     19938
11    19938
10    19938
Name: label_4, dtype: int64

In [21]:
try_knn(x,y["label_4"],x_valid,y_valid["label_4"])

Fitting 3 folds for each of 8 candidates, totalling 24 fits
Best Hyperparameters: {'n_neighbors': 7, 'weights': 'distance'}
Accuracy for KNN 0.8933333333333333


In [22]:
try_logistic(x,y["label_4"],x_valid,y_valid["label_4"])

Accuracy Score of Logistic Regression with solver lbfgs 0.8226666666666667
Accuracy Score of Logistic Regression with solver newton-cg 0.8226666666666667
Accuracy Score of Logistic Regression with solver sag 0.8226666666666667
Accuracy Score of Logistic Regression with solver saga 0.8226666666666667


In [23]:
try_svm(x,y["label_4"],x_valid,y_valid["label_4"])

Accuracy Score of SVM with default settings (RBF - exponential kernal)  0.776
Accuracy Score of Linear SVM (one-vs-the-rest)  0.8786666666666667
Accuracy Score of SVM with linear kernal function  0.852
Accuracy Score of SVM with sigmoid kernal function  0.668
Accuracy Score of SVM with polynomial kernal function with degree 2  0.7746666666666666
Accuracy Score of SVM with polynomial kernal function with degree 3  0.7746666666666666


In [21]:
try_naive(x,y["label_4"],x_valid,y_valid["label_4"])

Accuracy Score of Naive Bayes with Gaussian Naive Bayes 0.084
Accuracy Score of Naive Bayes with MultinomialNB 0.6053333333333333
Accuracy Score of Naive Bayes with ComplementNB 0.532
Accuracy Score of Naive Bayes with CategoricalNB 0.7093333333333334
Accuracy Score of Naive Bayes with BernoulliNB 0.7106666666666667


In [22]:
try_decisionTree(x,y["label_4"],x_valid,y_valid["label_4"])

Accuracy Score of Decision Tree 0.6453333333333333


In [23]:
try_random_forest(x,y["label_4"],x_valid,y_valid["label_4"])

Accuracy Score of Random Forest 0.7533333333333333


In [18]:
try_xgBoost(x,y["label_4"],x_valid,y_valid["label_4"])

Accuracy Score of XG Boost 0.8853333333333333


When considering validation accuracy results of label 4, KNN, XG Boost and SVM  provided good accuracy. We can use ensambled method consisting above classifiers to predict values for our test data. 

# Label 2

# Handling Missing Values 

Since i am handling each feature separately, I thought removing the rows which have empty values for label 2, is the best way to handle the missing values for label 2 as no artifical entries will not be added.

In [16]:
data_label_2 = train.dropna(subset=['label_2'])
valid_label_2 = valid.dropna(subset=['label_2'])
y_label_2 = data_label_2[["label_1","label_2","label_3","label_4"]] 
x_label_2 = data_label_2.drop(y_label_2, axis=1)
y_label_2_valid = valid_label_2[["label_1","label_2","label_3","label_4"]] 
x_label_2_valid = valid_label_2.drop(y_label_2_valid, axis=1)

In [25]:
try_knn(x_label_2,y_label_2["label_2"],x_label_2_valid,y_label_2_valid["label_2"])

Fitting 3 folds for each of 8 candidates, totalling 24 fits
Best Hyperparameters: {'n_neighbors': 3, 'weights': 'distance'}
Accuracy for KNN 0.7853260869565217


In [26]:
try_logistic(x_label_2,y_label_2["label_2"],x_label_2_valid,y_label_2_valid["label_2"])

Accuracy Score of Logistic Regression with solver lbfgs 0.6005434782608695
Accuracy Score of Logistic Regression with solver newton-cg 0.6005434782608695
Accuracy Score of Logistic Regression with solver sag 0.6005434782608695
Accuracy Score of Logistic Regression with solver saga 0.6005434782608695


In [16]:
try_svm(x_label_2,y_label_2["label_2"],x_label_2_valid,y_label_2_valid["label_2"])

Accuracy Score of SVM with default settings (RBF - exponential kernal)  0.6100543478260869
Accuracy Score of Linear SVM (one-vs-the-rest)  0.7554347826086957
Accuracy Score of SVM with linear kernal function  0.7377717391304348
Accuracy Score of SVM with sigmoid kernal function  0.1331521739130435
Accuracy Score of SVM with polynomial kernal function with degree 2  0.5176630434782609
Accuracy Score of SVM with polynomial kernal function with degree 3  0.5176630434782609


In [17]:
try_naive(x_label_2,y_label_2["label_2"],x_label_2_valid,y_label_2_valid["label_2"])

Accuracy Score of Naive Bayes with Gaussian Naive Bayes 0.12771739130434784
Accuracy Score of Naive Bayes with MultinomialNB 0.14266304347826086
Accuracy Score of Naive Bayes with ComplementNB 0.18478260869565216
Accuracy Score of Naive Bayes with CategoricalNB 0.16168478260869565
Accuracy Score of Naive Bayes with BernoulliNB 0.16032608695652173


In [18]:
try_decisionTree(x_label_2,y_label_2["label_2"],x_label_2_valid,y_label_2_valid["label_2"])

Accuracy Score of Decision Tree 0.29483695652173914


In [19]:
try_random_forest(x_label_2,y_label_2["label_2"],x_label_2_valid,y_label_2_valid["label_2"])

Accuracy Score of Random Forest 0.6875


In [17]:
try_xgBoost(x_label_2,y_label_2["label_2"],x_label_2_valid,y_label_2_valid["label_2"])

Accuracy Score of XG Boost 0.7989130434782609


Here KNN, Linear SVM, XG Boost models performed well beating other models. So here we can use ensamble of those models.

# Evaluating test data set

In [22]:
test = pd.read_csv('test.csv')

In [23]:
test = test.drop("ID", axis =1)

In [24]:
#Checking provided data
test.isnull().sum()

feature_1      0
feature_2      0
feature_3      0
feature_4      0
feature_5      0
              ..
feature_764    0
feature_765    0
feature_766    0
feature_767    0
feature_768    0
Length: 768, dtype: int64

In [25]:
X_test_contiguous = np.ascontiguousarray(test)

# Label 1 - Linear SVM Model

In [30]:
clf = svm.LinearSVC(dual="auto")
clf.fit(x, y["label_1"])
X_test_contiguous = np.ascontiguousarray(test)
y_pred_label_1 = clf.predict(X_test_contiguous)

# Label 3 - SVM Model

In [31]:
clf = svm.LinearSVC(dual="auto")
clf.fit(x, y["label_3"])
y_pred_label_3 = clf.predict(X_test_contiguous)

# Label 2 -  Ensamble Method of SVM, XG Boost, KNN 

In [40]:
from sklearn.ensemble import VotingClassifier

# Initialize the models
knn_model = KNeighborsClassifier(n_neighbors=3)
xgb_model = xgb.XGBClassifier(
    objective='multi:softmax',  # Set the objective for multi-class classification
    num_class=len(np.unique(y_label_2)),  # Number of classes
    random_state=42
    )

ensemble_model = VotingClassifier(estimators=[
    ('knn', knn_model),
    ('xgb', xgb_model)
], voting='hard')

le = LabelEncoder()
y_label_21 = le.fit_transform(y_label_2["label_2"])
ensemble_model.fit(x_label_2, y_label_21)
y_pred_21 = ensemble_model.predict(np.ascontiguousarray(x_label_2_valid))
y_pred_21 = le.inverse_transform(y_pred_21)
ensemble_accuracy = accuracy_score(y_label_2_valid["label_2"], y_pred_21)
print(ensemble_accuracy)
#ensemble_model.fit(x_label_2, train_y_encoded_2)
#ensemble_predictions = ensemble_model.predict(X_test_contiguous)

0.8233695652173914


In [42]:
y_pred_label_2 = ensemble_model.predict(X_test_contiguous)
y_pred_label_2 = le.inverse_transform(y_pred_label_2)

# Label 4 - Ensamble Method of SVM, Logistic Regression, KNN

Checking Using Validation data

In [54]:
from sklearn.ensemble import VotingClassifier

# Initialize the models
knn_model = KNeighborsClassifier(n_neighbors=3)
svm_model = svm.LinearSVC(dual="auto")
xgb_model = xgb.XGBClassifier(
    objective='multi:softmax',  # Set the objective for multi-class classification
    num_class=len(np.unique(y["label_4"])),  # Number of classes
    random_state=42
    )

ensemble_model = VotingClassifier(estimators=[
    ('knn', knn_model),
    ('svm', svm_model),
    ('xgb', xgb_model)
], voting='hard')

le = LabelEncoder()
y_label_21 = le.fit_transform(y["label_4"])
ensemble_model.fit(x, y_label_21)
y_pred_21 = ensemble_model.predict(np.ascontiguousarray(x_valid))
y_pred_21 = le.inverse_transform(y_pred_21)
ensemble_accuracy = accuracy_score(y_valid["label_4"], y_pred_21)
print(ensemble_accuracy)
#ensemble_model.fit(x_label_2, train_y_encoded_2)
#ensemble_predictions = ensemble_model.predict(X_test_contiguous)

0.8893333333333333


Since Ensamble accuracy is greater than individual accuracy, I decide to use this method for test data.

In [47]:
y_pred_label_4 = ensemble_model.predict(X_test_contiguous)
y_pred_label_4 = le.inverse_transform(y_pred_label_4)

In [48]:
combined_df = pd.concat([ pd.Series(y_pred_label_1,name='label_1'), pd.Series(y_pred_label_2,name='label_2'), pd.Series(y_pred_label_3,name='label_3'), pd.Series(y_pred_label_4,name='label_4')], axis=1)

# Reset the index and add an incrementing ID column
combined_df.reset_index(drop=True, inplace=True)
combined_df['ID'] = combined_df.index + 1

# Reorder the columns with 'ID' as the leftmost column
combined_df = combined_df[['ID'] + [col for col in combined_df.columns if col != 'ID']]


In [49]:
combined_df

Unnamed: 0,ID,label_1,label_2,label_3,label_4
0,1,26,22.0,0,2
1,2,18,25.0,1,6
2,3,16,30.0,1,6
3,4,7,27.0,1,6
4,5,58,29.0,0,6
...,...,...,...,...,...
739,740,20,24.0,1,6
740,741,35,24.0,1,2
741,742,54,27.0,1,6
742,743,38,32.0,1,12


In [50]:
# Specify the file path where you want to save the CSV file
file_path = "190199A_layer_11.csv"  # Replace 'your_file_name.csv' with the desired file name and path

# Use the to_csv method to save the DataFrame to a CSV file
combined_df.to_csv(file_path, index=False)  # Set index to False if you don't want to save the index column