In [44]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression


# Load the Dataset

In [45]:
data = pd.read_csv('/kaggle/input/splicejunction-gene-sequences-dataset/dna.csv')


# Explore and Preprocess Data

In [47]:
# Display the first few rows of the dataset to understand its structure
print(data.head())


   A0  A1  A2  A3  A4  A5  A6  A7  A8  A9  ...  A171  A172  A173  A174  A175  \
0   0   1   0   0   0   0   1   0   0   0  ...     1     0     0     0     0   
1   0   0   1   0   0   1   0   0   0   0  ...     0     0     0     1     0   
2   0   0   1   0   0   1   0   1   0   0  ...     0     1     0     0     0   
3   0   0   0   0   0   0   0   1   0   0  ...     0     0     1     0     0   
4   0   1   0   0   0   0   0   1   0   0  ...     0     1     0     0     1   

   A176  A177  A178  A179  class  
0     1     1     0     0      2  
1     0     0     1     0      2  
2     1     0     0     1      2  
3     1     0     0     1      0  
4     0     1     0     0      1  

[5 rows x 181 columns]


In [49]:
# Check for missing values in the dataset
missing_values = data.isnull().sum()
print("Missing values in each column:\n", missing_values)


Missing values in each column:
 A0       0
A1       0
A2       0
A3       0
A4       0
        ..
A176     0
A177     0
A178     0
A179     0
class    0
Length: 181, dtype: int64


In [50]:
# Since class labels start from 1, adjust them to start from 0 for compatibility with most ML algorithms
data['class'] = data['class'] - 1  # Adjusting class labels to start from 0

In [51]:
# Check the distribution of adjusted class labels
print("Distribution of adjusted class labels:\n", data['class'].value_counts())


Distribution of adjusted class labels:
 class
 1    1654
-1     767
 0     765
Name: count, dtype: int64


# Split Data into Features and Target

In [52]:
X = data.drop('class', axis=1)
y = data['class']


# Split Data into Training and Test Sets

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Define and Train Stacking Model


In [54]:
# Define base models
base_models = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42))
]

# Define meta-model
meta_model = LogisticRegression()

# Initialize StackingClassifier
stacking_model = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5)

# Fit the stacking model
stacking_model.fit(X_train, y_train)


# Make Predictions and Evaluate the Model python


In [55]:
# Make predictions with the stacking model
y_pred_stack = stacking_model.predict(X_test)

# Evaluate the model
print("Accuracy with Stacking:", accuracy_score(y_test, y_pred_stack))
print("\nClassification Report with Stacking:\n", classification_report(y_test, y_pred_stack))


Accuracy with Stacking: 0.9655172413793104

Classification Report with Stacking:
               precision    recall  f1-score   support

          -1       0.94      0.98      0.96       153
           0       0.94      0.94      0.94       168
           1       0.99      0.97      0.98       317

    accuracy                           0.97       638
   macro avg       0.96      0.96      0.96       638
weighted avg       0.97      0.97      0.97       638

