In [None]:
# Colab version of this notebook: https://colab.research.google.com/drive/1s5YrG5AmJMdWHWErsPy39vQOOtGJkdlU?usp=sharing

In [None]:
# importing pandas library for data interpretation
import pandas as pd
# importing train_test_split function to split dataset into two parts: testing and training
from sklearn.model_selection import train_test_split
# StackingClassifier combines the predictions of multiple base classifiers through a meta-classifier (logistic regression here)
# RandomForestClassifier averages many decision trees to improve accuracy and reduce overfitting; GradientBoostingClassifier is the second base classifier
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, GradientBoostingClassifier
# using logistic regression as meta classifier
from sklearn.linear_model import LogisticRegression
# accuracy_score and classification_report are used to compute accuracy, precision, recall and F1 score
from sklearn.metrics import accuracy_score, classification_report


# Load the Dataset

In [2]:
# Read the DNA dataset into a DataFrame.
# The path must be a raw string (r'...'): in a normal string literal the "\U" of "C:\Users"
# starts a \UXXXXXXXX unicode escape and Python rejects the whole cell with a SyntaxError.
# NOTE(review): this is an absolute, machine-specific path - point it at your local copy of dna.csv.
data = pd.read_csv(r'C:\Users\HP\Downloads\Code\dna.csv')
data.head(10)
# The dataset contains DNA sequences described by 181 columns; the 'class' column gives the label of each sequence:
# class 1 represents: EI: Exon-Intron boundaries (donors)
# class 2 represents: IE: Intron-Exon boundaries (recipients)
# class 3 represents: Neither: neither exon nor intron
# The other 180 columns (A0..A179) encode 60 nucleotides, each with 3 binary indicator variables

Unnamed: 0,A0,A1,A2,A3,A4,A5,A6,A7,A8,A9,...,A171,A172,A173,A174,A175,A176,A177,A178,A179,class
0,0,1,0,0,0,0,1,0,0,0,...,1,0,0,0,0,1,1,0,0,3
1,0,0,1,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,1,0,3
2,0,0,1,0,0,1,0,1,0,0,...,0,1,0,0,0,1,0,0,1,3
3,0,0,0,0,0,0,0,1,0,0,...,0,0,1,0,0,1,0,0,1,1
4,0,1,0,0,0,0,0,1,0,0,...,0,1,0,0,1,0,1,0,0,2
5,0,1,0,0,0,0,0,0,1,1,...,0,0,1,0,1,0,0,1,0,2
6,0,0,1,1,0,0,0,0,1,0,...,1,0,0,0,0,1,0,0,1,1
7,1,0,0,1,0,0,0,0,1,0,...,1,0,0,1,0,0,1,0,0,3
8,0,0,0,1,0,0,0,0,1,0,...,1,0,0,1,0,0,0,0,1,3
9,0,0,0,0,0,1,1,0,0,0,...,1,0,0,0,1,0,1,0,0,3


# Explore and Preprocess Data

In [3]:
# Total number of cells in the frame (rows x columns); this is an element count, not the row count
data.size

576666

In [4]:
# Count the missing (NaN/None) entries of every column so gaps in the data are visible up front.
# Nothing is imputed or dropped here; the per-column totals are only printed for inspection.
MissingValues = data.isna().sum()
print("Missing values in each column:\n", MissingValues)


Missing values in each column:
 A0       0
A1       0
A2       0
A3       0
A4       0
        ..
A176     0
A177     0
A178     0
A179     0
class    0
Length: 181, dtype: int64


In [5]:
# The raw class labels are 1, 2, 3; shift them so they start from 0 (0, 1, 2).
# The guard keeps this cell idempotent: data['class'] is overwritten in place, so without it
# every re-run of the cell would subtract 1 again and push the labels to -1, 0, 1.
if data['class'].min() == 1:
    data['class'] = data['class'] - 1  # 1-based -> 0-based class labels

In [6]:
# Tally how many samples carry each adjusted class label and print the counts
class_counts = data['class'].value_counts()
print("Distribution of adjusted class labels:\n", class_counts)


Distribution of adjusted class labels:
 class
2    1654
0     767
1     765
Name: count, dtype: int64


# Split Data into Features and Target

In [7]:
# Target vector y: the 'class' column.
# Feature matrix X: every remaining column of the dataset (the frame itself is left untouched).
y = data['class']
X = data.drop(columns=['class'])


In [8]:
# Hold out 20% of the samples for testing and train on the remaining 80%;
# the fixed random_state makes the shuffle, and therefore the split, repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


# Define and Train Stacking Model


In [9]:
# Base (level-0) learners: a Random Forest and a Gradient Boosting classifier,
# each configured with 100 estimators and random_state=42
base_models = [
    (label, estimator_cls(n_estimators=100, random_state=42))
    for label, estimator_cls in (
        ('rf', RandomForestClassifier),
        ('gb', GradientBoostingClassifier),
    )
]

# Meta (level-1) learner that combines the base models' outputs
meta_model = LogisticRegression()

# Assemble the stack; cv=5 sets the cross-validation used when fitting the final estimator
stacking_model = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5,
)

# Fit the whole stack on the training split
stacking_model.fit(X_train, y_train)


In [22]:
# Predict the class of every test sample with the fitted stacking model
y_pred_stack = stacking_model.predict(X_test)

# Side-by-side table of the true and the predicted label, indexed like y_test
results_df = y_test.to_frame(name='Actual_Class').assign(Predicted_Class=y_pred_stack)

# Show the comparison table
print("Predictions for the testing data:")
print(results_df)


Predictions for the testing data:
      Actual_Class  Predicted_Class
1029             0                0
1001             0                0
785              2                2
411              2                2
1105             1                1
...            ...              ...
2623             2                2
693              0                0
2465             2                2
3022             2                2
1356             2                2

[638 rows x 2 columns]


In [11]:
# Predict the test-set labels with the trained stacking model
y_pred_stack = stacking_model.predict(X_test)

# Score the predictions: overall accuracy plus the per-class precision / recall / F1 report
stacking_accuracy = accuracy_score(y_test, y_pred_stack)
stacking_report = classification_report(y_test, y_pred_stack)

print("Accuracy with Stacking:", stacking_accuracy)
print("\nClassification Report with Stacking:\n", stacking_report)


Accuracy with Stacking: 0.9655172413793104

Classification Report with Stacking:
               precision    recall  f1-score   support

           0       0.94      0.98      0.96       153
           1       0.94      0.94      0.94       168
           2       0.99      0.97      0.98       317

    accuracy                           0.97       638
   macro avg       0.96      0.96      0.96       638
weighted avg       0.97      0.97      0.97       638

