<a href="https://colab.research.google.com/github/The-Godfatherr/LAB-AIML/blob/main/Lab_11_Abhinav_Verma_E23CSEU1335.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import time

# Colab-only step: upload the dataset into the runtime (the recorded output
# shows Raisin_Dataset.xlsx being saved). The cells shown below load the file
# by path and do not read `uploaded`.
from google.colab import files
uploaded = files.upload()


Saving Raisin_Dataset.xlsx to Raisin_Dataset.xlsx


In [4]:
# Read the uploaded Excel workbook into a DataFrame.
# Adjust the path below if your uploaded file has a different name.
df = pd.read_excel('/content/Raisin_Dataset.xlsx')

# Quick sanity checks: preview the first rows, then count missing values per column.
preview = df.head()
print("First 5 rows:")
print(preview)

missing_per_column = df.isnull().sum()
print("\nCheck for null values:")
print(missing_per_column)

First 5 rows:
    Area  MajorAxisLength  MinorAxisLength  Eccentricity  ConvexArea  \
0  87524       442.246011       253.291155      0.819738       90546   
1  75166       406.690687       243.032436      0.801805       78789   
2  90856       442.267048       266.328318      0.798354       93717   
3  45928       286.540559       208.760042      0.684989       47336   
4  79408       352.190770       290.827533      0.564011       81463   

     Extent  Perimeter    Class  
0  0.758651   1184.040  Kecimen  
1  0.684130   1121.786  Kecimen  
2  0.637613   1208.575  Kecimen  
3  0.699599    844.162  Kecimen  
4  0.792772   1073.251  Kecimen  

Check for null values:
Area               0
MajorAxisLength    0
MinorAxisLength    0
Eccentricity       0
ConvexArea         0
Extent             0
Perimeter          0
Class              0
dtype: int64


In [5]:
# Encode the class labels as binary integers.
# The lookup falls back to the existing value, so re-running this cell after
# 'Class' has already been encoded leaves it untouched; a plain .map(dict)
# would turn every already-encoded label into NaN on a second run.
class_codes = {'Kecimen': 0, 'Besni': 1}
df['Class'] = df['Class'].map(lambda label: class_codes.get(label, label))

# Fail fast on anything that is not one of the two expected classes
# (typos, blanks, or NaNs left behind by an earlier non-idempotent run).
unexpected_labels = set(df['Class'].unique()) - set(class_codes.values())
if unexpected_labels:
    raise ValueError(
        f"Unexpected values in 'Class' column: {sorted(unexpected_labels, key=str)}"
    )

X = df.drop('Class', axis=1)
y = df['Class']

# Score every feature against the target with the chi-square statistic.
# NOTE(review): chi2 is intended for non-negative count/frequency-like features,
# and the scores below track raw feature magnitude (Area is ~1e5 while Extent
# is < 1), so the drop threshold is scale-dependent. Consider scaling features
# to a common range first, or using a scale-free score — confirm with the lab spec.
# NOTE(review): the selector is fitted on all rows before the train/test split,
# so test-set labels influence which feature is dropped.
chi_selector = SelectKBest(chi2, k='all')
chi_selector.fit(X, y)
scores = chi_selector.scores_
features = X.columns
feature_scores = pd.DataFrame({'Feature': features, 'Chi2_Score': scores})

print("Feature importance based on Chi-Square test:")
print(feature_scores.sort_values(by='Chi2_Score', ascending=False))

# Drop the lowest-scoring feature(s), but only if that score is below the threshold.
CHI2_DROP_THRESHOLD = 1
lowest_score = feature_scores['Chi2_Score'].min()
if lowest_score < CHI2_DROP_THRESHOLD:
    is_lowest = feature_scores['Chi2_Score'] == lowest_score
    drop_feature = feature_scores.loc[is_lowest, 'Feature'].values
    print(f"Dropping least important feature(s): {drop_feature}")
    X = X.drop(columns=drop_feature)
else:
    print("No feature dropped")


Feature importance based on Chi-Square test:
           Feature    Chi2_Score
4       ConvexArea  6.412753e+06
0             Area  6.097822e+06
6        Perimeter  2.563142e+04
1  MajorAxisLength  1.272952e+04
2  MinorAxisLength  2.234351e+03
3     Eccentricity  1.804260e+00
5           Extent  8.791728e-02
Dropping least important feature(s): ['Extent']


In [6]:
# Hold out 20% of the rows for testing; stratify on y so both splits keep the
# same class proportions, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [7]:
def evaluate_model(y_true, y_pred):
    """Print and return classification metrics for a set of predictions.

    Prints the confusion matrix followed by accuracy, precision, recall and
    F1 score (each formatted to four decimal places).

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        A tuple ``(accuracy, precision, recall, f1)``.
    """
    # Compute everything up front so nothing is printed if a metric call fails.
    matrix = confusion_matrix(y_true, y_pred)
    metric_values = (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred),
        recall_score(y_true, y_pred),
        f1_score(y_true, y_pred),
    )

    print(f"Confusion Matrix:\n{matrix}")
    metric_labels = ("Accuracy", "Precision", "Recall", "F1 Score")
    for label, value in zip(metric_labels, metric_values):
        print(f"{label}: {value:.4f}")

    return metric_values


In [8]:
print("AdaBoost Classifier with default parameters:")
ada = AdaBoostClassifier(random_state=42)

# Time only the fit. perf_counter() is a monotonic, high-resolution clock meant
# for measuring intervals; time.time() is wall-clock time and can jump if the
# system clock is adjusted mid-measurement.
start = time.perf_counter()
ada.fit(X_train, y_train)
end = time.perf_counter()
ada_time = end - start  # training time in seconds

# Evaluate on the held-out test split.
y_pred_ada = ada.predict(X_test)
ada_metrics = evaluate_model(y_test, y_pred_ada)


AdaBoost Classifier with default parameters:
Confusion Matrix:
[[82  8]
 [12 78]]
Accuracy: 0.8889
Precision: 0.9070
Recall: 0.8667
F1 Score: 0.8864


In [10]:
# Search space: ensemble size, learning rate, and depth of the base tree.
param_grid_ada = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1],
    'estimator': [DecisionTreeClassifier(max_depth=1), DecisionTreeClassifier(max_depth=3)]
}

# 5-fold cross-validated grid search on the training split, scored by accuracy.
grid_ada = GridSearchCV(
    AdaBoostClassifier(random_state=42),
    param_grid_ada,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_ada.fit(X_train, y_train)

print(f"Best parameters: {grid_ada.best_params_}")

# GridSearchCV is left at its default refit=True, so best_estimator_ has already
# been refit on all of X_train with the winning parameters. Fitting it a second
# time just to time it is redundant; refit_time_ records how long that refit took.
best_ada = grid_ada.best_estimator_
ada_tuned_time = grid_ada.refit_time_  # training time in seconds

# Evaluate the tuned model on the held-out test split.
y_pred_ada_tuned = best_ada.predict(X_test)
ada_tuned_metrics = evaluate_model(y_test, y_pred_ada_tuned)

Best parameters: {'estimator': DecisionTreeClassifier(max_depth=3), 'learning_rate': 0.01, 'n_estimators': 200}
Confusion Matrix:
[[85  5]
 [20 70]]
Accuracy: 0.8611
Precision: 0.9333
Recall: 0.7778
F1 Score: 0.8485


In [11]:
print("Gradient Boosting Classifier with default parameters:")
grad = GradientBoostingClassifier(random_state=42)

# Time only the fit. perf_counter() is a monotonic, high-resolution clock meant
# for measuring intervals; time.time() is wall-clock time and can jump if the
# system clock is adjusted mid-measurement.
start = time.perf_counter()
grad.fit(X_train, y_train)
end = time.perf_counter()
grad_time = end - start  # training time in seconds

# Evaluate on the held-out test split.
y_pred_grad = grad.predict(X_test)
grad_metrics = evaluate_model(y_test, y_pred_grad)


Gradient Boosting Classifier with default parameters:
Confusion Matrix:
[[78 12]
 [14 76]]
Accuracy: 0.8556
Precision: 0.8636
Recall: 0.8444
F1 Score: 0.8539


In [12]:
# Search space: number of boosting stages, learning rate, tree depth, and the
# fraction of training rows sampled for each stage.
param_grid_grad = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.5, 1],
    'max_depth': [3, 4, 5],
    'subsample': [0.8, 1.0]
}

# 5-fold cross-validated grid search on the training split, scored by accuracy.
grid_grad = GridSearchCV(
    GradientBoostingClassifier(random_state=42),
    param_grid_grad,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_grad.fit(X_train, y_train)

print(f"Best parameters: {grid_grad.best_params_}")

# GridSearchCV is left at its default refit=True, so best_estimator_ has already
# been refit on all of X_train with the winning parameters. Fitting it a second
# time just to time it is redundant; refit_time_ records how long that refit took.
best_grad = grid_grad.best_estimator_
grad_tuned_time = grid_grad.refit_time_  # training time in seconds

# Evaluate the tuned model on the held-out test split.
y_pred_grad_tuned = best_grad.predict(X_test)
grad_tuned_metrics = evaluate_model(y_test, y_pred_grad_tuned)


Best parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.8}
Confusion Matrix:
[[80 10]
 [16 74]]
Accuracy: 0.8556
Precision: 0.8810
Recall: 0.8222
F1 Score: 0.8506
