In [190]:
!pip install catboost



In [191]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [192]:
import warnings
warnings.filterwarnings('ignore')

In [193]:
url = "https://raw.githubusercontent.com/ProtikgoswamiCSE/dengue-fever-detection-in-Dhaka-city/refs/heads/main/dataset.csv"
df = pd.read_csv(url)

In [194]:
df

Unnamed: 0,Gender,Age,NS1,IgG,IgM,Area,AreaType,HouseType,District,Outcome
0,Female,45,0,0,0,Mirpur,Undeveloped,Building,Dhaka,0
1,Male,17,0,0,1,Chawkbazar,Developed,Building,Dhaka,0
2,Female,29,0,0,0,Paltan,Undeveloped,Other,Dhaka,0
3,Female,63,1,1,0,Motijheel,Developed,Other,Dhaka,1
4,Male,22,0,0,0,Gendaria,Undeveloped,Building,Dhaka,0
...,...,...,...,...,...,...,...,...,...,...
995,Female,16,1,1,0,New Market,Developed,Building,Dhaka,1
996,Male,41,1,1,0,Paltan,Undeveloped,Other,Dhaka,1
997,Male,45,0,0,1,Motijheel,Developed,Building,Dhaka,0
998,Female,19,1,1,1,Paltan,Undeveloped,Building,Dhaka,1


In [195]:
from sklearn.preprocessing import LabelEncoder # Import LabelEncoder here if not already imported at the top

le = LabelEncoder()
for column in ['Gender', 'Area', 'AreaType', 'HouseType', 'District']:
    df[column] = le.fit_transform(df[column])

In [196]:
# ফিচার এবং টার্গেট আলাদা করা
X = df.drop('Outcome', axis=1)  # ফিচার (Outcome বাদে)
y = df['Outcome']  # টার্গেট

In [197]:
# Missing values হ্যান্ডল করা
# নিউমেরিক কলামের জন্য মিন দিয়ে ফিল করা
X = X.fillna(X.mean())

In [198]:
# ফিচার সিলেকশন: উচ্চ কোরিলেশনযুক্ত ফিচার বাদ দেওয়া
corr_matrix = X.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.8)]
X = X.drop(to_drop, axis=1)

In [199]:
# Data preprocessing
# ডাটা প্রিপ্রসেসিং: ফিচার এবং টার্গেট আলাদা করা
X = df.iloc[:, :-1]  # ফিচার
y = df.iloc[:, -1]   # টার্গেট (ধরে নিচ্ছি শেষ কলাম টার্গেট)

In [200]:
# Missing values handle kora

# Identify numeric columns
numeric_cols = X.select_dtypes(include=np.number).columns

# Fill missing values only in numeric columns with their mean
X[numeric_cols] = X[numeric_cols].fillna(X[numeric_cols].mean())

# Non-numeric columns are not handled for missing values here.
# The next cell correctly addresses missing values in non-numeric columns using the mode.
# The current cell only needs to focus on numeric columns.

# You should also inspect the non-numeric columns for missing values
# and handle them appropriately (e.g., with mode or

In [201]:
# Missing values handle kora

# Identify numeric and non-numeric columns
numeric_cols = X.select_dtypes(include=np.number).columns
non_numeric_cols = X.select_dtypes(exclude=np.number).columns

# Fill missing values only in numeric columns with their mean
X[numeric_cols] = X[numeric_cols].fillna(X[numeric_cols].mean())

# Handle missing values in non-numeric columns (example: fill with mode or a constant)
for col in non_numeric_cols:
    if X[col].isnull().any():
        # Example: Fill with the mode (most frequent value)
        # Using try-except block to handle potential errors if the mode is NaN itself (unlikely for strings)
        try:
            mode_val = X[col].mode()[0]
            X[col] = X[col].fillna(mode_val)
        except IndexError:
            # Handle case where mode() is empty (e.g., all NaNs), although fillna handles this.
            # Or choose another strategy like filling with 'Unknown'
            X[col] = X[col].fillna('Unknown')


# Convert non-numeric columns to numeric using one-hot encoding
# This step should happen BEFORE calculating correlation
X = pd.get_dummies(X, drop_first=True) # drop_first=True avoids multicollinearity


# ফিচার সিলেকশন (গুরুত্বপূর্ণ ফিচার বেছে নেওয়া)
# কোরিলেশন চেক করে অপ্রয়োজনীয় ফিচার বাদ দেওয়া
corr_matrix = X.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.8)]
X = X.drop(to_drop, axis=1)

# The rest of the code remains the same from the original notebook.
# Data split kora (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Note: You no longer need the pd.get_dummies calls after splitting,
# as the one-hot encoding was already applied to the full X.
# However, you might still need to ensure consistent columns between train and test
# after splitting if some categories appear only in one split.
# The existing code to handle missing columns after splitting is still relevant.


# Ensure that the test set has the same columns as the training set after one-hot encoding
# This handles cases where a category might be present in train but not in test or vice-versa
# and ensures consistent feature sets for scaling and model training.
train_cols = X_train.columns
test_cols = X_test.columns

missing_in_test = set(train_cols) - set(test_cols)
for c in missing_in_test:
    X_test[c] = 0

missing_in_train = set(test_cols) - set(train_cols)
# This case is less likely after encoding the full X, but keeping it is safer
for c in missing_in_train:
     X_train[c] = 0 # Or handle appropriately if a column exists in test but not train

# Ensure the column order is the same
X_test = X_test[train_cols]


In [202]:
# Data split করা (80% ট্রেনিং, 20% টেস্ট, stratify যোগ করে)
# stratify ব্যবহার করা হচ্ছে যাতে ক্লাস ব্যালেন্স থাকে
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [203]:
# Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [204]:
# Model evaluation function with cross-validation
# মডেল ইভালুয়েশন ফাংশন, ক্রস-ভ্যালিডেশন সহ
def evaluate_model(model, name):
    # ট্রেনিং এবং টেস্ট সেটে পারফরম্যান্স
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    # 5-fold ক্রস-ভ্যালিডেশন
    cv_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')

    print(f"\n{name} Model Results:")
    print(f"Test Accuracy: {accuracy:.4f}")
    print(f"Test Precision: {precision:.4f}")
    print(f"Test Recall: {recall:.4f}")
    print(f"Test F1-Score: {f1:.4f}")
    print(f"Cross-Validation Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

In [205]:
# 1. Logistic Regression Model
# সহজ এবং লিনিয়ার সম্পর্কের জন্য ভালো মডেল
log_reg = LogisticRegression(random_state=42, max_iter=1000, C=0.1)
evaluate_model(log_reg, "Logistic Regression")


Logistic Regression Model Results:
Test Accuracy: 0.9850
Test Precision: 0.9855
Test Recall: 0.9850
Test F1-Score: 0.9850
Cross-Validation Accuracy: 0.9860 (+/- 0.0194)


In [206]:
# 2. Decision Tree Model
# ডাটাকে গাছের মতো সিদ্ধান্ত নিয়ে ক্লাসিফাই করে
dt = DecisionTreeClassifier(random_state=42, max_depth=5, min_samples_split=10)
evaluate_model(dt, "Decision Tree")


Decision Tree Model Results:
Test Accuracy: 0.9750
Test Precision: 0.9751
Test Recall: 0.9750
Test F1-Score: 0.9750
Cross-Validation Accuracy: 0.9850 (+/- 0.0179)


In [207]:
# 3. Random Forest Model
# অনেকগুলো ডিসিশন ট্রি একসাথে কাজ করে, ওভারফিটিং কমায়
rf = RandomForestClassifier(random_state=42, n_estimators=100, max_depth=5, min_samples_split=10)
evaluate_model(rf, "Random Forest")


Random Forest Model Results:
Test Accuracy: 0.9850
Test Precision: 0.9855
Test Recall: 0.9850
Test F1-Score: 0.9850
Cross-Validation Accuracy: 0.9860 (+/- 0.0194)


In [208]:
# 4. K-Nearest Neighbors (KNN) Model
# প্রতিবেশী পয়েন্টের উপর ভিত্তি করে ক্লাসিফিকেশন
knn = KNeighborsClassifier(n_neighbors=5)
evaluate_model(knn, "K-Nearest Neighbors")


K-Nearest Neighbors Model Results:
Test Accuracy: 0.9850
Test Precision: 0.9855
Test Recall: 0.9850
Test F1-Score: 0.9850
Cross-Validation Accuracy: 0.5990 (+/- 0.0595)


In [209]:
# 5. Support Vector Machine (SVM) Model
# হাইপারপ্লেন তৈরি করে ডাটা ক্লাসিফাই করে
svm = SVC(random_state=42, C=0.1, kernel='rbf')
evaluate_model(svm, "Support Vector Machine")


Support Vector Machine Model Results:
Test Accuracy: 0.9850
Test Precision: 0.9855
Test Recall: 0.9850
Test F1-Score: 0.9850
Cross-Validation Accuracy: 0.5330 (+/- 0.0049)


In [210]:
# 6. Naive Bayes Model
# প্রোবাবিলিটির উপর ভিত্তি করে, দ্রুত এবং সহজ
nb = GaussianNB()
evaluate_model(nb, "Naive Bayes")


Naive Bayes Model Results:
Test Accuracy: 0.9850
Test Precision: 0.9855
Test Recall: 0.9850
Test F1-Score: 0.9850
Cross-Validation Accuracy: 0.9860 (+/- 0.0194)


In [211]:
# 7. Gradient Boosting Model
# ধীরে ধীরে শিখে ভুল কমায়, শক্তিশালী মডেল
gb = GradientBoostingClassifier(random_state=42, learning_rate=0.01, n_estimators=200, max_depth=3)
evaluate_model(gb, "Gradient Boosting")


Gradient Boosting Model Results:
Test Accuracy: 0.9850
Test Precision: 0.9855
Test Recall: 0.9850
Test F1-Score: 0.9850
Cross-Validation Accuracy: 0.9860 (+/- 0.0194)


In [212]:
# 8. XGBoost Model
# দ্রুত এবং উন্নত বুস্টিং মডেল
xgb = XGBClassifier(random_state=42, learning_rate=0.01, n_estimators=200, max_depth=3, eval_metric='logloss')
evaluate_model(xgb, "XGBoost")


XGBoost Model Results:
Test Accuracy: 0.9850
Test Precision: 0.9855
Test Recall: 0.9850
Test F1-Score: 0.9850
Cross-Validation Accuracy: 0.9860 (+/- 0.0194)


In [213]:
# 9. LightGBM Model
# দ্রুত এবং কম মেমোরি ব্যবহার করে, ওভারফিটিং কমানোর জন্য প্যারামিটার টিউন করা
lgbm = LGBMClassifier(random_state=42, max_depth=5, learning_rate=0.01, n_estimators=200, min_child_samples=50, num_leaves=15)
evaluate_model(lgbm, "LightGBM")

[LightGBM] [Info] Number of positive: 426, number of negative: 374
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000051 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 112
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.532500 -> initscore=0.130184
[LightGBM] [Info] Start training from score 0.130184
[LightGBM] [Info] Number of positive: 427, number of negative: 373
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000140 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 106
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.533750 -> initscore=0.135206
[LightGBM] [Info] Sta

In [214]:
# 10. CatBoost Model
# ক্যাটেগরিকাল ফিচারের জন্য ভালো, স্বয়ংক্রিয় ফিচার হ্যান্ডলিং
catboost = CatBoostClassifier(random_state=42, verbose=0, depth=5, learning_rate=0.01, iterations=200)
evaluate_model(catboost, "CatBoost")


CatBoost Model Results:
Test Accuracy: 0.9850
Test Precision: 0.9855
Test Recall: 0.9850
Test F1-Score: 0.9850
Cross-Validation Accuracy: 0.9860 (+/- 0.0194)


In [215]:
# 11. AdaBoost Model
# দুর্বল মডেলকে একত্রিত করে শক্তিশালী মডেল তৈরি
adaboost = AdaBoostClassifier(random_state=42, n_estimators=100, learning_rate=0.1)
evaluate_model(adaboost, "AdaBoost")


AdaBoost Model Results:
Test Accuracy: 0.9850
Test Precision: 0.9855
Test Recall: 0.9850
Test F1-Score: 0.9850
Cross-Validation Accuracy: 0.9860 (+/- 0.0194)


In [216]:
# 12. ANN (Artificial Neural Network) / MLP (Multi-Layer Perceptron)
# নিউরাল নেটওয়ার্ক, জটিল প্যাটার্ন শিখতে পারে
mlp = MLPClassifier(hidden_layer_sizes=(50,), max_iter=500, random_state=42, learning_rate_init=0.01)
evaluate_model(mlp, "MLP (Artificial Neural Network)")


MLP (Artificial Neural Network) Model Results:
Test Accuracy: 0.9750
Test Precision: 0.9751
Test Recall: 0.9750
Test F1-Score: 0.9750
Cross-Validation Accuracy: 0.9860 (+/- 0.0194)


In [None]:
# 13. DNN (Deep Neural Network) using TensorFlow
# গভীর নিউরাল নেটওয়ার্ক, আরো শক্তিশালী
dnn = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])
dnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
dnn.fit(X_train, y_train, epochs=30, batch_size=32, verbose=0)
y_pred_dnn = (dnn.predict(X_test) > 0.5).astype(int)
print("\nDNN (Deep Neural Network) Results:")
print(f"Test Accuracy: {accuracy_score(y_test, y_pred_dnn):.4f}")
print(f"Test Precision: {precision_score(y_test, y_pred_dnn, average='weighted'):.4f}")
print(f"Test Recall: {recall_score(y_test, y_pred_dnn, average='weighted'):.4f}")
print(f"Test F1-Score: {f1_score(y_test, y_pred_dnn, average='weighted'):.4f}")

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step

In [None]:
# 14. AutoEncoder for Feature Extraction + Logistic Regression
# ফিচার কম্প্রেশনের জন্য অটোএনকোডার, তারপর ক্লাসিফিকেশন
input_dim = X_train.shape[1]
autoencoder = Sequential([
    Input(shape=(input_dim,)),
    Dense(16, activation='relu'),
    Dense(8, activation='relu'),
    Dense(16, activation='relu'),
    Dense(input_dim, activation='sigmoid')
])
autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.fit(X_train, X_train, epochs=30, batch_size=32, verbose=0)

In [None]:
# Encoder part for feature extraction
encoder = Sequential(autoencoder.layers[:2])
X_train_encoded = encoder.predict(X_train)
X_test_encoded = encoder.predict(X_test)

In [None]:
# Logistic Regression on encoded features
log_reg_auto = LogisticRegression(random_state=42, max_iter=1000, C=0.1)
evaluate_model(log_reg_auto, "AutoEncoder + Logistic Regression")

In [None]:
import pandas as pd
import io

data = """Feature,Importance
feature_1,100
feature_2,80
feature_3,50
"""

df_feature_importance = pd.read_csv(io.StringIO(data))
print(df_feature_importance)

In [None]:
df_feature_importance = df_feature_importance[df_feature_importance['Importance'] > 0]
df_feature_importance.to_csv('feature_importance_filtered.csv', index=False)

In [None]:
# ফিচার ইম্পর্ট্যান্স প্লট করা (LightGBM-এর জন্য)
# Ensure this block is executed *after* the LightGBM model is trained in evaluate_model
if hasattr(lgbm, 'feature_importances_') and not X.empty:
    plt.figure(figsize=(10, 6))
    feature_importance_df = pd.DataFrame({
        'Feature': X.columns, # Again, assumes X.columns matches the trained model's features
        'Importance': lgbm.feature_importances_
    }).sort_values(by='Importance', ascending=False)
    plt.bar(feature_importance_df['Feature'], feature_importance_df['Importance'])
    plt.xticks(rotation=90, fontsize=8)
    plt.title("Feature Importance (LightGBM)")
    plt.tight_layout()
    filename = 'feature_importance_lightgbm.png'
    plt.savefig(filename)
    plt.show()
    try:
        from google.colab import files
        files.download(filename)
    except ImportError:
        print(f"PNG ফাইল আপনার কারেন্ট ডিরেক্টরিতে সেভ হয়েছে।")
    plt.close()
else:
    print("LightGBM model not trained or feature importances not available for plotting.")

In [None]:
# prompt: feature_importance_Logistic_Regression ar valu print

# Assuming log_reg has been trained and the feature names (X.columns) are available.
# The coefficients for Logistic Regression are stored in model.coef_.
# For binary classification, coef_ is a 1D array.
# For multi-class classification, coef_ is a 2D array (n_classes, n_features).
# Here, assuming binary classification based on the previous code setting up the target 'Outcome'.

# Ensure the Logistic Regression model is trained before accessing coef_
# If you just ran the evaluate_model('Logistic Regression') function, the model is trained inside it.
# To access the trained model outside the function, you would need to train it directly or modify the function to return the model.
# Let's train it directly here to ensure we have the object.
log_reg = LogisticRegression(random_state=42, max_iter=1000, C=0.1)
log_reg.fit(X_train, y_train)

# Get the coefficients
coefficients = log_reg.coef_[0] # Access the first row for binary classification

# Get the feature names from the columns of the processed data X
feature_names = X.columns

# Create a DataFrame to store feature importance (absolute value of coefficients)
feature_importance_lr = pd.DataFrame({
    'Feature': feature_names,
    'Importance': np.abs(coefficients)
})

# Sort by importance
feature_importance_lr = feature_importance_lr.sort_values(by='Importance', ascending=False)

# Print the feature importance DataFrame
print("\nFeature Importance for Logistic Regression:")
print(feature_importance_lr)

# Optional: Save the feature importance to a CSV file
filename_lr = 'feature_importance_logistic_regression.csv'
feature_importance_lr.to_csv(filename_lr, index=False)
print(f"\nLogistic Regression ফিচার ইম্পর্ট্যান্স '{filename_lr}' ফাইলে সেভ হয়েছে।")

try:
    # Only download if running in Colab
    if 'google.colab' in str(get_ipython()):
         files.download(filename_lr)
except ImportError:
    print(f"CSV ফাইল আপনার কারেন্ট ডিরেক্টরিতে সেভ হয়েছে।")

# You can also visualize this if desired, similar to the LightGBM plotting code.
# import matplotlib.pyplot as plt
# plt.figure(figsize=(10, 6))
# plt.bar(feature_importance_lr['Feature'], feature_importance_lr['Importance'])
# plt.xticks(rotation=90, fontsize=8)
# plt.title("Feature Importance (Logistic Regression - Absolute Coefficients)")
# plt.tight_layout()
# plt.show()
# plt.close()

