In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import scipy.stats as stats
import plotly.graph_objects as go
import plotly.express as px
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc, accuracy_score
import plotly.express as px
import warnings

# Ignore all warnings
warnings.filterwarnings('ignore')

sns.set(style="whitegrid")
%matplotlib inline

In [None]:
# Step 1: import libraries

In [None]:
# Step 2: load dataset
# Only open the Colab upload widget when the CSV is not already in the working
# directory. This keeps re-runs from blocking on (or failing inside) the upload
# dialog and lets the notebook run where `google.colab` is not installed,
# provided the file has been placed next to the notebook.
import os

uploaded = {}
if not os.path.exists('student_depression_dataset.csv'):
    from google.colab import files
    uploaded = files.upload()

TypeError: 'NoneType' object is not subscriptable

In [None]:
# Read the dataset CSV from the working directory into the main DataFrame `df`.
df = pd.read_csv('student_depression_dataset.csv')


In [None]:
# Preview the first rows to sanity-check the load.
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# (rows, columns) of the raw dataset.
df.shape

In [None]:
# step 3 data cleaning and preprocessing

In [None]:
# Columns that hold labels rather than measurements
cat_cols = [
    'Gender',
    'City',
    'Profession',
    'Degree',
    'Have you ever had suicidal thoughts ?',
    'Family History of Mental Illness',
]

# Cast the target to int and the label columns to pandas 'category' in one pass
df = df.astype({'Depression': int, **{name: 'category' for name in cat_cols}})

# Inspect the raw values of the two columns that still need a cleaning decision
for column_name in ('Sleep Duration', 'Financial Stress'):
    print(f"Unique values in '{column_name}':", df[column_name].unique())

In [None]:
# Define a function to extract numeric hours from Sleep Duration column
def extract_hours(s):
    """Convert a sleep-duration label into a number of hours.

    Parameters
    ----------
    s : any
        Value from the 'Sleep Duration' column; it is converted with ``str``
        before parsing, so numbers and missing values are accepted.

    Returns
    -------
    float
        * the midpoint when the text contains a range such as ``"5-6 hours"``
          (-> 5.5), so a range no longer collapses onto its lower bound and
          becomes indistinguishable from e.g. ``"Less than 5 hours"``;
        * the first number found otherwise (``"More than 8 hours"`` -> 8.0);
        * ``np.nan`` when the text contains no number at all.
    """
    text = str(s)
    number = r"(\d+(?:\.\d+)?)"

    # "<low>-<high>" or "<low> to <high>": represent the interval by its midpoint
    span = re.search(number + r"\s*(?:-|to)\s*" + number, text)
    if span:
        return (float(span.group(1)) + float(span.group(2))) / 2

    # Otherwise fall back to the first number (including decimals), if any
    single = re.search(number, text)
    return float(single.group(1)) if single else np.nan

# Replace the sleep labels with the hours parsed by extract_hours, and store
# Financial Stress as a categorical column (it represents discrete levels)
df = df.assign(**{
    'Sleep Duration': df['Sleep Duration'].apply(extract_hours),
    'Financial Stress': df['Financial Stress'].astype('category'),
})

# Quick look at the two converted columns
converted_preview = df[['Sleep Duration', 'Financial Stress']].head()
print(converted_preview)

In [None]:
# Display missing values per column
# Number of NaN entries in each column
missing_values = df.isna().sum()
print("Missing values:\n", missing_values)

In [None]:
# Impute missing values with the column median.
# The filled Series is assigned back to the frame instead of calling
# fillna(inplace=True) on the `df[col]` selection: that chained in-place form
# operates on an intermediate object and does not update `df` when pandas
# copy-on-write is active (and emits a FutureWarning before that).
for col in ['Sleep Duration']:
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].median())

In [None]:
# Step 4: exploratory data analysis

# Bar chart of how many students fall into each Depression class
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x='Depression', palette="magma", ax=ax)
ax.set_title("Distribution of Depression among Students")
ax.set_xlabel("Depression (0 = No, 1 = Yes)")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Depression counts split by gender (one bar per gender and class)
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x='Gender', hue='Depression', palette="Set1", ax=ax)
ax.set_title("Depression Distribution by Gender")
ax.set_xlabel("Gender")
ax.set_ylabel("Count")
ax.legend(title="Depression")
plt.show()

In [None]:

# Numeric columns to inspect
num_features = [
    'Age', 'Academic Pressure', 'Work Pressure', 'CGPA',
    'Study Satisfaction', 'Job Satisfaction', 'Sleep Duration', 'Work/Study Hours',
]

# One 20-bin histogram per numeric column, laid out on a shared figure
df.hist(column=num_features, bins=20, figsize=(15, 10))
plt.tight_layout()
plt.show()

In [None]:
# The correlation matrix needs 'Sleep Duration' to have been converted to numbers;
# bail out with a message if the cleaning cell has not been run yet.
if not pd.api.types.is_numeric_dtype(df['Sleep Duration']):
    print("Error: 'Sleep Duration' column is not in a numerical format. Please run the data cleaning and preprocessing steps (specifically the cell converting 'Sleep Duration' to numbers) before calculating the correlation.")
else:
    num_cols = [
        'Age', 'Academic Pressure', 'Work Pressure', 'CGPA',
        'Study Satisfaction', 'Job Satisfaction', 'Sleep Duration', 'Work/Study Hours',
    ]
    corr_matrix = df[num_cols].corr()

    # Annotated heatmap of the pairwise correlations
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(corr_matrix, annot=True, cmap='magma', fmt=".2f", ax=ax)
    ax.set_title("Correlation Heatmap")
    plt.show()

In [None]:
# Interactive scatter of CGPA against study satisfaction, coloured by the target;
# Academic Pressure is shown on hover.
fig = px.scatter(
    data_frame=df,
    x="CGPA",
    y="Study Satisfaction",
    color="Depression",
    hover_data=['Academic Pressure'],
    title="CGPA vs Study Satisfaction by Depression",
)
fig.show()

In [None]:
# Statistical analysis: compare Academic Pressure between the two Depression classes
group_dep = df.loc[df['Depression'] == 1, 'Academic Pressure']
group_non_dep = df.loc[df['Depression'] == 0, 'Academic Pressure']

print("Depressed group size:", len(group_dep))
print("Non-depressed group size:", len(group_non_dep))

# Both tests are skipped unless each group has at least 3 observations
if min(len(group_dep), len(group_non_dep)) < 3:
    print("One of the groups does not have enough observations for statistical testing.")
else:
    # Independent two-sample t-test
    t_stat, p_val = stats.ttest_ind(group_dep, group_non_dep)
    print(f"T-test statistic: {t_stat:.3f}, p-value: {p_val:.3f}")

    # Mann-Whitney U test (rank-based alternative)
    u_stat, p_val_u = stats.mannwhitneyu(group_dep, group_non_dep)
    print(f"Mann-Whitney U test statistic: {u_stat:.3f}, p-value: {p_val_u:.3f}")

In [None]:
# Feature engineering: combined pressure score = academic + work pressure
df['Total Pressure'] = df['Academic Pressure'].add(df['Work Pressure'])

# Distribution of the new feature for each Depression class
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=df, x='Depression', y='Total Pressure', palette="coolwarm", ax=ax)
ax.set_title("Total Pressure by Depression Status")
ax.set_xlabel("Depression (0 = No, 1 = Yes)")
ax.set_ylabel("Total Pressure")
plt.show()

In [None]:
# Categorical predictors to expand into indicator columns (the target is not included)
cat_features = [
    'Gender',
    'City',
    'Profession',
    'Degree',
    'Have you ever had suicidal thoughts ?',
    'Family History of Mental Illness',
    'Financial Stress',
]

# One-hot encode, dropping the first level of each variable
data_encoded = pd.get_dummies(data=df, columns=cat_features, drop_first=True)

data_encoded.head()

In [None]:
# Machine learning model building
# Columns excluded from the modelling frame: the row id, the target, and the
# categorical survey columns listed below.
drop_cols = [
    'id',
    'Depression',
    'Have you ever had suicidal thoughts ?',
    'Family History of Mental Illness',
    'Gender',
    'City',
    'Profession',
    'Degree',
    'Financial Stress',
]
data_clean = df.drop(drop_cols, axis=1)

# One-hot encode whatever non-numeric columns remain (replaces the earlier data_encoded)
data_encoded = pd.get_dummies(data_clean, drop_first=True)

In [None]:
# Show the encoded column names (debugging aid)
print("Columns after encoding:", list(data_encoded.columns))

# Names of the original categorical variables; any encoded column whose name
# contains one of them is treated as a leftover dummy column and removed.
cat_keys = ["Have you ever had suicidal thoughts ?", "Family History of Mental Illness",
            "Gender", "City", "Profession", "Degree", "Financial Stress"]
dummy_cols = [name for name in data_encoded.columns
              if any(key in name for key in cat_keys)]

# Drop 'id' / 'Depression' only when present, plus the dummy columns found above
drop_cols = [name for name in ('id', 'Depression') if name in data_encoded.columns]
drop_cols += dummy_cols
X = data_encoded.drop(columns=drop_cols)

# Target: taken from the encoded frame when it still carries it, otherwise from df
y = data_encoded['Depression'] if 'Depression' in data_encoded.columns else df['Depression']

# Standardize the numeric feature columns
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

numerical_cols_in_X = X.select_dtypes(include=np.number).columns.tolist()

# Make sure the engineered 'Total Pressure' feature is scaled too when it is numeric
total_pressure_is_numeric = (
    'Total Pressure' in X.columns
    and pd.api.types.is_numeric_dtype(X['Total Pressure'])
)
if total_pressure_is_numeric and 'Total Pressure' not in numerical_cols_in_X:
    numerical_cols_in_X.append('Total Pressure')

# NOTE(review): the scaler is fitted on every row before the train/test split and
# before cross_val_score runs on X, so test-fold statistics leak into the features.
# Consider fitting it on the training data only (e.g. inside a Pipeline).
X[numerical_cols_in_X] = scaler.fit_transform(X[numerical_cols_in_X])

print("Feature matrix shape:", X.shape)

In [None]:
# Hold out 20% of the rows as the test set; random_state=42 fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Fit the logistic-regression model before using it. When the notebook is run
# top to bottom on a fresh kernel, `log_model` is not created until a later
# cell, so predicting here without fitting first raised NameError.
log_model = LogisticRegression(random_state=42)
log_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = log_model.predict(X_test)

# Print classification report
print("Logistic Regression Classification Report:\n")
print(classification_report(y_test, y_pred))

# Confusion matrix for Logistic Regression
cm_log = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 4))
sns.heatmap(cm_log, annot=True, fmt="d", cmap='Blues')
plt.title("Logistic Regression Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# ROC curve for Logistic Regression, from the predicted probability of class 1
y_prob_log = log_model.predict_proba(X_test)[:,1]
fpr_log, tpr_log, thresholds_log = roc_curve(y_test, y_prob_log)
roc_auc_log = auc(fpr_log, tpr_log)
plt.figure(figsize=(8,6))
plt.plot(fpr_log, tpr_log, label=f'Logistic Regression ROC curve (AUC = {roc_auc_log:.2f})', color='blue')
plt.plot([0,1],[0,1],'k--')  # chance-level diagonal
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - Logistic Regression')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Logistic Regression: train, then evaluate on the held-out test set
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns

log_model = LogisticRegression(random_state=42).fit(X_train, y_train)
print("Logistic Regression model trained successfully.")

# Per-class precision / recall / F1 on the test set
y_pred = log_model.predict(X_test)
print("Logistic Regression Classification Report:\n")
print(classification_report(y_test, y_pred))

# Confusion matrix
cm_log = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm_log, annot=True, fmt="d", cmap='Blues', ax=ax)
ax.set(title="Logistic Regression Confusion Matrix", xlabel="Predicted", ylabel="Actual")
plt.show()

# ROC curve built from the predicted probability of the positive class
y_prob_log = log_model.predict_proba(X_test)[:, 1]
fpr_log, tpr_log, thresholds_log = roc_curve(y_test, y_prob_log)
roc_auc_log = auc(fpr_log, tpr_log)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr_log, tpr_log, color='blue',
        label=f'Logistic Regression ROC curve (AUC = {roc_auc_log:.2f})')
ax.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
ax.set(xlabel='False Positive Rate', ylabel='True Positive Rate',
       title='ROC Curve - Logistic Regression')
ax.legend(loc="lower right")
plt.show()

In [None]:
# Random Forest (100 trees): train, then evaluate on the held-out test set
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_test)
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))

# Confusion matrix
cm_rf = confusion_matrix(y_test, y_pred_rf)
ax = sns.heatmap(cm_rf, annot=True, fmt="d", cmap='Greens')
ax.set(title="Random Forest Confusion Matrix", xlabel="Predicted", ylabel="Actual")
plt.show()

# ROC curve built from the predicted probability of the positive class
y_prob_rf = rf_model.predict_proba(X_test)[:, 1]
fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_test, y_prob_rf)
roc_auc_rf = auc(fpr_rf, tpr_rf)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr_rf, tpr_rf, color='green',
        label=f'Random Forest ROC curve (AUC = {roc_auc_rf:.2f})')
ax.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
ax.set(xlabel='False Positive Rate', ylabel='True Positive Rate',
       title='ROC Curve - Random Forest')
ax.legend(loc="lower right")
plt.show()

In [None]:

# 10-fold cross-validated accuracy for Logistic Regression.
# The printed labels now say "10-Fold" to match cv=10 (they previously said "5-Fold").
cv_scores_log = cross_val_score(log_model, X, y, cv=10, scoring='accuracy')
print("10-Fold CV Accuracy for Logistic Regression:", cv_scores_log)
print("Mean CV Accuracy:", cv_scores_log.mean())

# 10-fold cross-validated accuracy for Random Forest
cv_scores_rf = cross_val_score(rf_model, X, y, cv=10, scoring='accuracy')
print("10-Fold CV Accuracy for Random Forest:", cv_scores_rf)
print("Mean CV Accuracy:", cv_scores_rf.mean())

In [None]:
# Per-fold accuracy of the Random Forest cross-validation, with the mean as a dashed line.
# The title names the model so this figure can be told apart from the other CV plots.
plt.figure(figsize=(8,5))
plt.plot(range(1, 11), cv_scores_rf, marker='o', linestyle='-', color='b', label='Fold Accuracy')
plt.axhline(np.mean(cv_scores_rf), color='r', linestyle='--', label=f'Mean Accuracy = {np.mean(cv_scores_rf):.2f}')
plt.xticks(range(1, 11))
plt.xlabel("Fold")
plt.ylabel("Accuracy")
plt.title("10-Fold Cross Validation Results - Random Forest")
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Per-fold accuracy of the Logistic Regression cross-validation, with the mean as a dashed line.
# The title names the model so this figure can be told apart from the other CV plots.
plt.figure(figsize=(8,5))
plt.plot(range(1, 11), cv_scores_log, marker='o', linestyle='-', color='red', label='Fold Accuracy')
plt.axhline(np.mean(cv_scores_log), color='r', linestyle='--', label=f'Mean Accuracy = {np.mean(cv_scores_log):.2f}')
plt.xticks(range(1, 11))
plt.xlabel("Fold")
plt.ylabel("Accuracy")
plt.title("10-Fold Cross Validation Results - Logistic Regression")
plt.legend()
plt.grid(True)
plt.show()

In [None]:
from sklearn.svm import SVC

In [None]:
# Support Vector Machine: train, then evaluate on the held-out test set
svm_model = SVC(random_state=42).fit(X_train, y_train)

y_pred_svm = svm_model.predict(X_test)
print("SVM Classification Report:")
print(classification_report(y_test, y_pred_svm))

# Confusion matrix
cm_svm = confusion_matrix(y_test, y_pred_svm)
ax = sns.heatmap(cm_svm, annot=True, fmt="d", cmap='Blues')
ax.set(title="SVM Confusion Matrix", xlabel="Predicted", ylabel="Actual")
plt.show()

# No ROC curve here: SVC only provides predict_proba when constructed with
# probability=True, which makes training more expensive, so it is skipped
# for the default SVC.

In [None]:
# 10-fold cross-validated accuracy for the SVM
from sklearn.model_selection import cross_val_score

cv_scores_svm = cross_val_score(estimator=svm_model, X=X, y=y, scoring='accuracy', cv=10)
print("10-Fold CV Accuracy for SVM:", cv_scores_svm)
print("Mean CV Accuracy:", np.mean(cv_scores_svm))

# No stand-alone plot in this cell: the per-fold scores stay in cv_scores_svm
# so they can be drawn together with the other models' scores.

In [None]:
# Same 80/20 split with the same random_state as the earlier split cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Same 80/20 split with the same random_state as the earlier split cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Import Gradient Boosting Classifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix

# Train a Gradient Boosting model
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train, y_train)

print("Gradient Boosting Classifier model trained successfully.")

# Make predictions on the test set
y_pred_gb = gb_model.predict(X_test)

# Print classification report
print("\nGradient Boosting Classification Report:")
print(classification_report(y_test, y_pred_gb))

# Confusion matrix for Gradient Boosting
cm_gb = confusion_matrix(y_test, y_pred_gb)
plt.figure(figsize=(6, 4))
sns.heatmap(cm_gb, annot=True, fmt="d", cmap='Purples')
plt.title("Gradient Boosting Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# Cross-validation for Gradient Boosting Classifier
cv_scores_gb = cross_val_score(gb_model, X, y, cv=10, scoring='accuracy')
print("\n10-Fold CV Accuracy for Gradient Boosting Classifier:", cv_scores_gb)
print("Mean CV Accuracy:", cv_scores_gb.mean())

# Compare per-fold accuracy of the models cross-validated so far.
# The Decision Tree and AdaBoost scores (cv_scores_dt, cv_scores_adaboost) are
# first computed in later cells, so plotting them here raised NameError on a
# fresh top-to-bottom run; this figure is limited to SVM and Gradient Boosting.
plt.figure(figsize=(10, 6))
for model_name, scores, line_color, mean_color in [
    ('SVM', cv_scores_svm, 'blue', 'darkblue'),
    ('Gradient Boosting', cv_scores_gb, 'purple', 'darkviolet'),
]:
    plt.plot(range(1, 11), scores, marker='o', linestyle='-', color=line_color,
             label=f'{model_name} Fold Accuracy')
    plt.axhline(np.mean(scores), color=mean_color, linestyle='--',
                label=f'{model_name} Mean Accuracy = {np.mean(scores):.2f}')

plt.xticks(range(1, 11))
plt.xlabel("Fold")
plt.ylabel("Accuracy")
plt.title("10-Fold Cross Validation Results - Multiple Models")
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Decision Tree: train on the training split, then 10-fold cross-validate on all of X
from sklearn.tree import DecisionTreeClassifier

dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
print("Decision Tree model trained successfully.")

cv_scores_dt = cross_val_score(estimator=dt_model, X=X, y=y, scoring='accuracy', cv=10)
print("10-Fold CV Accuracy for Decision Tree:", cv_scores_dt)
print("Mean CV Accuracy:", np.mean(cv_scores_dt))

In [None]:
# AdaBoost (default base estimator): train on the training split,
# then 10-fold cross-validate on all of X
from sklearn.ensemble import AdaBoostClassifier

adaboost_model = AdaBoostClassifier(random_state=42).fit(X_train, y_train)
print("AdaBoost Classifier model trained successfully.")

cv_scores_adaboost = cross_val_score(estimator=adaboost_model, X=X, y=y, scoring='accuracy', cv=10)
print("10-Fold CV Accuracy for AdaBoost Classifier:", cv_scores_adaboost)
print("Mean CV Accuracy:", np.mean(cv_scores_adaboost))

In [None]:
# SVM: train on the training split, then 10-fold cross-validate on all of X
svm_model = SVC(random_state=42).fit(X_train, y_train)
print("SVM model trained successfully.")

cv_scores_svm = cross_val_score(estimator=svm_model, X=X, y=y, scoring='accuracy', cv=10)
print("10-Fold CV Accuracy for SVM:", cv_scores_svm)
print("Mean CV Accuracy:", np.mean(cv_scores_svm))

In [None]:
# Gradient Boosting: train, evaluate, cross-validate, then compare with the other models
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix

gb_model = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)
print("Gradient Boosting Classifier model trained successfully.")

# Test-set predictions and per-class metrics
y_pred_gb = gb_model.predict(X_test)
print("\nGradient Boosting Classification Report:")
print(classification_report(y_test, y_pred_gb))

# Confusion matrix
cm_gb = confusion_matrix(y_test, y_pred_gb)
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm_gb, annot=True, fmt="d", cmap='Purples', ax=ax)
ax.set(title="Gradient Boosting Confusion Matrix", xlabel="Predicted", ylabel="Actual")
plt.show()

# 10-fold cross-validated accuracy
cv_scores_gb = cross_val_score(gb_model, X, y, cv=10, scoring='accuracy')
print("\n10-Fold CV Accuracy for Gradient Boosting Classifier:", cv_scores_gb)
print("Mean CV Accuracy:", cv_scores_gb.mean())

# Per-fold accuracy (solid) and mean accuracy (dashed) for each model:
# (label prefix, scores, line colour, mean-line colour)
cv_series = [
    ('SVM', cv_scores_svm, 'blue', 'darkblue'),
    ('Decision Tree', cv_scores_dt, 'black', 'grey'),
    ('AdaBoost', cv_scores_adaboost, 'orange', 'darkorange'),
    ('Gradient Boosting', cv_scores_gb, 'purple', 'darkviolet'),
]
fold_numbers = range(1, 11)

plt.figure(figsize=(10, 6))
for model_name, scores, line_color, mean_color in cv_series:
    plt.plot(fold_numbers, scores, marker='o', linestyle='-', color=line_color,
             label=f'{model_name} Fold Accuracy')
    plt.axhline(np.mean(scores), color=mean_color, linestyle='--',
                label=f'{model_name} Mean Accuracy = {np.mean(scores):.2f}')

plt.xticks(fold_numbers)
plt.xlabel("Fold")
plt.ylabel("Accuracy")
plt.title("10-Fold Cross Validation Results - Multiple Models")
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Gradient Boosting: train, evaluate, cross-validate, then compare with the other models
# (this cell repeats the steps of the preceding Gradient Boosting cell)
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix

gb_model = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)
print("Gradient Boosting Classifier model trained successfully.")

# Test-set predictions and per-class metrics
y_pred_gb = gb_model.predict(X_test)
print("\nGradient Boosting Classification Report:")
print(classification_report(y_test, y_pred_gb))

# Confusion matrix
cm_gb = confusion_matrix(y_test, y_pred_gb)
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm_gb, annot=True, fmt="d", cmap='Purples', ax=ax)
ax.set(title="Gradient Boosting Confusion Matrix", xlabel="Predicted", ylabel="Actual")
plt.show()

# 10-fold cross-validated accuracy
cv_scores_gb = cross_val_score(gb_model, X, y, cv=10, scoring='accuracy')
print("\n10-Fold CV Accuracy for Gradient Boosting Classifier:", cv_scores_gb)
print("Mean CV Accuracy:", cv_scores_gb.mean())

# Per-fold accuracy (solid) and mean accuracy (dashed) for each model
fold_numbers = range(1, 11)
plt.figure(figsize=(10, 6))
for model_name, scores, line_color, mean_color in (
    ('SVM', cv_scores_svm, 'blue', 'darkblue'),
    ('Decision Tree', cv_scores_dt, 'black', 'grey'),
    ('AdaBoost', cv_scores_adaboost, 'orange', 'darkorange'),
    ('Gradient Boosting', cv_scores_gb, 'purple', 'darkviolet'),
):
    plt.plot(fold_numbers, scores, marker='o', linestyle='-', color=line_color,
             label=f'{model_name} Fold Accuracy')
    plt.axhline(np.mean(scores), color=mean_color, linestyle='--',
                label=f'{model_name} Mean Accuracy = {np.mean(scores):.2f}')

plt.xticks(fold_numbers)
plt.xlabel("Fold")
plt.ylabel("Accuracy")
plt.title("10-Fold Cross Validation Results - Multiple Models")
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Make predictions on the test set
y_pred_dt = dt_model.predict(X_test)

# Print classification report
print("Decision Tree Classification Report:")
print(classification_report(y_test, y_pred_dt))

# Confusion matrix for Decision Tree.
# Uses the canonical colormap name 'gray': the 'grey' spelling is only accepted
# as an alias by newer Matplotlib releases and raises ValueError on older ones.
cm_dt = confusion_matrix(y_test, y_pred_dt)
plt.figure(figsize=(6, 4))
sns.heatmap(cm_dt, annot=True, fmt="d", cmap='gray')
plt.title("Decision Tree Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

In [None]:
# 10-fold cross-validated accuracy for the SVM
cv_scores_svm = cross_val_score(estimator=svm_model, X=X, y=y, scoring='accuracy', cv=10)
print("10-Fold CV Accuracy for SVM:", cv_scores_svm)
print("Mean CV Accuracy:", np.mean(cv_scores_svm))

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Import Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier

# Train a Decision Tree model
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

print("Decision Tree model trained successfully.")

# Make predictions on the test set
y_pred_dt = dt_model.predict(X_test)

# Print classification report
print("Decision Tree Classification Report:")
print(classification_report(y_test, y_pred_dt))

# Confusion matrix for Decision Tree.
# Uses the canonical colormap name 'gray': the 'grey' spelling is only accepted
# as an alias by newer Matplotlib releases and raises ValueError on older ones.
cm_dt = confusion_matrix(y_test, y_pred_dt)
plt.figure(figsize=(6, 4))
sns.heatmap(cm_dt, annot=True, fmt="d", cmap='gray')
plt.title("Decision Tree Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

In [None]:
# Make predictions on the test set
y_pred_dt = dt_model.predict(X_test)

# Print classification report
print("Decision Tree Classification Report:")
print(classification_report(y_test, y_pred_dt))

# Confusion matrix for Decision Tree.
# Uses the canonical colormap name 'gray': the 'grey' spelling is only accepted
# as an alias by newer Matplotlib releases and raises ValueError on older ones.
cm_dt = confusion_matrix(y_test, y_pred_dt)
plt.figure(figsize=(6, 4))
sns.heatmap(cm_dt, annot=True, fmt="d", cmap='gray')
plt.title("Decision Tree Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

In [None]:
# Per-fold accuracy of the Decision Tree cross-validation, with the mean as a dashed line
fold_numbers = range(1, 11)
mean_accuracy_dt = np.mean(cv_scores_dt)

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(fold_numbers, cv_scores_dt, marker='o', linestyle='-', color='black',
        label='Fold Accuracy')
ax.axhline(mean_accuracy_dt, color='red', linestyle='--',
           label=f'Mean Accuracy = {mean_accuracy_dt:.2f}')
ax.set_xticks(fold_numbers)
ax.set(xlabel="Fold", ylabel="Accuracy",
       title="10-Fold Cross Validation Results - Decision Tree")
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# 10-fold cross-validated accuracy for the Decision Tree
from sklearn.model_selection import cross_val_score

cv_scores_dt = cross_val_score(estimator=dt_model, X=X, y=y, scoring='accuracy', cv=10)
print("10-Fold CV Accuracy for Decision Tree:", cv_scores_dt)
print("Mean CV Accuracy:", np.mean(cv_scores_dt))

# No stand-alone plot in this cell: the per-fold scores stay in cv_scores_dt
# so they can be drawn together with the other models' scores.

In [None]:
# AdaBoost (default base estimator): train, then evaluate on the held-out test set
from sklearn.ensemble import AdaBoostClassifier

adaboost_model = AdaBoostClassifier(random_state=42).fit(X_train, y_train)
print("AdaBoost Classifier model trained successfully.")

# Test-set predictions and per-class metrics
y_pred_adaboost = adaboost_model.predict(X_test)
print("AdaBoost Classification Report:")
print(classification_report(y_test, y_pred_adaboost))

# Confusion matrix
cm_adaboost = confusion_matrix(y_test, y_pred_adaboost)
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm_adaboost, annot=True, fmt="d", cmap='Oranges', ax=ax)
ax.set(title="AdaBoost Confusion Matrix", xlabel="Predicted", ylabel="Actual")
plt.show()

In [None]:
# Evaluate the already-fitted AdaBoost model on the held-out test set
y_pred_adaboost = adaboost_model.predict(X_test)
print("AdaBoost Classification Report:")
print(classification_report(y_test, y_pred_adaboost))

# Confusion matrix
cm_adaboost = confusion_matrix(y_test, y_pred_adaboost)
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm_adaboost, annot=True, fmt="d", cmap='Oranges', ax=ax)
ax.set(title="AdaBoost Confusion Matrix", xlabel="Predicted", ylabel="Actual")
plt.show()

In [None]:
# 10-fold cross-validated accuracy for AdaBoost
from sklearn.model_selection import cross_val_score

cv_scores_adaboost = cross_val_score(estimator=adaboost_model, X=X, y=y, scoring='accuracy', cv=10)
print("10-Fold CV Accuracy for AdaBoost Classifier:", cv_scores_adaboost)
print("Mean CV Accuracy:", np.mean(cv_scores_adaboost))

# No stand-alone plot in this cell: the per-fold scores stay in cv_scores_adaboost
# so they can be drawn together with the other models' scores.

In [None]:
# Gradient Boosting: train, evaluate, cross-validate, and plot its own CV results
from sklearn.ensemble import GradientBoostingClassifier

gb_model = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)
print("Gradient Boosting Classifier model trained successfully.")

# Test-set predictions and per-class metrics
y_pred_gb = gb_model.predict(X_test)
print("\nGradient Boosting Classification Report:")
print(classification_report(y_test, y_pred_gb))

# Confusion matrix
cm_gb = confusion_matrix(y_test, y_pred_gb)
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm_gb, annot=True, fmt="d", cmap='Purples', ax=ax)
ax.set(title="Gradient Boosting Confusion Matrix", xlabel="Predicted", ylabel="Actual")
plt.show()

# 10-fold cross-validated accuracy
cv_scores_gb = cross_val_score(gb_model, X, y, cv=10, scoring='accuracy')
print("\n10-Fold CV Accuracy for Gradient Boosting Classifier:", cv_scores_gb)
print("Mean CV Accuracy:", cv_scores_gb.mean())

# Per-fold accuracy with the mean as a dashed line
fold_numbers = range(1, 11)
mean_accuracy_gb = np.mean(cv_scores_gb)

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(fold_numbers, cv_scores_gb, marker='o', linestyle='-', color='purple',
        label='Fold Accuracy')
ax.axhline(mean_accuracy_gb, color='red', linestyle='--',
           label=f'Mean Accuracy = {mean_accuracy_gb:.2f}')
ax.set_xticks(fold_numbers)
ax.set(xlabel="Fold", ylabel="Accuracy",
       title="10-Fold Cross Validation Results - Gradient Boosting Classifier")
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# AdaBoost (default base estimator): fit on the training split
from sklearn.ensemble import AdaBoostClassifier

adaboost_model = AdaBoostClassifier(random_state=42).fit(X_train, y_train)
print("AdaBoost Classifier model trained successfully.")

In [None]:
# 10-fold cross-validated accuracy for AdaBoost
cv_scores_adaboost = cross_val_score(estimator=adaboost_model, X=X, y=y, scoring='accuracy', cv=10)
print("10-Fold CV Accuracy for AdaBoost Classifier:", cv_scores_adaboost)
print("Mean CV Accuracy:", np.mean(cv_scores_adaboost))

In [None]:
# Gradient Boosting: train, evaluate, cross-validate, then compare with the other models
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix

gb_model = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)
print("Gradient Boosting Classifier model trained successfully.")

# Test-set predictions and per-class metrics
y_pred_gb = gb_model.predict(X_test)
print("\nGradient Boosting Classification Report:")
print(classification_report(y_test, y_pred_gb))

# Confusion matrix
cm_gb = confusion_matrix(y_test, y_pred_gb)
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm_gb, annot=True, fmt="d", cmap='Purples', ax=ax)
ax.set(title="Gradient Boosting Confusion Matrix", xlabel="Predicted", ylabel="Actual")
plt.show()

# 10-fold cross-validated accuracy
cv_scores_gb = cross_val_score(gb_model, X, y, cv=10, scoring='accuracy')
print("\n10-Fold CV Accuracy for Gradient Boosting Classifier:", cv_scores_gb)
print("Mean CV Accuracy:", cv_scores_gb.mean())

# Per-fold accuracy (solid) and mean accuracy (dashed) for each model, keyed by label prefix
cv_by_model = {
    'SVM': (cv_scores_svm, 'blue', 'darkblue'),
    'Decision Tree': (cv_scores_dt, 'black', 'grey'),
    'AdaBoost': (cv_scores_adaboost, 'orange', 'darkorange'),
    'Gradient Boosting': (cv_scores_gb, 'purple', 'darkviolet'),
}
fold_numbers = range(1, 11)

plt.figure(figsize=(10, 6))
for model_name, (scores, line_color, mean_color) in cv_by_model.items():
    plt.plot(fold_numbers, scores, marker='o', linestyle='-', color=line_color,
             label=f'{model_name} Fold Accuracy')
    plt.axhline(np.mean(scores), color=mean_color, linestyle='--',
                label=f'{model_name} Mean Accuracy = {np.mean(scores):.2f}')

plt.xticks(fold_numbers)
plt.xlabel("Fold")
plt.ylabel("Accuracy")
plt.title("10-Fold Cross Validation Results - Multiple Models")
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# 10-fold cross-validated accuracy for Gradient Boosting
cv_scores_gb = cross_val_score(estimator=gb_model, X=X, y=y, scoring='accuracy', cv=10)
print("\n10-Fold CV Accuracy for Gradient Boosting Classifier:", cv_scores_gb)
print("Mean CV Accuracy:", np.mean(cv_scores_gb))