In [11]:
# Step 1: Import libraries and Load dataset
import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier

# Load dataset
df = pd.read_csv("../datasets/Indian_Kids_Screen_Time.csv")

# Drop the 'Health_Impacts' column as it's complex and not suitable for this model.
# errors="ignore" keeps this a no-op when the column is absent, and rebinding `df`
# (instead of inplace=True) gives later assignments a fresh, unambiguous frame.
df = df.drop(columns="Health_Impacts", errors="ignore")

# Report whether any column contains missing values; nothing is imputed in this cell.
print(f"Missing values in any column: {df.isnull().any().any()}\n")


# Encode categorical variables as integer codes.
categorical_cols = ['Gender', 'Primary_Device', 'Urban_or_Rural']
for col in categorical_cols:
    if col in df.columns:
        label_enc = LabelEncoder()
        # Plain column assignment replaces the column, so the codes get an integer
        # dtype; `df.loc[:, col] = ...` may instead write into the existing column
        # and keep its original dtype, depending on the pandas version.
        df[col] = label_enc.fit_transform(df[col])
        print(f"Encoded column: '{col}'")
    else:
        print(f"Warning: Column '{col}' not found. Skipping encoding.")


# Define the target variable
target_col = 'Exceeded_Recommended_Limit'

if target_col in df.columns:
    # Split features and target (X stays unscaled)
    X = df.drop(columns=target_col)
    y = df[target_col]

    if len(X) > 1:
        # Split BEFORE scaling: fitting the scaler on the full dataset would let
        # test-set means/variances leak into the training features.
        # Stratify to keep the same proportion of target classes in train and test sets.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        # Fit the scaler on the training rows only, then apply it to both parts.
        # Results are wrapped back into DataFrames so column names and the row
        # index (which still lines up with y_train / y_test) are preserved.
        scaler = StandardScaler()
        X_train = pd.DataFrame(
            scaler.fit_transform(X_train), columns=X.columns, index=X_train.index
        )
        X_test = pd.DataFrame(
            scaler.transform(X_test), columns=X.columns, index=X_test.index
        )

        print("\n✅ Data Preprocessing Done")
        print("Train shape:", X_train.shape, "Test shape:", X_test.shape)
    else:
        print("Error: Not enough data to perform train-test split.")
else:
    print(f"Error: Target column '{target_col}' not found in the dataset.")

Missing values in any column: False

Encoded column: 'Gender'
Encoded column: 'Primary_Device'
Encoded column: 'Urban_or_Rural'

✅ Data Preprocessing Done
Train shape: (7769, 6) Test shape: (1943, 6)


In [12]:
# Step 2: Data Preprocessing (Refined)
#
# Label-encodes the categorical columns of `df` in place, then splits into
# train/test and standardises the features with a scaler fitted on train only.
# NOTE(review): the encoders are fitted on whatever `df` currently holds. If an
# earlier cell has already label-encoded these columns, `label_encoders` maps
# integer codes to themselves rather than the original category strings — verify
# before relying on `inverse_transform`.

categorical_cols = ['Gender', 'Primary_Device', 'Urban_or_Rural']
label_encoders = {}

for col in categorical_cols:
    # Guard first: report and skip columns the frame does not have.
    if col not in df.columns:
        print(f"Warning: Column '{col}' not found. Skipping encoding.")
        continue

    le = LabelEncoder()
    df.loc[:, col] = le.fit_transform(df[col])
    label_encoders[col] = le  # kept so the mapping can be looked up later
    print(f"Encoded column: '{col}'")

# Name of the label column to predict
target_col = 'Exceeded_Recommended_Limit'

if target_col not in df.columns:
    print(f"Error: Target column '{target_col}' not found in the dataset.")
else:
    # Everything except the target is a feature.
    X = df.drop(columns=target_col)
    y = df[target_col]

    if len(X) <= 1:
        print("Error: Not enough data to perform train-test split.")
    else:
        # Stratified 80/20 split, done before scaling.
        X_train, X_test, y_train, y_test = train_test_split(
            X,
            y,
            test_size=0.2,
            random_state=42,
            stratify=y,
        )

        # Learn scaling statistics from the training part only, reuse them for test.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        print("\n✅ Data preprocessing completed!")
        print("Training set shape:", X_train.shape)
        print("Test set shape:", X_test.shape)

Encoded column: 'Gender'
Encoded column: 'Primary_Device'
Encoded column: 'Urban_or_Rural'

✅ Data preprocessing completed!
Training set shape: (7769, 6)
Test set shape: (1943, 6)


In [13]:
# --- Step 3: Implement Random Forest Classifier ---
# Fits a 100-tree random forest on the prepared training split and prints
# accuracy, a per-class report and the confusion matrix for the test split.
# NOTE(review): the recorded run of this cell scored 1.0 accuracy; a perfect
# score may mean the target can be derived directly from a feature — verify.

print("--- Training Random Forest Classifier ---")

# The split is produced by the preprocessing cells; bail out with a hint if missing.
if 'X_train' not in locals() or 'X_test' not in locals():
    print("Training and test data not available. Please run the preprocessing steps first.")
else:
    # Build and train the forest (fit returns the estimator itself).
    rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

    # Predicted labels for the held-out rows
    y_pred_rf = rf.predict(X_test)

    # Evaluation
    print("\nRandom Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
    print(
        "\nClassification Report:\n",
        classification_report(
            y_test,
            y_pred_rf,
            target_names=['Not Exceeded', 'Exceeded'],
        ),
    )
    print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))


--- Training Random Forest Classifier ---

Random Forest Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

Not Exceeded       1.00      1.00      1.00       282
    Exceeded       1.00      1.00      1.00      1661

    accuracy                           1.00      1943
   macro avg       1.00      1.00      1.00      1943
weighted avg       1.00      1.00      1.00      1943


Confusion Matrix:
 [[ 282    0]
 [   0 1661]]


In [14]:
# Train-Test Split (before scaling)
# Assumes `X` / `y` still hold the unscaled features and target left behind by the
# most recently run preprocessing cell — TODO confirm when running out of order.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale the features: statistics come from the training split only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print("--- Data Preprocessing Complete ---\n")


# --- Compare several classifiers, including boosted ensembles ---

print("--- Training and Evaluating Classifiers ---")

# No "is X_train defined?" guard here: the split is created unconditionally at the
# top of this cell, so such a check could never fail.
models = {
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    # AdaBoost over decision stumps (depth-1 trees)
    "Boosted DT": AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=1, random_state=42), n_estimators=50, random_state=42),
    # AdaBoost over small forests of depth-1 trees
    "Boosted RF": AdaBoostClassifier(estimator=RandomForestClassifier(n_estimators=10, max_depth=1, random_state=42), n_estimators=50, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
}

# Test-set accuracy per model name.
# NOTE(review): the recorded test split is imbalanced (1661 vs 282 rows), so always
# predicting the majority class would already score about 0.85 — read these
# accuracies against that baseline.
results = {}

for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    results[name] = acc
    print(f"{name} Accuracy: {acc:.4f}\n")

# Comparison table: one row per model, in the order the models were defined.
comparison = pd.DataFrame.from_dict(results, orient='index', columns=['Accuracy'])
print("\n--- Final Comparison of Classifiers ---")
print(comparison)

--- Data Preprocessing Complete ---

--- Training and Evaluating Classifiers ---
Training KNN...
KNN Accuracy: 0.9650

Training Naive Bayes...
Naive Bayes Accuracy: 0.9156

Training Boosted DT...
Boosted DT Accuracy: 1.0000

Training Boosted RF...
Boosted RF Accuracy: 1.0000

Training Gradient Boosting...
Gradient Boosting Accuracy: 1.0000


--- Final Comparison of Classifiers ---
                   Accuracy
KNN                0.965003
Naive Bayes        0.915594
Boosted DT         1.000000
Boosted RF         1.000000
Gradient Boosting  1.000000


In [17]:
print("--- Training Boosting Classifiers ---")

# This cell relies on the split created by an earlier cell; stop with a hint if
# either feature matrix is missing from the notebook namespace.
if 'X_train' not in locals() or 'X_test' not in locals():
    print("Training and test data not available. Please run the preprocessing steps first.")
else:
    # AdaBoost with 20 boosting rounds (fit returns the estimator itself)
    ada = AdaBoostClassifier(n_estimators=20, random_state=42).fit(X_train, y_train)
    y_pred_ada = ada.predict(X_test)

    # Gradient boosting with 20 stages
    gb = GradientBoostingClassifier(n_estimators=20, random_state=42).fit(X_train, y_train)
    y_pred_gb = gb.predict(X_test)

    # Test-set accuracy of each ensemble
    print("AdaBoost Accuracy:", accuracy_score(y_test, y_pred_ada))
    print("Gradient Boosting Accuracy:", accuracy_score(y_test, y_pred_gb))


--- Training Boosting Classifiers ---
AdaBoost Accuracy: 1.0
Gradient Boosting Accuracy: 1.0
