# Q33: Predictive Model for Practical Scenario

- Build a predictive model for a practical scenario (student performance, loan approval, disease prediction).
- Emphasize data wrangling, preprocessing, feature selection, and result interpretation.

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [2]:
# Simulate a practical dataset (e.g., disease prediction)
X, y = make_classification(n_samples=1000, n_features=12, n_informative=8, n_redundant=2, n_classes=2, random_state=42)
df = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(12)])
df['target'] = y

In [3]:
# Data wrangling: introduce some missing values
for col in df.columns[:5]:
    df.loc[df.sample(frac=0.05, random_state=42).index, col] = np.nan

In [4]:
# Preprocessing: fill missing values and scale features
df.fillna(df.mean(), inplace=True)
X = df.drop('target', axis=1)
y = df['target']
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [5]:
# Feature selection
selector = SelectKBest(score_func=f_classif, k=6)
X_selected = selector.fit_transform(X_scaled, y)
selected_features = X.columns[selector.get_support()]
print('Selected features:', list(selected_features))

Selected features: ['feature_1', 'feature_3', 'feature_4', 'feature_9', 'feature_10', 'feature_11']


In [6]:
# Train predictive model
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)
model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [7]:
# Result interpretation
print('Classification Report:')
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.59      0.63       110
           1       0.56      0.64      0.60        90

    accuracy                           0.61       200
   macro avg       0.62      0.62      0.61       200
weighted avg       0.62      0.61      0.62       200

