In [2]:
# === Importing Libraries ===
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.utils import resample

# === Data Preparation ===
data = {
    "Feature1": [5, np.nan, 10, 15, 20, 25, np.nan],
    "Feature2": [1, 2, 3, 4, 5, 6, 7],
    "Target": [0, 1, 0, 1, 0, 1, 0]
}
df = pd.DataFrame(data)

# === Imputation ===
imputer = SimpleImputer(strategy='mean')
df['Feature1'] = imputer.fit_transform(df[['Feature1']])

# === Handling Outliers ===
df['Feature1'] = np.clip(df['Feature1'], a_min=0, a_max=20)

# === Feature Scaling ===
scaler = StandardScaler()
df[['Feature1', 'Feature2']] = scaler.fit_transform(df[['Feature1', 'Feature2']])

# === Feature Selection ===
X = df[['Feature1', 'Feature2']]
y = df['Target']
selector = SelectKBest(score_func=f_classif, k=1)
X_selected = selector.fit_transform(X, y)

# === Data Sampling ===
# Probability sampling
df_sampled = df.sample(frac=0.5, random_state=42)

# Non-probability sampling
df_non_prob_sampled = df[df['Feature1'] > 0]

# === Handling Imbalanced Dataset ===
df_majority = df[df['Target'] == 0]
df_minority = df[df['Target'] == 1]
df_minority_upsampled = resample(df_minority, 
                                 replace=True, 
                                 n_samples=len(df_majority), 
                                 random_state=42)
df_balanced = pd.concat([df_majority, df_minority_upsampled])

# === Splitting the Data ===
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# === Cross Validating ===
model = RandomForestClassifier()
cv = StratifiedKFold(n_splits=min(5, y_train.value_counts().min()), shuffle=True, random_state=42)
cv_scores = cross_val_score(model, X_train, y_train, cv=cv)
print("Cross-validation scores:", cv_scores)

# === Classification ===
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# === Binary Classification Metrics ===
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print("Binary Classification Metrics:")
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Accuracy:", accuracy)

# === Regression ===
data_regression = {
    "Feature1": [1, 2, 3, 4, 5],
    "Feature2": [5, 6, 7, 8, 9],
    "Target": [10, 12, 14, 16, 18]
}
df_regression = pd.DataFrame(data_regression)

X_reg = df_regression[['Feature1', 'Feature2']]
y_reg = df_regression['Target']
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)

reg_model = LinearRegression()
reg_model.fit(X_train_reg, y_train_reg)
y_pred_reg = reg_model.predict(X_test_reg)

# === Regression Evaluation Metrics ===
mse = mean_squared_error(y_test_reg, y_pred_reg)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test_reg, y_pred_reg)
r2 = r2_score(y_test_reg, y_pred_reg)
print("Regression Metrics:")
print("MSE:", mse)
print("RMSE:", rmse)
print("MAE:", mae)
print("R-squared:", r2)

Cross-validation scores: [0.33333333 0.5       ]
Binary Classification Metrics:
Precision: 0.0
Recall: 0.0
F1 Score: 0.0
Accuracy: 0.5
Regression Metrics:
MSE: 0.0
RMSE: 0.0
MAE: 0.0
R-squared: nan


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
