In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report, confusion_matrix


# Risk-category thresholds applied to the "AI Impact" score:
# [0, 50] -> Low, (50, 70] -> Medium, (70, 100] -> High.
bins = [0, 50, 70, 100]
labels = ["Low", "Medium", "High"]

# Load the regression-stage output and attach the risk label derived from
# the "AI Impact" column (the label is only used as the training target).
df = pd.read_csv("regression_output.csv").assign(
    Risk_Category=lambda frame: pd.cut(
        frame["AI Impact"], bins=bins, labels=labels, include_lowest=True
    )
)

# Scores that are missing or fall outside the bin edges receive no label;
# those rows cannot be used for training, so drop them.
df = df.dropna(subset=["Risk_Category"])


# Numeric model inputs. "Predicted_AI_Impact" is the output of the
# regression step that produced the input file.
num_cols = [
    "Predicted_AI_Impact",
    "Tasks",
    "AI models",
    "AI_Workload_Ratio",
]

# Categorical model inputs.
# NOTE(review): the "Job titiles" spelling presumably mirrors the CSV header;
# do not correct it here without renaming the column in the data as well.
cat_cols = ["Job titiles", "Domain"]

# Feature matrix: the numeric columns followed by the categorical ones.
X = df[num_cols + cat_cols]

# Target: the binned risk label.
y = df["Risk_Category"]


# Preprocessing: standardize the numeric columns and one-hot encode the
# categorical ones. handle_unknown="ignore" keeps predict() from failing on
# categories that were not present when the encoder was fitted.
numeric_step = ("num", StandardScaler(), num_cols)
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Full pipeline: preprocessing followed by a 300-tree random forest with a
# fixed seed so repeated runs give the same model.
forest = RandomForestClassifier(n_estimators=300, random_state=42)
model = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", forest)])


# Hold out 20% of the rows for evaluation; stratifying on y keeps the class
# proportions the same in the training and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the pipeline on the training part, then predict the held-out part.
y_pred = model.fit(X_train, y_train).predict(X_test)

# No explicit label order is passed, so both the report rows and the
# confusion-matrix rows/columns come out in sorted label order
# (High, Low, Medium), not in Low/Medium/High severity order.
for heading, metric_fn in (
    ("\n--- CLASSIFICATION REPORT ---", classification_report),
    ("\n--- CONFUSION MATRIX ---", confusion_matrix),
):
    print(heading)
    print(metric_fn(y_test, y_pred))


# Predict a risk category for every labelled row. This includes the rows the
# model was trained on, so agreement between "Risk_Category" and
# "Model_Prediction" in the exported file overstates held-out accuracy.
df["Model_Prediction"] = model.predict(X)

# Columns written to the final report, in output order: the inputs, the true
# and regressed impact scores, the derived label, and the model's prediction.
export_columns = [
    "Job titiles",
    "Domain",
    "Tasks",
    "AI models",
    "AI_Workload_Ratio",
    "AI Impact",
    "Predicted_AI_Impact",
    "Risk_Category",
    "Model_Prediction",
]
out = df[export_columns]

# Write the report without the DataFrame index column.
out.to_csv("final_risk_predictions.csv", index=False)



--- CLASSIFICATION REPORT ---
              precision    recall  f1-score   support

        High       0.80      0.30      0.43        27
         Low       0.97      0.98      0.97       824
      Medium       0.70      0.71      0.70        89

    accuracy                           0.94       940
   macro avg       0.82      0.66      0.70       940
weighted avg       0.94      0.94      0.93       940


--- CONFUSION MATRIX ---
[[  8   3  16]
 [  2 811  11]
 [  0  26  63]]
