In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score

data = pd.read_csv('pr_metrics.csv')

features = ['Time_to_Review', 'Comments', 'Additions', 'Deletions', 'Total_Changes', 'CI_Status', 'Superseded']
X = data[features]
y = data['Is_Merged']

# Preprocessing: Handle categorical variables and scaling
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['Time_to_Review', 'Comments', 'Additions', 'Deletions', 'Total_Changes', 'CI_Status', 'Superseded']),
    ])

# Create a pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)
y_pred_proba = pipeline.predict_proba(X_test)[:, 1]

print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_pred_proba))
print("Classification Report:\n", classification_report(y_test, y_pred))

model = pipeline.named_steps['classifier']
if hasattr(model, 'coef_'):
    feature_names = pipeline.named_steps['preprocessor'].transformers_[0][2]
    feature_importance = pd.Series(model.coef_[0], index=feature_names).sort_values(ascending=False)
    print("Feature Importance:\n", feature_importance)
    

Accuracy: 0.8
ROC-AUC: 0.8112633181126333
Classification Report:
               precision    recall  f1-score   support

       False       0.73      0.41      0.52        27
        True       0.81      0.95      0.87        73

    accuracy                           0.80       100
   macro avg       0.77      0.68      0.70       100
weighted avg       0.79      0.80      0.78       100

Feature Importance:
 CI_Status         0.266399
Comments          0.256577
Deletions         0.046324
Total_Changes    -0.061001
Additions        -0.132055
Superseded       -0.503228
Time_to_Review   -2.086122
dtype: float64
