# In [None]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns
import umap
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Load the labelled feature table. Every column other than 'id', 'bug_type'
# and 'species' is used as a feature; 'bug_type' (cast to str) is the target.
data = pd.read_csv('outputs/features_train_labeled.csv')
feat_cols = data.columns[~data.columns.isin(['id', 'bug_type', 'species'])].tolist()
X = data[feat_cols].values
y = data['bug_type'].astype(str).values

# Class distribution: bar chart counting the rows of each bug type.
count_ax = sns.countplot(data=data, x='bug_type')
count_ax.set_title("Répartition des bug types")
plt.show()

# PCA: standardize the features (Xz is reused by the later embedding and
# clustering steps), project onto two principal components and colour each
# point by its integer-encoded bug type.
Xz = StandardScaler().fit_transform(X)
pca2 = PCA(n_components=2).fit_transform(Xz)
pca_colors = pd.factorize(y)[0]
plt.scatter(*pca2.T, c=pca_colors)  # rows of pca2.T are the two components
plt.title("PCA (2D)")
plt.show()

# t-SNE & UMAP: non-linear 2-D embeddings of the standardized features,
# each point coloured by its integer-encoded bug type.
embed_colors = pd.factorize(y)[0]

# random_state is pinned so the t-SNE figure is reproducible between runs,
# consistent with the fixed seed already used for UMAP below.
ts = TSNE(n_components=2, init='pca', perplexity=30, learning_rate='auto',
          random_state=0).fit_transform(Xz)
plt.scatter(ts[:, 0], ts[:, 1], c=embed_colors)
plt.title("t-SNE")
plt.show()

u = umap.UMAP(n_components=2, n_neighbors=15, min_dist=0.1, random_state=0).fit_transform(Xz)
plt.scatter(u[:, 0], u[:, 1], c=embed_colors)
plt.title("UMAP")
plt.show()

# Supervised models, compared by macro-F1 under stratified 5-fold
# cross-validation (shuffled, fixed seed).
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

models = {
    "LogReg": LogisticRegression(max_iter=2000),
    "SVM_lin": SVC(kernel='linear', probability=True),
    "SVM_rbf": SVC(kernel='rbf', probability=True),
    "RF": RandomForestClassifier(n_estimators=400, random_state=0)
}
for name, clf in models.items():
    # The scaler is fitted inside each training fold via a pipeline on the raw
    # features. Cross-validating on the globally standardized Xz would let the
    # held-out rows influence the scaling statistics (data leakage).
    fold_model = make_pipeline(StandardScaler(), clf)
    scores = cross_val_score(fold_model, X, y, cv=cv, scoring='f1_macro')
    print(name, "F1_macro:", scores.mean().round(3), "+/-", scores.std().round(3))

# Grid search (the required "optimized" method): tune an RBF SVM on macro-F1
# using the same CV splitter as the model comparison.
# The scaler lives inside the pipeline so it is re-fitted on each training
# fold; searching on the globally standardized Xz would leak held-out rows
# into the scaling statistics. As a result gs.best_estimator_ expects raw
# (unscaled) features, and parameter names carry the 'svc__' step prefix.
# NOTE(review): on standardized inputs gamma='scale' and gamma='auto' should
# be nearly identical, so this axis may add little — consider numeric values.
param = {'svc__C': [0.1, 1, 10], 'svc__gamma': ['scale', 'auto'], 'svc__kernel': ['rbf']}
gs = GridSearchCV(
    make_pipeline(StandardScaler(), SVC(probability=True)),
    param_grid=param,
    scoring='f1_macro',
    cv=cv,
    n_jobs=-1,
)
gs.fit(X, y)
# Strip the pipeline step prefix so the report reads {'C': ..., 'gamma': ..., 'kernel': ...}.
best_svm_params = {key.split('__', 1)[-1]: value for key, value in gs.best_params_.items()}
print("Best SVM:", best_svm_params, gs.best_score_.round(3))

# Clustering: K-means inertia for k = 2..5, then Ward-linkage agglomerative
# clustering with 3 clusters, both on the standardized features.
for k in [2, 3, 4, 5]:
    # Seed the centroid initialisation so the reported inertia is reproducible.
    km = KMeans(n_clusters=k, n_init='auto', random_state=0).fit(Xz)
    print("KMeans", k, "inertia:", km.inertia_)
agg = AgglomerativeClustering(n_clusters=3, linkage='ward').fit(Xz)
# Print the first per-sample assignments, as the message announces; the
# previous np.unique(...) only listed the distinct cluster ids.
print("Agglomerative labels (head):", agg.labels_[:5])
