In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, learning_curve, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC

In [2]:
# Read the semicolon-separated, UTF-8 encoded dataset into a DataFrame.
df = pd.read_csv('dataset.csv', sep=';', encoding='utf-8')

# Strip surrounding whitespace from the header names so later lookups by name match.
df.columns = df.columns.str.strip()

# Preview the first rows, then list the cleaned column names.
for preview in (df.head(), df.columns):
    print(preview)

# Columns that the imputation/encoding step treats as categorical.
categorical_cols = [
    'Gender',
    'Marital status',
    'Course',
    "Mother's qualification",
    "Father's qualification",
]

   Marital status  Application mode  Application order  Course  \
0               1                17                  5     171   
1               1                15                  1    9254   
2               1                 1                  5    9070   
3               1                17                  2    9773   
4               2                39                  1    8014   

   Daytime/evening attendance  Previous qualification  \
0                           1                       1   
1                           1                       1   
2                           1                       1   
3                           1                       1   
4                           0                       1   

   Previous qualification (grade)  Nacionality  Mother's qualification  \
0                           122.0            1                      19   
1                           160.0            1                       1   
2                           122.0     

In [None]:
# --- Missing values in the categorical columns ---
# Each listed column that contains nulls is filled with its own most frequent value.
cat_imputer = SimpleImputer(strategy='most_frequent')
for col in categorical_cols:
    if col not in df.columns:
        # Report names that are absent from the DataFrame instead of failing.
        print(f"Columna no encontrada: {col}")
        continue
    if df[col].isnull().any():
        print(f"Imputando columna: {col}")
        # fit_transform returns a 2-D (n_rows, 1) result; flatten it so a plain
        # 1-D column is written back.
        df[col] = np.asarray(cat_imputer.fit_transform(df[[col]])).ravel()

# --- Integer-encode the categorical columns that are still stored as text ---
# fit_transform re-fits the encoder on every column, so after the loop
# `label_encoder` only remembers the classes of the last column it encoded.
label_encoder = LabelEncoder()
for col in categorical_cols:
    if col in df.columns and df[col].dtype == 'object':
        df[col] = label_encoder.fit_transform(df[col])

# Fill missing labels with the most frequent class.
# NOTE(review): this invents targets for unlabeled rows; consider dropping
# those rows instead — confirm with the analysis owner.
df['Target'] = df['Target'].fillna(df['Target'].mode()[0])

# Report the remaining null count per column (printed once).
print("Valores nulos por columna después de imputar 'Target':")
print(df.isnull().sum())

# Separate the features (every column except 'Target') from the label column.
X = df.drop(columns='Target')
y = df['Target']

# Guard: the label vector must be complete before splitting.
missing_labels = y.isnull().sum()
print("Valores nulos en 'y':", missing_labels)
if missing_labels > 0:
    print("Existen valores nulos en 'y'. Imputando valores...")
    # Reassign instead of filling in place: y was taken from df, and an in-place
    # fill on a derived Series is fragile across pandas versions.
    y = y.fillna(y.mode()[0])

# Hold out 30% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Pipeline: standardise the features, then classify with an SVM.
# SVMs are sensitive to feature scale, and the columns here span very different
# magnitudes. Keeping the scaler inside the pipeline means it is re-fit on the
# training part of every CV fold, so no statistics leak from validation data.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', SVC()),
])

# Hyperparameter grid explored by GridSearchCV (3 values of C x 2 kernels).
param_grid = {
    'classifier__C': [0.1, 1, 10],
    'classifier__kernel': ['linear', 'rbf'],
}
grid_search = GridSearchCV(pipeline, param_grid, cv=5)

# Run the 5-fold cross-validated search on the training partition only.
grid_search.fit(X_train, y_train)

# Best pipeline found by the search (GridSearchCV refits it on all of X_train by default).
best_svm = grid_search.best_estimator_

# Predict the held-out test partition.
y_pred = best_svm.predict(X_test)

Valores nulos por columna después de imputar 'Target':
Marital status                                    0
Application mode                                  0
Application order                                 0
Course                                            0
Daytime/evening attendance                        0
Previous qualification                            0
Previous qualification (grade)                    0
Nacionality                                       0
Mother's qualification                            0
Father's qualification                            0
Mother's occupation                               0
Father's occupation                               0
Admission grade                                   0
Displaced                                         0
Educational special needs                         0
Debtor                                            0
Tuition fees up to date                           0
Gender                                            0
Scholarsh

In [None]:

# --- Per-class precision / recall / F1 on the held-out test set ---
print("Reporte de clasificación:")
print(classification_report(y_test, y_pred))

# --- Confusion matrix ---
# Take the class labels from the fitted model and pass the same list to both
# confusion_matrix and the heatmap, so the tick labels always match the matrix
# rows/columns regardless of how many classes Target contains or their order.
class_labels = list(best_svm.classes_)
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title("Matriz de Confusión - SVM")
plt.xlabel("Predicción")
plt.ylabel("Real")
plt.show()

# --- Learning curves ---
# Re-fit the selected pipeline on 5 increasing training-set sizes (10% to 100%)
# with 5-fold CV, then plot the mean score per size for the training and
# validation folds.
train_sizes, train_scores, test_scores = learning_curve(
    best_svm, X, y, cv=5, train_sizes=np.linspace(0.1, 1.0, 5)
)
mean_train_score = np.mean(train_scores, axis=1)
mean_validation_score = np.mean(test_scores, axis=1)
plt.plot(train_sizes, mean_train_score, label="Entrenamiento")
plt.plot(train_sizes, mean_validation_score, label="Prueba")
plt.title("Curvas de Aprendizaje - SVM")
plt.xlabel("Tamaño del conjunto de entrenamiento")
plt.ylabel("Puntuación")
plt.legend()
plt.show()