In [None]:
# Importando bibliotecas necessárias
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Load the admissions dataset from the CSV file into a DataFrame.
DATA_PATH = '/content/Admission_PredictV2.csv'

df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows to sanity-check the columns that were loaded.
df.head(n=5)

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.0,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.8
4,5,314,103,2,2.0,3.0,8.21,0,0.65


In [None]:
# Derive the binary classification target from the continuous admission chance:
# 1 when 'Chance of Admit ' is at least 0.8, otherwise 0.
# (The trailing space in the source column name is intentional.)
admit_chance = df['Chance of Admit ']
df['Admit_Category'] = admit_chance.ge(0.8).astype(int)

In [None]:
# Check how balanced the dataset is by counting the rows
# that fall into each value of 'Admit_Category'.
admit_flags = df['Admit_Category']
category_counts = admit_flags.value_counts()
category_counts


0    272
1    128
Name: Admit_Category, dtype: int64

In [None]:
# Build the modelling frame without 'Serial No.' and 'Chance of Admit ':
# neither column is wanted as a classification feature.
columns_to_drop = ['Serial No.', 'Chance of Admit ']
df_model = df.drop(columns=columns_to_drop)

In [None]:
# Split the modelling frame into the target (y) and the feature matrix (X).
target_column = 'Admit_Category'
y = df_model[target_column]
X = df_model.drop(columns=target_column)

In [None]:
# Split the data into training (80%) and test (20%) sets.
# stratify=y keeps the class ratio of the imbalanced target identical in both
# partitions, so the test-set score does not depend on how many positives the
# random draw happened to put in the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:
# Standardise the features. The scaler learns its statistics from the
# training set only and is then applied unchanged to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Balance the classes of the (scaled) training set with SMOTE oversampling.
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train_scaled, y_train)
X_train_balanced, y_train_balanced = resampled


In [None]:
# Containers filled by the tuning loop, both keyed by the estimator class name:
# the best fitted model and its accuracy.
best_models_balanced, best_accuracies_balanced = {}, {}

In [None]:
# Candidate models paired with the hyper-parameter grid to explore for each
# during cross-validated grid search.
# The tree-based estimators are randomised, so they get a fixed random_state;
# without it the selected model and reported scores change on every run.
models_params = [
    (LogisticRegression(), {'C': [0.1, 1, 10]}),
    (DecisionTreeClassifier(random_state=42), {'max_depth': [3, 5, 7, 9]}),
    (RandomForestClassifier(random_state=42), {'n_estimators': [50, 100, 200], 'max_depth': [3, 5, 7]}),
    (SVC(), {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}),
    (KNeighborsClassifier(), {'n_neighbors': [3, 5, 7]})
]

In [None]:
# Tune every candidate model with 5-fold grid search, keep the best fitted
# classifier, and record its accuracy on the held-out test set.
for model, params in models_params:
    model_name = model.__class__.__name__

    # SMOTE is a pipeline step so that it is fitted on the training part of
    # each CV fold only. Running grid search on data that was oversampled up
    # front puts synthetic points (interpolated from training-fold samples)
    # into the validation folds and inflates the cross-validation scores.
    pipeline = Pipeline([
        ('smote', SMOTE(random_state=42)),
        ('model', model),
    ])
    # Grid keys must be addressed to the 'model' step of the pipeline.
    pipeline_params = {'model__' + name: values for name, values in params.items()}

    grid_search = GridSearchCV(pipeline, pipeline_params, cv=5)
    # NOTE: X_train_scaled was standardised on the full training set earlier,
    # so the folds still share scaling statistics.
    grid_search.fit(X_train_scaled, y_train)

    # With the default refit, the best pipeline is retrained on the whole
    # training set (SMOTE applied to it). Store only the fitted classifier so
    # it can be used directly on scaled features.
    best_models_balanced[model_name] = grid_search.best_estimator_.named_steps['model']

    # Evaluate accuracy on the untouched (never resampled) test set.
    y_pred = best_models_balanced[model_name].predict(X_test_scaled)
    best_accuracies_balanced[model_name] = accuracy_score(y_test, y_pred)

print(best_accuracies_balanced)

{'LogisticRegression': 0.9625, 'DecisionTreeClassifier': 0.9375, 'RandomForestClassifier': 0.9625, 'SVC': 0.9625, 'KNeighborsClassifier': 0.9}
