In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import r2_score,mean_squared_error, mean_absolute_error
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVR
from sklearn.linear_model import LogisticRegression, Ridge, Lasso
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv('data/data.csv')
df.head(3)

Unnamed: 0,iq,prev_sem_result,cgpa,academic_performance,internship_experience,extra_curricular_score,communication_skills,projects_completed,placement
0,107,6.61,6.28,8,No,8,8,4,No
1,97,5.52,5.37,8,No,7,8,0,No
2,109,5.36,5.83,9,No,3,1,1,No


In [4]:
X = df.drop("placement",axis=1)
y = df['placement']

In [5]:
X.head()

Unnamed: 0,iq,prev_sem_result,cgpa,academic_performance,internship_experience,extra_curricular_score,communication_skills,projects_completed
0,107,6.61,6.28,8,No,8,8,4
1,97,5.52,5.37,8,No,7,8,0
2,109,5.36,5.83,9,No,3,1,1
3,122,5.47,5.75,6,Yes,1,6,1
4,96,7.91,7.69,7,No,8,10,2


In [6]:
num_features = X.select_dtypes(include='number').columns
cat_features = X.select_dtypes(exclude='number').columns

from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer

num_transformer = StandardScaler()
ohe_transformer = OneHotEncoder()
le_transformer = LabelEncoder()

preprocessor = ColumnTransformer([
    ("LabelEncoder", ohe_transformer, cat_features),
    ("StandardScaler", num_transformer, num_features),
])

In [7]:
X = preprocessor.fit_transform(X)
y = le_transformer.fit_transform(y)

In [8]:
X.shape

(10000, 9)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((8000, 9), (2000, 9), (8000,), (2000,))

In [None]:
def evaluate_model(true, predicted):
    mse = mean_squared_error(true,predicted)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(true,predicted)
    r2_square = r2_score(true,predicted)

    return rmse, mae, r2_square

In [13]:
models = {
    "LogisticRegression": LogisticRegression(),
    "Lasso" : Lasso(),
    "Ridge" : Ridge(),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'DecisionTreeClassifier': DecisionTreeClassifier(),
    'RandomForestClassifier': RandomForestClassifier(),
    'AdaBoostClassifier': AdaBoostClassifier(),
    'CatBoostClassifier': CatBoostClassifier(),
    'XGBClassifier': XGBClassifier(),
}

model_list = []
r2_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]

    model.fit(X_train,y_train)

    yhat_train = model.predict(X_train)
    yhat_test = model.predict(X_test)

    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, yhat_train)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, yhat_test)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print("Model Performance for Training set")
    print('- Root Mean Squared Error: {:.4f}'.format(model_train_rmse))
    print('- Mean Absolute Error: {:.4f}'.format(model_train_mae))
    print('- R2 Score: {:.8f}'.format(model_train_r2))

    print('--------------------------------------------------------------')

    print("Model Performance for Testing set")
    print('- Root Mean Squared Error: {:.4f}'.format(model_test_rmse))
    print('- Mean Absolute Error: {:.4f}'.format(model_test_mae))
    print('- R2 Score: {:.8f}'.format(model_test_r2))
    r2_list.append(model_test_r2)

    print('='*35)
    print('\n')

LogisticRegression
Model Performance for Training set
- Root Mean Squared Error: 0.0974
- Mean Absolute Error: 0.3120
- R2 Score: 0.29323172
--------------------------------------------------------------
Model Performance for Testing set
- Root Mean Squared Error: 0.1055
- Mean Absolute Error: 0.3248
- R2 Score: 0.25054921


Lasso
Model Performance for Training set
- Root Mean Squared Error: 0.2756
- Mean Absolute Error: 0.3712
- R2 Score: 0.00000000
--------------------------------------------------------------
Model Performance for Testing set
- Root Mean Squared Error: 0.2786
- Mean Absolute Error: 0.3752
- R2 Score: -0.00014385


Ridge
Model Performance for Training set
- Root Mean Squared Error: 0.2379
- Mean Absolute Error: 0.3008
- R2 Score: 0.34336771
--------------------------------------------------------------
Model Performance for Testing set
- Root Mean Squared Error: 0.2443
- Mean Absolute Error: 0.3084
- R2 Score: 0.32416735


KNeighborsClassifier
Model Performance for T

In [14]:
pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name','R2_Score']).sort_values(by='R2_Score', ascending=False)

Unnamed: 0,Model Name,R2_Score
4,DecisionTreeClassifier,1.0
8,XGBClassifier,1.0
7,CatBoostClassifier,1.0
6,AdaBoostClassifier,1.0
5,RandomForestClassifier,1.0
3,KNeighborsClassifier,0.552461
2,Ridge,0.324167
0,LogisticRegression,0.250549
1,Lasso,-0.000144


In [15]:
best_model = DecisionTreeClassifier()
best_model.fit(X_train,y_train)
yhat_test = best_model.predict(X_test)
score = r2_score(y_test, yhat_test)*100
print("Accuracy of the model is %.2f" %score)

Accuracy of the model is 100.00
