In [73]:
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.datasets import fetch_openml
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import confusion_matrix, precision_score, \
              recall_score, f1_score, precision_recall_curve, roc_curve, roc_auc_score, \
              auc, mean_squared_error
import os
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the raw records from the relative path 'data.csv' into a DataFrame.
data = pd.read_csv(filepath_or_buffer='data.csv')

In [3]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   type_school            1000 non-null   object 
 1   school_accreditation   1000 non-null   object 
 2   gender                 1000 non-null   object 
 3   interest               1000 non-null   object 
 4   residence              1000 non-null   object 
 5   parent_age             1000 non-null   int64  
 6   parent_salary          1000 non-null   int64  
 7   house_area             1000 non-null   float64
 8   average_grades         1000 non-null   float64
 9   parent_was_in_college  1000 non-null   bool   
 10  in_college             1000 non-null   bool   
dtypes: bool(2), float64(2), int64(2), object(5)
memory usage: 72.4+ KB


In [4]:
# Preview the first rows to sanity-check the parsed values.
data.head()

Unnamed: 0,type_school,school_accreditation,gender,interest,residence,parent_age,parent_salary,house_area,average_grades,parent_was_in_college,in_college
0,Academic,A,Male,Less Interested,Urban,56,6950000,83.0,84.09,False,True
1,Academic,A,Male,Less Interested,Urban,57,4410000,76.8,86.91,False,True
2,Academic,B,Female,Very Interested,Urban,50,6500000,80.6,87.43,False,True
3,Vocational,B,Male,Very Interested,Rural,49,6600000,78.2,82.12,True,True
4,Academic,A,Female,Very Interested,Urban,57,5250000,75.1,86.79,False,False


In [5]:
# Separate the boolean target column from the predictor columns.
y = data['in_college']
x = data.drop(columns=['in_college'])

In [6]:
# Keep only the feature columns in `data` (it is used below to pick columns
# by dtype). `errors='ignore'` makes this cell idempotent: because it rebinds
# `data`, a second run would otherwise raise KeyError on the missing column.
data = data.drop(columns=['in_college'], errors='ignore')

In [37]:
# Re-inspect the frame after the target column has been dropped.
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   type_school            1000 non-null   object 
 1   school_accreditation   1000 non-null   object 
 2   gender                 1000 non-null   object 
 3   interest               1000 non-null   object 
 4   residence              1000 non-null   object 
 5   parent_age             1000 non-null   int64  
 6   parent_salary          1000 non-null   int64  
 7   house_area             1000 non-null   float64
 8   average_grades         1000 non-null   float64
 9   parent_was_in_college  1000 non-null   bool   
dtypes: bool(1), float64(2), int64(2), object(5)
memory usage: 71.4+ KB


In [38]:
# Partition the feature columns by dtype: int/float columns feed the numeric
# branch of the preprocessing, object/bool columns feed the categorical branch.
numeric_features, categorical_features = (
    data.select_dtypes(include=dtypes).columns
    for dtypes in (['int64', 'float64'], ['object', 'bool'])
)

In [39]:
# Split features and target into train/test parts using the library's default
# split size; the fixed seed keeps the partition identical across runs.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    x,
    y,
    random_state=0,
)

In [40]:
# Numeric branch: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the literal 'missing', then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [41]:
# Route each column group through its own transformer pipeline.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [42]:
# Linear-regression model stacked on the shared preprocessing step.
lr = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

In [43]:
# Fit on the training split and report the score on the held-out split.
# NOTE(review): for LinearRegression `score` is R^2, not accuracy, and the
# target here is boolean - confirm this figure is meant as a baseline only.
lr.fit(X_train, y_train).score(X_test, y_test)

0.5063702867700096

In [48]:
from sklearn.linear_model import LogisticRegression


In [49]:
# Logistic-regression classifier stacked on the shared preprocessing step.
# The estimator is instantiated inline so this cell does not depend on a
# pre-existing `logReg` variable (which would raise NameError on a fresh kernel).
logReg = Pipeline(steps=[('preprocessor', preprocessor),
                         ('regressor', LogisticRegression())])

In [51]:
# Fit on the training split and report mean accuracy on the held-out split.
logReg.fit(X_train, y_train).score(X_test, y_test)

0.836

In [53]:
from sklearn.linear_model import SGDClassifier

In [54]:
# SGD-trained linear classifier stacked on the shared preprocessing step.
# SGDClassifier shuffles the training data using its random state; seeding it
# (same seed as the train/test split) makes the reported score reproducible.
SGD_cl = Pipeline(steps=[('preprocessor', preprocessor),
                         ('regressor', SGDClassifier(random_state=0))])

In [55]:
# Fit on the training split and report mean accuracy on the held-out split.
SGD_cl.fit(X_train, y_train).score(X_test, y_test)

model score: 0.816


In [62]:
from sklearn.tree import DecisionTreeClassifier

In [66]:
# Decision-tree classifier stacked on the shared preprocessing step.
# Tree construction permutes features randomly when searching for splits, so
# the estimator is seeded (same seed as the train/test split) to keep the
# fitted tree and its score reproducible across runs.
dt_cl = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', DecisionTreeClassifier(random_state=0))])

In [67]:
# Fit on the training split and report mean accuracy on the held-out split.
dt_cl.fit(X_train, y_train).score(X_test, y_test)

0.82

In [69]:
from sklearn.svm import SVC

In [70]:
# Support-vector classifier (default settings) stacked on the shared
# preprocessing step.
svc_cl = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', SVC()),
])

In [72]:
# Fit on the training split and report mean accuracy on the held-out split.
svc_cl.fit(X_train, y_train).score(X_test, y_test)

0.884