In [117]:
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.datasets import fetch_openml
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

import os
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt


In [80]:
# Load the raw dataset; the path is resolved relative to the working directory.
data = pd.read_csv(filepath_or_buffer='data.csv')

In [81]:
# Print a summary of the loaded frame: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   type_school            1000 non-null   object 
 1   school_accreditation   1000 non-null   object 
 2   gender                 1000 non-null   object 
 3   interest               1000 non-null   object 
 4   residence              1000 non-null   object 
 5   parent_age             1000 non-null   int64  
 6   parent_salary          1000 non-null   int64  
 7   house_area             1000 non-null   float64
 8   average_grades         1000 non-null   float64
 9   parent_was_in_college  1000 non-null   bool   
 10  in_college             1000 non-null   bool   
dtypes: bool(2), float64(2), int64(2), object(5)
memory usage: 72.4+ KB


In [87]:
# Target is the `in_college` column; the predictors are every other column.
y = data['in_college']
x = data.drop(columns=['in_college'])

In [104]:
# Strip the target column so that `data` holds features only.
# errors='ignore' keeps this cell idempotent: because it reassigns `data` from
# itself, a second run would otherwise raise KeyError (the column is already gone).
data = data.drop(columns=['in_college'], errors='ignore')

In [105]:
# Show the feature-only frame as the cell output (the target column was dropped above).
data

Unnamed: 0,type_school,school_accreditation,gender,interest,residence,parent_age,parent_salary,house_area,average_grades,parent_was_in_college
0,Academic,A,Male,Less Interested,Urban,56,6950000,83.0,84.09,False
1,Academic,A,Male,Less Interested,Urban,57,4410000,76.8,86.91,False
2,Academic,B,Female,Very Interested,Urban,50,6500000,80.6,87.43,False
3,Vocational,B,Male,Very Interested,Rural,49,6600000,78.2,82.12,True
4,Academic,A,Female,Very Interested,Urban,57,5250000,75.1,86.79,False
...,...,...,...,...,...,...,...,...,...,...
995,Vocational,A,Female,Very Interested,Rural,49,7420000,63.6,85.99,True
996,Academic,B,Female,Less Interested,Rural,51,7480000,84.3,89.72,True
997,Vocational,A,Male,Less Interested,Urban,49,5550000,75.2,79.56,False
998,Academic,B,Male,Uncertain,Rural,53,5840000,105.8,87.18,True


In [106]:
# Split the feature columns by dtype so each group gets its own preprocessing.
# The groups are derived from `x` -- the frame that is actually split and passed
# to the pipeline -- rather than from `data`, which only matches the model input
# if the cell that drops `in_college` from it has been executed first.
numeric_features = x.select_dtypes(include=['int64', 'float64']).columns
# Boolean columns are treated as categorical and one-hot encoded with the strings.
categorical_features = x.select_dtypes(include=['object', 'bool']).columns

In [107]:
# Hold out a test set using the default split size; random_state=0 fixes the shuffle.
train_test_parts = model_selection.train_test_split(x, y, random_state=0)
X_train, X_test, y_train, y_test = train_test_parts

In [108]:
# Numeric columns: fill missing values with the column median, then standardize.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(numeric_steps)

# Categorical columns: replace missing values with the literal 'missing' category,
# then one-hot encode; categories unseen during fit are ignored at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(categorical_steps)

In [109]:
# Route each column group through its matching transformer pipeline.
column_routes = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(column_routes)

In [155]:
# Baseline model: shared preprocessing followed by ordinary linear regression.
linear_regression_steps = [
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
]
lr = Pipeline(linear_regression_steps)

In [156]:
# Linear regression result: fit on the training split, score on the held-out split.
# NOTE(review): if `lr` is the LinearRegression pipeline built in the previous cell,
# score() is R^2 rather than accuracy, so this number is not directly comparable
# with the classifier scores reported further down.
lr.fit(X_train, y_train)
test_score = lr.score(X_test, y_test)
print("model score: %.3f" % test_score)

model score: 0.506


In [130]:
from sklearn.linear_model import LogisticRegression

In [157]:
# Same preprocessing, now followed by a logistic-regression classifier.
# The final step keeps the name 'regressor' used by the other model cells.
logistic_regression_steps = [
    ('preprocessor', preprocessor),
    ('regressor', LogisticRegression()),
]
lr = Pipeline(logistic_regression_steps)

In [158]:
# Fit the current pipeline on the training split and report its held-out score.
lr.fit(X_train, y_train)
test_score = lr.score(X_test, y_test)
print("model score: %.3f" % test_score)

model score: 0.836


In [141]:
from sklearn.linear_model import SGDClassifier

In [142]:
# Same preprocessing, followed by a linear classifier trained with stochastic
# gradient descent.
# random_state is pinned (to the same seed as the train/test split) because SGD
# shuffles the training data between epochs; without a seed the reported score
# changes from run to run and the notebook is not reproducible.
# The final step keeps the name 'regressor' used by the other model cells.
lr = Pipeline(steps=[('preprocessor', preprocessor),
                     ('regressor', SGDClassifier(random_state=0))])

In [143]:
# Linear classification: fit on the training split, score on the held-out split.
lr.fit(X_train, y_train)
test_score = lr.score(X_test, y_test)
print("model score: %.3f" % test_score)

model score: 0.756


In [144]:
from sklearn.tree import DecisionTreeClassifier

In [145]:
# Same preprocessing, followed by a decision-tree classifier.
# random_state is pinned (to the same seed as the train/test split) because the
# tree builder permutes features at every split, so equally good splits can be
# resolved differently between runs; seeding makes the reported score repeatable.
# The final step keeps the name 'regressor' used by the other model cells.
lr = Pipeline(steps=[('preprocessor', preprocessor),
                     ('regressor', DecisionTreeClassifier(random_state=0))])

In [146]:
# Decision tree: fit on the training split, score on the held-out split.
lr.fit(X_train, y_train)
test_score = lr.score(X_test, y_test)
print("model score: %.3f" % test_score)

model score: 0.832


In [148]:
from sklearn.svm import SVC

In [149]:
# Same preprocessing, followed by a support-vector classifier with default settings.
# The final step keeps the name 'regressor' used by the other model cells.
svc_steps = [
    ('preprocessor', preprocessor),
    ('regressor', SVC()),
]
lr = Pipeline(svc_steps)

In [150]:
# Support vectors: fit on the training split, score on the held-out split.
lr.fit(X_train, y_train)
test_score = lr.score(X_test, y_test)
print("model score: %.3f" % test_score)

model score: 0.884
