In [4]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split

In [5]:
# Load the Boston housing dataset object shipped with scikit-learn.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell only runs on an older release — confirm the pinned version.
boston = load_boston()

In [7]:
# Raw feature matrix of the dataset; wrapped in a DataFrame (X) in a later cell.
data = boston.data

In [8]:
# Values to predict; they become the 'price' column of y in a later cell.
target = boston.target

In [9]:
# Labels used as the column names of the feature DataFrame.
feature_names = boston.feature_names

In [11]:
# Build a labelled feature table from the raw matrix and preview its first rows.
X = pd.DataFrame(data=data, columns=feature_names)
X.head(n=5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [12]:
# Single-column target frame; later cells rely on it being a DataFrame
# (the random-forest fit slices its first column out explicitly).
y = pd.DataFrame({'price': target})

In [13]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [14]:
from sklearn.linear_model import LinearRegression

In [15]:
# Linear regression with default hyperparameters; its test R2 is compared
# against the random forest further down.
lr = LinearRegression()

In [16]:
# Fit on the training split (70% of the rows, given test_size=0.3 above).
lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [17]:
# Linear-model predictions for the held-out rows.
# NOTE(review): y_pred is rebound by the random-forest cell later on, so these
# predictions are no longer available after that cell runs.
y_pred = lr.predict(X_test)

In [18]:
from sklearn.metrics import r2_score

In [19]:
# Coefficient of determination of the linear model on the held-out split.
r2_score(y_true=y_test, y_pred=y_pred)

0.7109203586326274

# Задание 2

In [21]:
from sklearn.ensemble import RandomForestRegressor

In [22]:
# Random forest of 1000 trees, each limited to depth 12; the seed makes the
# fitted forest reproducible.
rfr = RandomForestRegressor(
    n_estimators=1000,
    max_depth=12,
    random_state=42,
)

In [23]:
# y_train is a one-column DataFrame, so pass its only column as a flat array
# (presumably to avoid the column-vector warning from the forest — confirm).
reg = rfr.fit(X_train, y_train.iloc[:, 0].values)

In [24]:
# Random-forest predictions for the held-out rows.
# NOTE(review): this rebinds y_pred and discards the linear-regression
# predictions stored under the same name by an earlier cell.
y_pred = rfr.predict(X_test)

In [25]:
# Coefficient of determination of the random forest on the held-out split.
r2_score(y_true=y_test, y_pred=y_pred)

0.8758073546581215

В данном случае лучше работает модель RandomForestRegressor: на тестовой выборке R2 = 0.8758073546581215 против R2 = 0.7109203586326274 у LinearRegression.

# Задание 3

In [26]:
# Per-feature importance scores of the fitted forest (reg is the object
# returned by rfr.fit in the previous task).
importance_flags = reg.feature_importances_

In [27]:
# Total of all feature importances. The value is expected to be 1.0 up to
# floating-point rounding (presumably because scikit-learn normalises the
# importances — confirm). Uses the array's own vectorised sum instead of the
# Python built-in, which iterates over the array element by element.
importance_flags_sum = importance_flags.sum()
importance_flags_sum

0.9999999999999991

# Задание 4

In [28]:
# Load the credit-card table. The path is relative, so creditcard.csv must be
# in the notebook's working directory. Its 'Class' column is the label below.
df = pd.read_csv("creditcard.csv")

In [29]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [30]:
# Let pandas display up to 100 columns so wide frames are not truncated.
pd.set_option('display.max_columns', 100)

In [31]:
# Relative frequency of each label; the output shows how imbalanced the
# classes are.
df.loc[:, 'Class'].value_counts(normalize=True)

0    0.998273
1    0.001727
Name: Class, dtype: float64

In [32]:
# Features are every column except the label; the label is the 'Class' column.
# NOTE(review): X and y are rebound here, replacing the Boston frames that
# earlier cells stored under the same names.
X = df.drop('Class', axis=1)
y = df['Class']

In [33]:
# 70/30 split with a fixed seed. Stratifying on y keeps the label proportions
# the same in both parts, which matters for labels this imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=100,
    stratify=y,
)

In [36]:
# Hyperparameter grid for the search: 2 x 2 x 3 = 12 combinations.
parameters = [
    dict(
        n_estimators=[10, 15],
        max_features=np.arange(3, 5),  # 3, 4
        max_depth=np.arange(4, 7),     # 4, 5, 6
    )
]

In [37]:
# Exhaustive search over the grid above, scored by ROC AUC with 3-fold
# cross-validation; the forest's seed is fixed for reproducibility.
clf = GridSearchCV(
    RandomForestClassifier(random_state=100),
    parameters,
    cv=3,
    scoring='roc_auc',
)

In [38]:
# Run the search: 12 parameter combinations x 3 folds. The repr below shows
# refit=True, so the best combination is refit afterwards.
clf.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=100, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'n_estimators': [10, 15], 'max_features': array([3, 4]), 'max_depth': array([4, 5, 6])}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [39]:
# Parameter combination selected by the search (scored with ROC AUC).
# NOTE(review): the reported max_depth=6 and n_estimators=15 are the largest
# values in the grid, so a wider grid may score higher.
clf.best_params_

{'max_depth': 6, 'max_features': 3, 'n_estimators': 15}

In [40]:
# Predicted class probabilities for the held-out rows; the next cell indexes
# the result as a 2-D array.
y_predict = clf.predict_proba(X_test)

In [41]:
# Keep the second column — presumably P(Class == 1), assuming clf.classes_ is
# [0, 1]; confirm.
y_pred_proba = y_predict[:, 1]

In [42]:
from sklearn.metrics import roc_auc_score

In [43]:
# Area under the ROC curve on the held-out split, computed from the predicted
# probabilities rather than hard labels.
AUC = roc_auc_score(y_true=y_test, y_score=y_pred_proba)
AUC

0.9462664156037157