In [None]:
import pandas
import numpy as np
import io
from sklearn.linear_model import LogisticRegression
from sklearn import datasets
from sklearn.model_selection import train_test_split

# Columns used in this analysis: the raw bedroom count is turned into the binary
# target, the remaining columns are the model features.
target_col = 'bedrooms'
feature_cols = ['price', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view']

data = pandas.read_csv('kc_house_data.csv', index_col='id')

# Boolean-mask selection keeps the columns in the order they appear in the CSV;
# rows with any missing value in the selected columns are dropped.
data_sel = data.loc[:, data.columns.isin(feature_cols + [target_col])].dropna()

# Binarise the target: 0 for homes with at most 3 bedrooms, 1 otherwise.
# The condition is evaluated on data_sel (not on the unfiltered `data`), so the
# result has exactly one entry per surviving row even when dropna() removed some.
data_sel = data_sel.assign(**{target_col: np.where(data_sel[target_col] <= 3, 0, 1)})

# Bedrooms is kept as a single-column DataFrame (not a Series) because the
# split halves derived from it are later indexed by column name.
Bedrooms = data_sel.loc[:, data_sel.columns.isin([target_col])]
X = data_sel.loc[:, data_sel.columns.isin(feature_cols)]

# 70/30 train/test split with a fixed seed so the reported metrics are
# reproducible after a kernel restart.
x_train, x_test, y_train, y_test = train_test_split(X, Bedrooms, test_size=0.3, random_state=0)

In [None]:
# L2-regularised logistic regression on the binary bedroom target.
# Alternative the author noted but did not enable: penalty='elasticnet' with
# l1_ratio=0.5, which requires solver='saga'.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases --
# confirm against the installed version before upgrading.
classifier = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial', penalty='l2')
# The estimator expects a 1-D target; flattening the single-column frame avoids
# the DataConversionWarning raised for column-vector y.
classifier.fit(x_train, np.ravel(y_train))

In [None]:
# Label the held-out rows with the fitted logistic regression, then display
# its mean accuracy on the same rows as the cell output.
y_pred = classifier.predict(X=x_test)
classifier.score(X=x_test, y=y_test)

In [None]:
# Sanity checks on the held-out split: number of rows in features and target,
# how many positives are actually present vs. predicted, and the row positions
# of the actual positives.
print(x_test.shape[0])
print(y_test.shape[0])
print(y_test['bedrooms'].sum())
print(y_pred.sum())
positive_rows, _ = np.where(y_test == 1)
print(positive_rows)

In [None]:
# Confusion-matrix counts for the positive class (label 1). Both vectors are
# flattened so the comparison is purely positional: row k of y_test is matched
# with element k of y_pred.
actual = np.asarray(y_test).ravel()
predicted = np.asarray(y_pred).ravel()
actual_pos = actual == 1
actual_neg = actual == 0
predicted_pos = predicted == 1

TP = int(np.count_nonzero(actual_pos & predicted_pos))
FN = int(np.count_nonzero(actual_pos & ~predicted_pos))
FP = int(np.count_nonzero(actual_neg & predicted_pos))
TN = int(np.count_nonzero(actual_neg & ~predicted_pos))
print(TP,FP,FN, TN)

# Each ratio falls back to 0.0 when its denominator is zero (e.g. the model
# never predicts class 1) instead of raising ZeroDivisionError.
precision = TP / (TP + FP) if (TP + FP) else 0.0
recall = TP / (TP + FN) if (TP + FN) else 0.0
F1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0.0
print(precision, recall,F1)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Author's note: best model judging by the metrics -- at least it was at some point...
# 0.7431 0.737996 0.7405  (presumably precision, recall, F1 of that run -- TODO confirm)
#
# 'sqrt' is what 'auto' meant for classifiers; newer scikit-learn releases
# reject the 'auto' spelling outright.
param_grid = {'n_estimators': [100, 150, 200], 'max_features': ['sqrt'], 'max_depth': list(range(1, 20))}

# 5-fold grid search over forest size and depth; refit=True retrains the best
# configuration on the whole training set. The forest is seeded so the selected
# model (and the note above) can be reproduced.
forest = GridSearchCV(estimator=RandomForestClassifier(random_state=0), param_grid=param_grid, cv=5, refit=True)
# Flatten the single-column target to 1-D to avoid a DataConversionWarning on every CV fit.
forest.fit(x_train, np.ravel(y_train))

In [None]:
# Predict the held-out rows with the refitted best forest and show the raw labels.
y_pred = forest.predict(X=x_test)
print(y_pred)

In [None]:
# Confusion-matrix counts for class 1, matching y_test and y_pred by position.
actual = np.asarray(y_test).ravel()
predicted = np.asarray(y_pred).ravel()
actual_pos = actual == 1
actual_neg = actual == 0
predicted_pos = predicted == 1

TP = int(np.count_nonzero(actual_pos & predicted_pos))
FN = int(np.count_nonzero(actual_pos & ~predicted_pos))
FP = int(np.count_nonzero(actual_neg & predicted_pos))
TN = int(np.count_nonzero(actual_neg & ~predicted_pos))
print(TP,FP,FN, TN)

# A zero denominator (no predicted or no actual positives) yields 0.0 rather
# than a ZeroDivisionError.
precision = TP / (TP + FP) if (TP + FP) else 0.0
recall = TP / (TP + FN) if (TP + FN) else 0.0
F1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0.0
print(precision, recall,F1)

In [None]:
# Same search as before but with larger forests (200/350/400 trees).
# 'sqrt' replaces the 'auto' spelling, which newer scikit-learn rejects; for
# classifiers the two select the same number of features.
param_grid = {'n_estimators': [200, 350, 400], 'max_features': ['sqrt'], 'max_depth': list(range(1, 20))}

forest = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, cv=5, refit=True)
# 1-D target avoids a DataConversionWarning on every cross-validation fit.
forest.fit(x_train, np.ravel(y_train))
y_pred = forest.predict(x_test)
print(y_pred)

# Confusion-matrix counts for class 1, matching y_test and y_pred by position.
actual = np.asarray(y_test).ravel()
predicted = np.asarray(y_pred).ravel()
actual_pos = actual == 1
actual_neg = actual == 0
predicted_pos = predicted == 1

TP = int(np.count_nonzero(actual_pos & predicted_pos))
FN = int(np.count_nonzero(actual_pos & ~predicted_pos))
FP = int(np.count_nonzero(actual_neg & predicted_pos))
TN = int(np.count_nonzero(actual_neg & ~predicted_pos))
print(TP,FP,FN, TN)

# Ratios fall back to 0.0 when their denominator is zero.
precision = TP / (TP + FP) if (TP + FP) else 0.0
recall = TP / (TP + FN) if (TP + FN) else 0.0
F1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0.0
print(precision, recall,F1)

In [None]:
# Search over smaller forests (50/100/150 trees) and depths 1..19.
# 'sqrt' replaces the 'auto' spelling, which newer scikit-learn rejects; for
# classifiers the two select the same number of features.
param_grid = {'n_estimators': [50, 100, 150], 'max_features': ['sqrt'], 'max_depth': list(range(1, 20))}

# Seeded forest so the winning configuration is reproducible across runs.
forest = GridSearchCV(estimator=RandomForestClassifier(random_state=0), param_grid=param_grid, cv=5, refit=True)
# 1-D target avoids a DataConversionWarning on every cross-validation fit.
forest.fit(x_train, np.ravel(y_train))

In [None]:
# Predict the held-out rows with the refitted best forest.
y_pred = forest.predict(x_test)
print(y_pred)

# Confusion-matrix counts for class 1, matching y_test and y_pred by position.
actual = np.asarray(y_test).ravel()
predicted = np.asarray(y_pred).ravel()
actual_pos = actual == 1
actual_neg = actual == 0
predicted_pos = predicted == 1

TP = int(np.count_nonzero(actual_pos & predicted_pos))
FN = int(np.count_nonzero(actual_pos & ~predicted_pos))
FP = int(np.count_nonzero(actual_neg & predicted_pos))
TN = int(np.count_nonzero(actual_neg & ~predicted_pos))
print(TP,FP,FN, TN)

# Ratios fall back to 0.0 when their denominator is zero instead of raising
# ZeroDivisionError.
precision = TP / (TP + FP) if (TP + FP) else 0.0
recall = TP / (TP + FN) if (TP + FN) else 0.0
F1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0.0
print(precision, recall,F1)

In [None]:
# Slide a window of three forest sizes upwards in steps of 10 trees:
# [100, 110, 120], [110, 120, 130], ..., [190, 200, 210].
for step in range(10):

    n_estimators_grid = [100 + 10*step, 100 + 10*step + 10, 100 + 10*step + 20]
    # 'sqrt' replaces the 'auto' spelling, which newer scikit-learn rejects.
    param_grid = {'n_estimators': n_estimators_grid, 'max_features': ['sqrt'], 'max_depth': list(range(1, 20))}

    forest = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, cv=5, refit=True)
    # 1-D target avoids a DataConversionWarning on every cross-validation fit.
    forest.fit(x_train, np.ravel(y_train))

    y_pred = forest.predict(x_test)
    print(n_estimators_grid)
    print(y_pred)

    # Confusion-matrix counts for class 1, matched by position. Computed with
    # boolean masks so no inner loop reuses the outer loop variable.
    actual = np.asarray(y_test).ravel()
    predicted = np.asarray(y_pred).ravel()
    actual_pos = actual == 1
    actual_neg = actual == 0
    predicted_pos = predicted == 1

    TP = int(np.count_nonzero(actual_pos & predicted_pos))
    FN = int(np.count_nonzero(actual_pos & ~predicted_pos))
    FP = int(np.count_nonzero(actual_neg & predicted_pos))
    TN = int(np.count_nonzero(actual_neg & ~predicted_pos))
    print(TP,FP,FN, TN)

    # Zero denominators yield 0.0 so one degenerate model cannot abort the sweep.
    precision = TP / (TP + FP) if (TP + FP) else 0.0
    recall = TP / (TP + FN) if (TP + FN) else 0.0
    F1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0.0
    print(precision, recall,F1)

In [None]:
# Shift the [100, 150, 200] grid downwards in steps of 10 trees:
# [100, 150, 200], [90, 140, 190], ..., [60, 110, 160].
for step in range(5):

    n_estimators_grid = [100 - 10*step, 150 - 10*step, 200 - 10*step]
    # 'sqrt' replaces the 'auto' spelling, which newer scikit-learn rejects.
    param_grid = {'n_estimators': n_estimators_grid, 'max_features': ['sqrt'], 'max_depth': list(range(1, 20))}

    forest = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, cv=5, refit=True)
    # 1-D target avoids a DataConversionWarning on every cross-validation fit.
    forest.fit(x_train, np.ravel(y_train))

    y_pred = forest.predict(x_test)
    # Print the grid that was actually searched, so the label always matches
    # the metrics printed below it.
    print(n_estimators_grid)
    print(y_pred)

    # Confusion-matrix counts for class 1, matched by position. Computed with
    # boolean masks so no inner loop reuses the outer loop variable.
    actual = np.asarray(y_test).ravel()
    predicted = np.asarray(y_pred).ravel()
    actual_pos = actual == 1
    actual_neg = actual == 0
    predicted_pos = predicted == 1

    TP = int(np.count_nonzero(actual_pos & predicted_pos))
    FN = int(np.count_nonzero(actual_pos & ~predicted_pos))
    FP = int(np.count_nonzero(actual_neg & predicted_pos))
    TN = int(np.count_nonzero(actual_neg & ~predicted_pos))
    print(TP,FP,FN, TN)

    # Zero denominators yield 0.0 so one degenerate model cannot abort the sweep.
    precision = TP / (TP + FP) if (TP + FP) else 0.0
    recall = TP / (TP + FN) if (TP + FN) else 0.0
    F1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0.0
    print(precision, recall,F1)

In [None]:
# Shift the [100, 150, 200] grid upwards in steps of 10 trees:
# [100, 150, 200], [110, 160, 210], [120, 170, 220].
for step in range(3):

    n_estimators_grid = [100 + 10*step, 150 + 10*step, 200 + 10*step]
    # 'sqrt' replaces the 'auto' spelling, which newer scikit-learn rejects.
    param_grid = {'n_estimators': n_estimators_grid, 'max_features': ['sqrt'], 'max_depth': list(range(1, 20))}

    forest = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, cv=5, refit=True)
    # 1-D target avoids a DataConversionWarning on every cross-validation fit.
    forest.fit(x_train, np.ravel(y_train))

    y_pred = forest.predict(x_test)
    # Print the grid that was actually searched, so the label always matches
    # the metrics printed below it.
    print(n_estimators_grid)
    print(y_pred)

    # Confusion-matrix counts for class 1, matched by position. Computed with
    # boolean masks so no inner loop reuses the outer loop variable.
    actual = np.asarray(y_test).ravel()
    predicted = np.asarray(y_pred).ravel()
    actual_pos = actual == 1
    actual_neg = actual == 0
    predicted_pos = predicted == 1

    TP = int(np.count_nonzero(actual_pos & predicted_pos))
    FN = int(np.count_nonzero(actual_pos & ~predicted_pos))
    FP = int(np.count_nonzero(actual_neg & predicted_pos))
    TN = int(np.count_nonzero(actual_neg & ~predicted_pos))
    print(TP,FP,FN, TN)

    # Zero denominators yield 0.0 so one degenerate model cannot abort the sweep.
    precision = TP / (TP + FP) if (TP + FP) else 0.0
    recall = TP / (TP + FN) if (TP + FN) else 0.0
    F1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0.0
    print(precision, recall,F1)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# One grid search per fixed forest size, tuning only the tree depth (1..19).
for n_trees in [100, 150, 200]:

    # 'sqrt' replaces the 'auto' spelling, which newer scikit-learn rejects.
    param_grid = {'n_estimators': [n_trees], 'max_features': ['sqrt'], 'max_depth': list(range(1, 20))}

    forest = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, cv=5, refit=True)
    # 1-D target avoids a DataConversionWarning on every cross-validation fit.
    forest.fit(x_train, np.ravel(y_train))

    y_pred = forest.predict(x_test)
    print(n_trees)
    print(y_pred)

    # Confusion-matrix counts for class 1, matched by position. Computed with
    # boolean masks so no inner loop reuses the outer loop variable.
    actual = np.asarray(y_test).ravel()
    predicted = np.asarray(y_pred).ravel()
    actual_pos = actual == 1
    actual_neg = actual == 0
    predicted_pos = predicted == 1

    TP = int(np.count_nonzero(actual_pos & predicted_pos))
    FN = int(np.count_nonzero(actual_pos & ~predicted_pos))
    FP = int(np.count_nonzero(actual_neg & predicted_pos))
    TN = int(np.count_nonzero(actual_neg & ~predicted_pos))
    print(TP,FP,FN, TN)

    # Zero denominators yield 0.0 so one degenerate model cannot abort the sweep.
    precision = TP / (TP + FP) if (TP + FP) else 0.0
    recall = TP / (TP + FN) if (TP + FN) else 0.0
    F1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0.0
    print(precision, recall,F1)