In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn as sk
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, f1_score, accuracy_score
from sklearn.linear_model import LogisticRegression

In [2]:
data_train=pd.read_csv("data/train.csv")
data_test=pd.read_csv("data/test.csv")
data_sample=pd.read_csv("data/sample.csv")

In [4]:
data_train["Liver_disease"]

0      1
1      1
2      1
3      1
4      1
      ..
388    1
389    2
390    1
391    1
392    1
Name: Liver_disease, Length: 393, dtype: int64

In [5]:
data_sample["Liver_disease"].value_counts()

1    165
2     25
Name: Liver_disease, dtype: int64

In [6]:
data_train

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Liver_disease,Id
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.90,1,0
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1,2
3,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.40,1,3
4,26,Female,0.9,0.2,154,16,12,7.0,3.5,1.00,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...
388,32,Male,12.7,8.4,190,28,47,5.4,2.6,0.90,1,388
389,60,Male,0.5,0.1,500,20,34,5.9,1.6,0.37,2,389
390,40,Male,0.6,0.1,98,35,31,6.0,3.2,1.10,1,390
391,52,Male,0.8,0.2,245,48,49,6.4,3.2,1.00,1,391


In [7]:
def prepare_data(data,scaler=None,mode='train'):
    #убираем лишние
    data=data.dropna()
    data=data.drop(["Id"],axis=1)
    data=pd.get_dummies(data,columns=['Gender'])
    #разбиение на х и у 
    X=data.values
    if mode=='train':
        y=data["Liver_disease"].values
        X=data.drop(["Liver_disease"],axis=1).values
    #нормализация
    if scaler==None:
        scaler = MinMaxScaler()
        scaler.fit(X)
    data=scaler.transform(X)
    #биас
    X = np.hstack([np.ones(X.shape[0])[:, np.newaxis], X])
    
    
    
    
    #разбиение на вборки
    if mode=='train':
        X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=1)
        return X_train, X_valid, y_train, y_valid,scaler,X,y
    return X
def print_logisitc_metrics(y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    print(f'acc = {acc:.2f} F1-score = {f1:.2f}')

In [8]:
X_train, X_valid, y_train, y_valid,scaler,X,y=prepare_data(data_train,scaler=None)

In [9]:
clf = RandomForestClassifier(max_depth=5,n_estimators=70)
clf.fit(X_train,y_train)
y_pred=clf.predict(X_valid)

In [10]:
clf = LogisticRegression(random_state=0).fit(X_train,y_train)
y_pred=clf.predict(X_valid)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
print_logisitc_metrics(y_valid, y_pred)

acc = 0.68 F1-score = 0.79


Теперь переобучим модель на всей доступной тренировочной выборке

In [12]:
clf = RandomForestClassifier(max_depth=5,n_estimators=70)
clf.fit(X,y)


подготовим тестовый датасет

In [13]:
X=prepare_data(data_test,scaler=scaler,mode='test')

In [14]:
prediction=clf.predict(X)

In [15]:
data_sample

Unnamed: 0,Id,Liver_disease
0,393,1
1,394,1
2,395,1
3,396,1
4,397,1
...,...,...
185,578,1
186,579,1
187,580,1
188,581,1


In [16]:
data_test

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Id
0,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.00,393
1,46,Male,1.8,0.7,208,19,14,7.6,4.4,1.30,394
2,29,Female,0.9,0.3,202,14,11,6.7,3.6,1.10,395
3,38,Male,1.8,0.8,342,168,441,7.6,4.4,1.30,396
4,63,Male,0.9,0.2,194,52,45,6.0,3.9,1.85,397
...,...,...,...,...,...,...,...,...,...,...,...
185,16,Male,2.6,1.2,236,131,90,5.4,2.6,0.90,578
186,90,Male,1.1,0.3,215,46,134,6.9,3.0,0.70,579
187,32,Male,15.6,9.5,134,54,125,5.6,4.0,2.50,580
188,32,Male,12.1,6.0,515,48,92,6.6,2.4,0.50,581


In [17]:
isBug=pd.Series(y_pred)

In [18]:
isBug.value_counts()

1    66
2    12
dtype: int64

In [19]:
data_sample["Liver_disease"]=prediction


In [20]:
data_sample["Liver_disease"].value_counts()

1    176
2     14
Name: Liver_disease, dtype: int64

In [21]:
submit=data_sample

In [22]:
submit.to_csv (r'submit.csv', index= False )

In [23]:
pd.read_csv("submit.csv")["Liver_disease"].value_counts()

1    176
2     14
Name: Liver_disease, dtype: int64

In [24]:
pd.read_csv("submit.csv")

Unnamed: 0,Id,Liver_disease
0,393,1
1,394,1
2,395,2
3,396,1
4,397,1
...,...,...
185,578,1
186,579,1
187,580,1
188,581,1


In [25]:
pd.read_csv("data/sample.csv")

Unnamed: 0,Id,Liver_disease
0,393,1
1,394,1
2,395,1
3,396,1
4,397,1
...,...,...
185,578,1
186,579,1
187,580,1
188,581,1
