In [1]:
import pandas as pd
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
import lightgbm as lgbm
import tensorflow as tf
from catboost import CatBoostClassifier, Pool, cv
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')
import sys
np.set_printoptions(threshold=sys.maxsize)

In [2]:
df = pd.read_csv("train_dataset_Самолет.csv")

# Feature engineering

In [3]:
def do_lda(df, lda = LinearDiscriminantAnalysis(n_components=None)):    
    X = df.drop(columns=['target']).fillna(0)   
    y = df['target']    
    X_lda = pd.DataFrame(data=lda.fit_transform(X, y), columns=['LDA_Component'])
    df['LDA'] = pd.DataFrame(X_lda)
    return df


In [4]:
def apply_pca(df, target_col_name, n_components):
    target = df[target_col_name]
    data = df.drop(columns=[target_col_name])

    pca = PCA(n_components=n_components)

    pca_result = pca.fit_transform(data)

    pca_df = pd.DataFrame(data=pca_result, columns=[f'PC{i}' for i in range(1, n_components + 1)])

    pca_df[target_col_name] = target.values

    return pca_df



In [5]:
def add_inverse_and_interactions(df, target_column = "target"):
    eps = 0.00001

    new_df = df.copy()
    
    # Добавляем обратные величины признаков
    for column in df.columns:
        if column != target_column:
            new_column_name = f"Inverse_{column}"
            new_df[new_column_name] = 1 / (df[column]+eps)
    
    new_df2 = new_df.copy()
    # Добавляем попарные произведения признаков
    for i in range(len(new_df.columns)):
        for j in range(i + 1, len(new_df.columns)):
            if new_df.columns[i] != target_column and new_df.columns[j] != target_column:
                new_column_name = f"{new_df.columns[i]}_{new_df.columns[j]}_Product"
                new_df2[new_column_name] = new_df[new_df.columns[i]] * new_df[new_df.columns[j]]
    
    return new_df2

In [6]:
def get_best_features(df, k_features, train_sz=14455):
    X = df.drop(columns=['target'])
    y = df["target"]

    model = CatBoostClassifier(iterations=300, depth=6, learning_rate=0.1, random_state=42, verbose=0)
    model.fit(X.head(train_sz), y.head(train_sz))
    feature_importance = model.get_feature_importance()
    order = np.flip(np.argsort(np.array(feature_importance)))
    best_features = list(np.array(df.columns)[order])[:k_features]

    with open ('best_features.txt', 'w') as f:
        for l in list(zip(list(np.array(df.columns)[order]), feature_importance[order])):
            f.write(str(l[0])+' '+str(l[1])+'\n')

    if not ('target' in best_features):
            best_features.append('target')
            
    return df[best_features]

from xgboost import XGBClassifier
def get_best_features_xgboost(df, k_features):
    X = df.drop(columns=['target'])
    y = df["target"]  

    model = XGBClassifier()
    model.fit(X, y)

    feature_importances = model.feature_importances_
    feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
    feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
    selected_features = feature_importance_df['Feature'].head(k_features).tolist()
    df = df[selected_features + ['target']] 

    with open ('best_features.txt', 'w') as f:
        for l in list(zip(feature_importance_df['Feature'].tolist(), feature_importance_df['Importance'].tolist())):
            f.write(str(l[0])+' '+str(l[1])+'\n')

    return df


import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif
def get_best_features_sklearn(df, k_features):
    X = df.drop(columns=['target']).fillna(0)
    y = df["target"] 

    selector = SelectKBest(score_func=f_classif, k=k_features)
    X_new = selector.fit_transform(X, y)
    selected_columns = list(X.columns[selector.get_support()])
    df_new = df[selected_columns + ['target']]

    with open ('best_features.txt', 'w') as f:
        for l in selected_columns:
            f.write(l+'\n')

    return df_new

def get_pass(df, k_features):               
    return df

In [7]:
def get_df(k_features, k_features_preprocess  = 30, verbose = 1, train_sz=14455):
    nan_percent = 60
    zero_or_nan_percent = 95
    
    df0 = df.drop(columns=['col1454', 'report_date', 'client_id'])
    df0_train = df0.head(train_sz)
    nans = np.array((df0_train.isna()).sum()/df0_train.shape[0]*100.0).round()
    if verbose>0:
        print(f'Удаляем столбцы, где хотя бы {nan_percent}% NaN. Количество до: {nans.shape}', f'количество после:{nans[nans<nan_percent].shape}')
    df1 = df0[df0.columns[nans<nan_percent]]
    df1_train = df1.head(train_sz)
    nz = np.array(((df1_train==0) | (df1_train.isna())).sum()/df0_train.shape[0]*100.0).round()
    if verbose>0:
        print(f'Удаляем столбцы, где {zero_or_nan_percent}% - нули или NaN. Количество до: {nz.shape}', f'количество после:{nz[nz<zero_or_nan_percent].shape}')
    df2 = df1[df1.columns[nz<zero_or_nan_percent]]
    df2['target'] = df['target']
    
    
    #df3 = add_inverse_and_interactions(do_lda(get_best_features_xgboost(df2, k_features_preprocess)))
    final_df = get_best_features(df2, k_features)
    if verbose>0:
        print("data preprocessing done")
    return final_df

# Models

In [8]:
class model:
    def __init__(self, model_params):
        self.model_params = model_params
    def fit(self, X, y):
        pass
    def predict(self, X):
        return np.zeros(X.shape[0])
    def score(self, X, y):
        return 0

In [9]:
class xgb_model(model):
    def __init__(self, model_params):
        self.model_params = model_params
    def fit(self, X, y):
        dtrain = xgb.DMatrix(X, label=y)
        self.model = xgb.train(**self.model_params, dtrain=dtrain)
    def predict(self, X):
        dval = xgb.DMatrix(X)
        return self.model.predict(dval, ntree_limit=self.model.best_iteration)
    def score(self, X, y):
        y_pred = self.predict(X)
        return roc_auc_score(y, y_pred)

In [10]:
class lgbm_model(model):
    def __init__(self, model_params):
        self.model_params = model_params
    def fit(self, X, y):
        self.model = lgbm.LGBMRegressor(**self.model_params).fit(X, y)
    def predict(self, X):
        return self.model.predict(X)
    def score(self, X, y):
        y_pred = self.predict(X)
        return roc_auc_score(y, y_pred)

In [11]:
class rf_model(model):
    def __init__(self, model_params):
        self.model_params = model_params
    def fit(self, X, y):
        self.model = RandomForestClassifier(**self.model_params)
        self.model.fit(X, y)
    def predict(self, X):
        return self.model.predict_proba(X)[:, 1]
    def score(self, X, y):
        y_pred = self.predict(X)
        return roc_auc_score(y, y_pred)

In [12]:
class cat_model(model):
    def __init__(self, model_params):
        self.model_params = model_params
    def fit(self, X, y):
        self.model = CatBoostClassifier(**self.model_params)
        self.model.fit(X, y)
    def predict(self, X):
        return self.model.predict_proba(X)[:, 1]
    def score(self, X, y):
        y_pred = self.predict(X)
        return roc_auc_score(y, y_pred)

In [13]:
class nn_model(model):
    def __init__(self, model_params):
        self.model_params = dict(model_params)
        self.shape=self.model_params['shape']
        del self.model_params['shape']
    def fit(self, X, y):
        self.model = tf.keras.Sequential([
            tf.keras.layers.Dense(64, activation='relu', input_shape=self.shape),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.Dense(32, activation='relu'),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.Dense(16, activation='relu'),
            tf.keras.layers.Dense(1, activation='sigmoid')
            ])
        self.model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), **self.model_params)
        
        self.model.fit(X, y, epochs=100, batch_size=512, verbose=0)
    def predict(self, X):
        return self.model.predict(X, verbose=0)
    def score(self, X, y):
        y_pred = self.predict(X)
        return roc_auc_score(y, y_pred)

In [14]:
class cross_val_model:
    def __init__(self, model, model_params, n_splits = 5, random_state = 42):
        self.models = [model(model_params) for i in range(n_splits)]
        self.n_splits = n_splits
        self.skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    def fit(self, X, y):
        i = 0
        roc_auc_scores = []
        for train_idx, val_idx in self.skf.split(X, y):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

            self.models[i].fit(X_train, y_train)

            roc_auc = self.models[i].score(X_val, y_val)
            roc_auc_scores.append(roc_auc)
            i+=1
        
        return np.mean(roc_auc_scores), np.min(roc_auc_scores)

In [15]:
n_splits=5
random_state=42

# Model tests

In [16]:
dataset = get_df(60)

X = dataset.drop('target', axis=1)
y = dataset['target']

Удаляем столбцы, где хотя бы 60% NaN. Количество до: (2663,) количество после:(352,)
Удаляем столбцы, где 95% - нули или NaN. Количество до: (352,) количество после:(121,)
data preprocessing done


In [17]:
cat_params = {
    'iterations':800,
    'depth':9,
    'learning_rate':0.1,
    'random_state':42,
    'verbose':0
}
cat_dataset = {
    'size':60,
    'norm':False
}

md = cross_val_model(cat_model, cat_params, n_splits, random_state)
print("Mean and lowest scores for CatBoost:",md.fit(X, y))

Mean and lowest scores for CatBoost: (0.9664088900466121, 0.9532359355272435)


In [18]:
xgb_params = {'params':{
        'objective': 'binary:logistic',
        'max_depth': 9,
        'learning_rate': 0.03,
        'seed':42,
         'reg_lambda': 0.12,
    },
    'num_boost_round':1000}
xgb_dataset = {
    'size':60,
    'norm':False
}


md = cross_val_model(xgb_model, xgb_params, n_splits, random_state)
print("Mean and lowest scores for XGBoost:",md.fit(X, y))

Mean and lowest scores for XGBoost: (0.9719566073693798, 0.9558758423926411)


In [19]:
dataset = get_df(100)

X = dataset.drop('target', axis=1)
y = dataset['target']

Удаляем столбцы, где хотя бы 60% NaN. Количество до: (2663,) количество после:(352,)
Удаляем столбцы, где 95% - нули или NaN. Количество до: (352,) количество после:(121,)
data preprocessing done


In [20]:
lgbm_params = {
        'n_estimators': 1000,
        'learning_rate': 0.1,
        'verbose': -1,
    }
lgbm_dataset = {
    'size':100,
    'norm':False
}

md = cross_val_model(lgbm_model, lgbm_params, n_splits, random_state)
print("Mean and lowest scores for LiteGBM:",md.fit(X, y))

Mean and lowest scores for LiteGBM: (0.957596458708786, 0.9478905340482398)


In [21]:
dataset = get_df(80).fillna(-1)

X = dataset.drop('target', axis=1)
y = dataset['target']
X = (X - X.mean()) / X.std()

Удаляем столбцы, где хотя бы 60% NaN. Количество до: (2663,) количество после:(352,)
Удаляем столбцы, где 95% - нули или NaN. Количество до: (352,) количество после:(121,)
data preprocessing done


In [22]:
nn_params = {
    'loss': 'binary_crossentropy', 
    'metrics':['accuracy'],
    'shape':(X.shape[1],)
}
nn_dataset = {
    'size':80,
    'norm':True
}

md = cross_val_model(nn_model, nn_params, n_splits, random_state)
print("Mean and lowest scores for Neural Network:",md.fit(X, y))

Mean and lowest scores for Neural Network: (0.882550491386391, 0.816243625690721)


In [23]:
rf_params = {
    'criterion': 'entropy',
    'random_state': 42,
    'n_estimators': 2000
}
rf_dataset = {
    'size':80,
    'norm':True
}

md = cross_val_model(rf_model, rf_params, n_splits, random_state)
print("Mean and lowest scores for Random Forest:",md.fit(X, y))

Mean and lowest scores for Random Forest: (0.9745959692229491, 0.9581591158771541)


# Model stacking and creating the final model

In [24]:
def prepare_data(size=60, norm=False):
    dataset = get_df(size, verbose=0)
    if norm:
        dataset = dataset.fillna(-1)
    X = dataset.drop('target', axis=1)
    y = dataset['target']
    if norm:
        X = (X - X.mean()) / X.std()
    return X, y

In [25]:
class master_model:
    def __init__(self):
        self.models = [xgb_model(xgb_params), lgbm_model(lgbm_params), rf_model(rf_params)]
        self.dataset_params = [xgb_dataset, lgbm_dataset, rf_dataset]
        self.max_size = 0
        self.n = 3
        for i in range(self.n):
            self.max_size = max(self.max_size, self.dataset_params[i]['size'])
    
    def subset(self, X, Xnorm, i):
        #print(X.columns)
        if self.dataset_params[i]['norm']:
            return Xnorm.iloc[:,:self.dataset_params[i]['size']]
        else:
            return X.iloc[:,:self.dataset_params[i]['size']]
    
    def fit_submodels(self, X, Xnorm, y):
        for i in range(self.n):
            self.models[i].fit(self.subset(X, Xnorm, i), y)
            #print("submodel", i, "fit")
    
    def fit(self, X, Xnorm, y, model_params):
        data = pd.DataFrame()
        for i in range(self.n):
            data[i] = self.models[i].predict(self.subset(X, Xnorm, i))
        self.model = CatBoostClassifier(**model_params)
        self.model.fit(data, y)
        
    def predict_with_submodels(self, X, Xnorm):
        data = pd.DataFrame()
        for i in range(self.n):
            data[i] = self.models[i].predict(self.subset(X, Xnorm, i))
        return data
    
    def predict(self, X, Xnorm):
        data = self.predict_with_submodels(X, Xnorm)
        return self.model.predict_proba(data)[:, 1]
    
    def score_with_submodels(self, X, Xnorm, y):
        y_pred = self.predict_with_submodels(X, Xnorm)
        return [roc_auc_score(y, y_pred.iloc[:, i]) for i in range(self.n)]
    
    def score(self, X, Xnorm, y):
        y_pred = self.predict(X, Xnorm)
        return roc_auc_score(y, y_pred)

In [26]:
class cross_val_master_model:
    def __init__(self, n_splits = 5, random_state = 42):
        self.models = [master_model() for i in range(n_splits)]
        self.skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
        
    def fit_submodels(self):
        X, y = prepare_data(self.models[0].max_size, False)
        Xnorm, _ = prepare_data(self.models[0].max_size, True)
        #print("data prepared")
        i = 0
        roc_auc_scores = []
        for train_idx, val_idx in self.skf.split(X, y):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            Xnorm_train, Xnorm_val = Xnorm.iloc[train_idx], Xnorm.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
            self.models[i].fit_submodels(X_train, Xnorm_train, y_train)
            #print("submodels fit")

            roc_auc = self.models[i].score_with_submodels(X_val, Xnorm_val, y_val)
            roc_auc_scores.append(roc_auc)
            i+=1

        return roc_auc_scores
    
    def fit_model(self, params):
        X, y = prepare_data(self.models[0].max_size, False)
        Xnorm, _ = prepare_data(self.models[0].max_size, True)
        #print("data prepared")
        i = 0
        roc_auc_scores = []
        for train_idx, val_idx in self.skf.split(X, y):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            Xnorm_train, Xnorm_val = Xnorm.iloc[train_idx], Xnorm.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
            self.models[i].fit(X_train, Xnorm_train, y_train, params)
            #print("model fit")

            roc_auc = self.models[i].score(X_val, Xnorm_val, y_val)
            roc_auc_scores.append(roc_auc)
            i+=1

        return np.mean(roc_auc_scores), np.min(roc_auc_scores)

In [27]:
md = cross_val_master_model()
md.fit_submodels()
pass

In [28]:
master_params = {
    'iterations':200,
    'depth':6,
    'learning_rate':0.1,
    'random_state':41,
    'verbose':0
}


print("Mean and lowest scores for the full model:",md.fit_model(master_params))

Mean and lowest scores for the full model: (0.9751173545554417, 0.9581652647285488)


## Forming submission

In [29]:
df = pd.read_csv("train_dataset_Самолет.csv")
train_sz = df.shape[0]
df2 = pd.read_csv("test.csv", sep=';')
test_sz = df2.shape[0]
ids = df2['id']
df = pd.concat([df, df2])

In [30]:
md = master_model()

X, y = prepare_data(md.max_size, False)
Xnorm, _ = prepare_data(md.max_size, True)

X_train, X_test = X.head(train_sz), X.tail(test_sz)
Xnorm_train, Xnorm_test = Xnorm.head(train_sz), Xnorm.tail(test_sz)
y_train = y.head(train_sz)

In [31]:
md.fit_submodels(X_train, Xnorm_train, y_train)
md.fit(X_train, Xnorm_train, y_train, master_params)

In [32]:
pred = md.predict(X_test, Xnorm_test)
df2['target'] = pred
df2[['id', 'target']].to_csv('result.csv', index=False, sep=';')