In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import seaborn as sns
from matplotlib import pyplot as plt

## Gathering all data into one dataframe, sort it by date and resplit like KFold by date

In [2]:
training_csv = pd.read_csv("DontGetKicked/training.csv")
test_csv = pd.read_csv("DontGetKicked/test.csv")

complete_csv = pd.concat([training_csv, test_csv], ignore_index=True)
complete_csv['PurchDate'] = pd.to_datetime(complete_csv['PurchDate'], format='%m/%d/%Y')
sorted_csv = complete_csv.sort_values(by=["PurchDate"])

sorted_csv.drop(columns=["PurchDate", "RefId"], inplace=True)

In [3]:
sorted_csv.head(1)

Unnamed: 0,IsBadBuy,Auction,VehYear,VehicleAge,Make,Model,Trim,SubModel,Color,Transmission,...,MMRCurrentRetailAveragePrice,MMRCurrentRetailCleanPrice,PRIMEUNIT,AUCGUART,BYRNO,VNZIP1,VNST,VehBCost,IsOnlineSale,WarrantyCost
32366,0.0,MANHEIM,2005,4,CHRYSLER,SEBRING V6 2.7L V6 M,Tou,4D SEDAN LXI,WHITE,AUTO,...,6045.0,7561.0,,,22916,80022,CO,5850.0,0,905


In [4]:
sorted_csv.columns

Index(['IsBadBuy', 'Auction', 'VehYear', 'VehicleAge', 'Make', 'Model', 'Trim',
       'SubModel', 'Color', 'Transmission', 'WheelTypeID', 'WheelType',
       'VehOdo', 'Nationality', 'Size', 'TopThreeAmericanName',
       'MMRAcquisitionAuctionAveragePrice', 'MMRAcquisitionAuctionCleanPrice',
       'MMRAcquisitionRetailAveragePrice', 'MMRAcquisitonRetailCleanPrice',
       'MMRCurrentAuctionAveragePrice', 'MMRCurrentAuctionCleanPrice',
       'MMRCurrentRetailAveragePrice', 'MMRCurrentRetailCleanPrice',
       'PRIMEUNIT', 'AUCGUART', 'BYRNO', 'VNZIP1', 'VNST', 'VehBCost',
       'IsOnlineSale', 'WarrantyCost'],
      dtype='object')

In [5]:
categorical_columns = sorted_csv.select_dtypes(include=["object", "category"]).columns
categorical_columns

Index(['Auction', 'Make', 'Model', 'Trim', 'SubModel', 'Color', 'Transmission',
       'WheelType', 'Nationality', 'Size', 'TopThreeAmericanName', 'PRIMEUNIT',
       'AUCGUART', 'VNST'],
      dtype='object')

In [6]:
from sklearn.preprocessing import OneHotEncoder

pd.get_dummies(sorted_csv, columns=categorical_columns, drop_first=True).head(1)

Unnamed: 0,IsBadBuy,VehYear,VehicleAge,WheelTypeID,VehOdo,MMRAcquisitionAuctionAveragePrice,MMRAcquisitionAuctionCleanPrice,MMRAcquisitionRetailAveragePrice,MMRAcquisitonRetailCleanPrice,MMRCurrentAuctionAveragePrice,...,VNST_OR,VNST_PA,VNST_SC,VNST_TN,VNST_TX,VNST_UT,VNST_VA,VNST_WA,VNST_WI,VNST_WV
32366,0.0,2005,4,2.0,44634,4243.0,5530.0,5082.0,6472.0,5134.0,...,False,False,False,False,False,False,False,False,False,False


In [7]:
for col in categorical_columns:
    print(f"{col}: {sorted_csv[col].nunique()}")

Auction: 3
Make: 33
Model: 1130
Trim: 137
SubModel: 933
Color: 17
Transmission: 3
WheelType: 3
Nationality: 4
Size: 12
TopThreeAmericanName: 4
PRIMEUNIT: 2
AUCGUART: 2
VNST: 38


In [8]:
sorted_csv["Model"].value_counts()

Model
PT CRUISER              4142
IMPALA                  3270
TAURUS                  2377
CARAVAN GRAND FWD V6    2209
MALIBU 4C               2025
                        ... 
SX4 2.0L I4 EFI            1
RENDEZVOUS AWD 3.5L        1
MIATA MX-5 1.8L I4 E       1
MALIBU 4C 2.4L I4 MP       1
ARMADA 4WD V8              1
Name: count, Length: 1130, dtype: int64

## Уменьшаем выбросы и оставляем модели со встречаемостью от 50

In [9]:
model_counts = sorted_csv["Model"].value_counts()
frequent_models = model_counts[model_counts > 50].index
sorted_csv["Model"] = np.where(sorted_csv["Model"].isin(frequent_models), sorted_csv["Model"], "Other")
sorted_csv["Model"].value_counts()

Model
Other                   8628
PT CRUISER              4142
IMPALA                  3270
TAURUS                  2377
CARAVAN GRAND FWD V6    2209
                        ... 
ALERO V6 3.4L V6 MPI      52
COBALT 2.2L I4 MPI /      51
MONTEGO 3.0L V6 EFI       51
LE SABRE 3.8L V6 MFI      51
BEETLE                    51
Name: count, Length: 316, dtype: int64

## Идентично для SubModel

In [10]:
sorted_csv["SubModel"].value_counts()

SubModel
4D SEDAN                          25445
4D SEDAN LS                        7723
4D SEDAN SE                        6422
4D WAGON                           3062
MINIVAN 3.3L                       2099
                                  ...  
CREW CAB 3.4L PRERUNNER               1
EXT CAB 5.6L XE                       1
2D EXT CAB 2.5L                       1
4D SUV 6.8L XLT                       1
2D EXT CAB 3.0L DUAL SPORT FFV        1
Name: count, Length: 933, dtype: int64

In [11]:
submodel_counts = sorted_csv["SubModel"].value_counts()
frequent_submodels = submodel_counts[submodel_counts > 50].index
filtered_csv = sorted_csv[sorted_csv["SubModel"].isin(frequent_submodels)]
filtered_csv["SubModel"].value_counts().head(300)

SubModel
4D SEDAN              25445
4D SEDAN LS            7723
4D SEDAN SE            6422
4D WAGON               3062
MINIVAN 3.3L           2099
                      ...  
2D CONVERTIBLE GTC       53
4D SUV 4.6L XLS          52
4D WAGON S               52
4D SUV SPORT 2.4L        52
4D SUV 4.7L SXT          52
Name: count, Length: 237, dtype: int64

In [12]:
for col in categorical_columns:
    print(f"{col}: {filtered_csv[col].nunique()}")

Auction: 3
Make: 32
Model: 315
Trim: 87
SubModel: 237
Color: 17
Transmission: 3
WheelType: 3
Nationality: 4
Size: 12
TopThreeAmericanName: 4
PRIMEUNIT: 2
AUCGUART: 2
VNST: 37


In [13]:
sorted_csv = pd.get_dummies(filtered_csv, columns=categorical_columns, drop_first=True)
sorted_csv.head(1)

Unnamed: 0,IsBadBuy,VehYear,VehicleAge,WheelTypeID,VehOdo,MMRAcquisitionAuctionAveragePrice,MMRAcquisitionAuctionCleanPrice,MMRAcquisitionRetailAveragePrice,MMRAcquisitonRetailCleanPrice,MMRCurrentAuctionAveragePrice,...,VNST_OK,VNST_OR,VNST_PA,VNST_SC,VNST_TN,VNST_TX,VNST_UT,VNST_VA,VNST_WA,VNST_WV
32366,0.0,2005,4,2.0,44634,4243.0,5530.0,5082.0,6472.0,5134.0,...,False,False,False,False,False,False,False,False,False,False


In [14]:
sorted_csv = sorted_csv[sorted_csv['IsBadBuy'].notnull()]

## СТАРЫБОХХХХХ ПОГНАЛИ БИТЬ МОДЕЛИ ЛИЦО НА КУСКИ

## чет много нулов. Выборка ты че

In [15]:
sorted_csv.isnull().sum()[sorted_csv.isnull().sum() > 0]

WheelTypeID                          2975
MMRAcquisitionAuctionAveragePrice      16
MMRAcquisitionAuctionCleanPrice        16
MMRAcquisitionRetailAveragePrice       16
MMRAcquisitonRetailCleanPrice          16
MMRCurrentAuctionAveragePrice         298
MMRCurrentAuctionCleanPrice           298
MMRCurrentRetailAveragePrice          298
MMRCurrentRetailCleanPrice            298
dtype: int64

In [16]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="median")

sorted_csv = pd.DataFrame(imputer.fit_transform(sorted_csv), columns=sorted_csv.columns)

## бьем лицо по новой. Оставим выше код как гайд по работе с нулами в процессе

In [17]:
X = sorted_csv.drop(columns=["IsBadBuy"])
y = sorted_csv["IsBadBuy"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.66, random_state=21)
X_valid, X_test, y_valid, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=21)

In [18]:
from sklearn.linear_model import LogisticRegression

## предупреждение? спасибо папаша. Гайз расширяемся

In [19]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

pipeline = make_pipeline(StandardScaler(), LogisticRegression(max_iter=2000, class_weight="balanced"))
pipeline.fit(X_train, y_train)

In [20]:
from sklearn.model_selection import cross_val_score
cross_val_score(pipeline, X_train, y_train, cv=5)

array([0.66595472, 0.65733818, 0.65520188, 0.64922025, 0.65819269])

In [21]:
cross_val_score(pipeline, X_train, y_train, cv=5, scoring="f1")

array([0.30673759, 0.28647687, 0.26502732, 0.27982456, 0.29824561])

In [22]:
def gini_score(auc):
    return 2 * auc - 1
from sklearn.metrics import roc_auc_score

In [23]:
y_pred = pipeline.predict_proba(X_valid)[:, 1]
auc = roc_auc_score(y_valid, y_pred)
gini = gini_score(auc)
print(f"AUC: {auc:.4f}, Gini: {gini:.4f}")

AUC: 0.6682, Gini: 0.3363


In [24]:
from sklearn.naive_bayes import GaussianNB

In [25]:
pipeline_nb = make_pipeline(StandardScaler(), GaussianNB())
pipeline_nb.fit(X_train, y_train)

In [26]:
y_pred_nb = pipeline_nb.predict_proba(X_valid)[:, 1]
auc_nb = roc_auc_score(y_valid, y_pred_nb)
gini_nb = gini_score(auc_nb)
print(f"AUC: {auc_nb:.4f}, Gini: {gini_nb:.4f}")

AUC: 0.5073, Gini: 0.0145


## не порядок. Применяем метод уменьшения размерности и оставляет +-30 важных векторов признаков

In [27]:
from sklearn.decomposition import PCA

pca = PCA(n_components=30)
X_pca = pca.fit_transform(X_train)
X_valid_pca = pca.transform(X_valid)

pipeline_pca = make_pipeline(StandardScaler(), GaussianNB())
pipeline_pca.fit(X_pca, y_train)
y_pred_pca = pipeline_pca.predict_proba(X_valid_pca)[:, 1]
auc_pca = roc_auc_score(y_valid, y_pred_pca)
gini_pca = gini_score(auc_pca)
print(f"AUC: {auc_pca:.4f}, Gini: {gini_pca:.4f}")

AUC: 0.6826, Gini: 0.3652


In [28]:
from sklearn.neighbors import KNeighborsClassifier

In [29]:
pipeline_knn = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=5))
pipeline_knn.fit(X_train, y_train)
y_pred_knn = pipeline_knn.predict_proba(X_valid)[:, 1]
auc_knn = roc_auc_score(y_valid, y_pred_knn)
gini_knn = gini_score(auc_knn)
print(f"AUC: {auc_knn:.4f}, Gini: {gini_knn:.4f}")

AUC: 0.5839, Gini: 0.1678


## MineLogisticRegression

In [30]:
class LogisticRegressionScratch:
    def __init__(self, lr=0.01, n_iter=1000, class_weight=None):
        self.lr = lr
        self.n_iter = n_iter
        self.class_weight = class_weight
        self.w = None
        self.b = 0
        self.weights_ = None

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def fit(self, X, y):
        X = np.asarray(X)
        y = np.asarray(y)

        n_samples, n_features = X.shape
        self.w = np.zeros(n_features)
        self.b = 0

        if self.class_weight == "balanced":
            from collections import Counter
            counter = Counter(y)
            total = len(y)
            weight_0 = total / (2 * counter[0])
            weight_1 = total / (2 * counter[1])
            self.weights_ = np.where(y == 0, weight_0, weight_1)
        else:
            self.weights_ = np.ones(n_samples)

        for _ in range(self.n_iter):
            for i in range(n_samples):
                xi = X[i]
                yi = y[i]
                weight = self.weights_[i]

                linear_output = np.dot(xi, self.w) + self.b
                y_pred = self.sigmoid(linear_output)

                error = y_pred - yi
                dw = weight * error * xi
                db = weight * error

                self.w -= self.lr * dw
                self.b -= self.lr * db

    def predict_proba(self, X):
        X = np.asarray(X)
        proba = self.sigmoid(np.dot(X, self.w) + self.b)
        return np.vstack([1 - proba, proba]).T

    def predict(self, X):
        proba = self.predict_proba(X)[:, 1]
        return (proba >= 0.5).astype(int)


In [31]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)

model = LogisticRegressionScratch(class_weight="Balanced")
model.fit(X_train_scaled, y_train)

y_pred = model.predict_proba(X_valid_scaled)[:, 1]

In [32]:
auc = roc_auc_score(y_valid, y_pred)
gini = gini_score(auc)
print(f"AUC: {auc:.4f}, Gini: {gini:.4f}")

AUC: 0.5543, Gini: 0.1087


## MineKNN

In [33]:
from collections import Counter

class KNNClassifierScratch:
    def __init__(self, k=3):
        self.k = k

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        return np.array([self._predict(x) for x in X])

    def _predict(self, x):
        distances = np.linalg.norm(self.X_train - x, axis=1)
        k_indices = np.argsort(distances)[:self.k]
        k_nearest_labels = self.y_train[k_indices]
        most_common = Counter(k_nearest_labels).most_common(1)
        return most_common[0][0]

    def predict_proba(self, X):
        probs = []
        for x in X:
            distances = np.linalg.norm(self.X_train - x, axis=1)
            k_indices = np.argsort(distances)[:self.k]
            k_nearest_labels = self.y_train.iloc[k_indices]
            count = Counter(k_nearest_labels)
            prob_1 = count[1] / self.k
            probs.append([1 - prob_1, prob_1])
        return np.array(probs)


In [34]:
model = KNNClassifierScratch()
model.fit(X_train_scaled, y_train)

y_pred = model.predict_proba(X_valid_scaled)[:, 1]

In [35]:
auc = roc_auc_score(y_valid, y_pred)
gini = gini_score(auc)
print(f"AUC: {auc:.4f}, Gini: {gini:.4f}")

AUC: 0.5690, Gini: 0.1380


## MineGaussianNB

In [36]:
from scipy.stats import norm

class GaussianNBScratch:
    def fit(self, X, y):
        self.classes = np.unique(y)
        self.mean = {}
        self.var = {}
        self.priors = {}

        for cls in self.classes:
            X_c = X[y == cls]
            self.mean[cls] = X_c.mean(axis=0)
            self.var[cls] = X_c.var(axis=0) + 1e-9  # avoid /0
            self.priors[cls] = X_c.shape[0] / X.shape[0]

    def predict(self, X):
        return np.array([self._predict(x) for x in X])

    def _predict(self, x):
        posteriors = []

        for cls in self.classes:
            prior = np.log(self.priors[cls])
            class_conditional = np.sum(norm.logpdf(x, self.mean[cls], np.sqrt(self.var[cls])))
            posterior = prior + class_conditional
            posteriors.append(posterior)

        return self.classes[np.argmax(posteriors)]

    def predict_proba(self, X):
        probs = []
        for x in X:
            posteriors = []
            for cls in self.classes:
                prior = np.log(self.priors[cls])
                class_conditional = np.sum(norm.logpdf(x, self.mean[cls], np.sqrt(self.var[cls])))
                posteriors.append(prior + class_conditional)

            posteriors = np.exp(posteriors - np.max(posteriors))
            probs.append(posteriors / np.sum(posteriors))
        return np.array(probs)

In [37]:
model = GaussianNBScratch()
model.fit(X_train_scaled, y_train)

y_pred = model.predict_proba(X_valid_scaled)[:, 1]

In [38]:
auc = roc_auc_score(y_valid, y_pred)
gini = gini_score(auc)
print(f"AUC: {auc:.4f}, Gini: {gini:.4f}")

AUC: 0.5069, Gini: 0.0138


## Create non-liner features

In [39]:
X_train

Unnamed: 0,VehYear,VehicleAge,WheelTypeID,VehOdo,MMRAcquisitionAuctionAveragePrice,MMRAcquisitionAuctionCleanPrice,MMRAcquisitionRetailAveragePrice,MMRAcquisitonRetailCleanPrice,MMRCurrentAuctionAveragePrice,MMRCurrentAuctionCleanPrice,...,VNST_OK,VNST_OR,VNST_PA,VNST_SC,VNST_TN,VNST_TX,VNST_UT,VNST_VA,VNST_WA,VNST_WV
64735,2006.0,4.0,2.0,51952.0,4262.0,5476.0,7708.0,9309.0,4991.0,6428.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
53018,2004.0,6.0,2.0,23444.0,4325.0,6161.0,7571.0,9708.0,3824.0,5517.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
64999,2004.0,6.0,2.0,96283.0,5040.0,6116.0,9027.0,10130.0,5405.0,6464.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40474,2007.0,3.0,2.0,71831.0,8453.0,9721.0,11386.0,12969.0,8061.0,9303.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49550,2005.0,5.0,2.0,86125.0,4456.0,6207.0,7626.0,9796.0,4259.0,6073.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64241,2008.0,2.0,1.0,64595.0,7585.0,8439.0,11584.0,12801.0,8641.0,9569.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
45666,2007.0,3.0,2.0,70384.0,6151.0,7231.0,9522.0,10245.0,5831.0,7118.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
42104,2007.0,3.0,1.0,55854.0,0.0,0.0,0.0,0.0,9039.0,10770.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
63856,2005.0,5.0,1.0,92371.0,5131.0,6248.0,8490.0,9552.0,5526.0,7128.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
for X in [X_train, X_valid, X_test]:
    X['age_to_price'] = X["MMRAcquisitionAuctionAveragePrice"] / (X['VehicleAge'] + 1e-9)
    # normalize price to avoid below 0 somehow and above 1000
    X['age_to_price'] = np.clip(X['age_to_price'], 0, 1000)

## Gini has grow up from 0.3652 to 0.3657

In [41]:
pca = PCA(n_components=30)
X_pca = pca.fit_transform(X_train)
X_valid_pca = pca.transform(X_valid)

pipeline_pca = make_pipeline(StandardScaler(), GaussianNB())
pipeline_pca.fit(X_pca, y_train)
y_pred_pca = pipeline_pca.predict_proba(X_valid_pca)[:, 1]
auc_pca = roc_auc_score(y_valid, y_pred_pca)
gini_pca = gini_score(auc_pca)
print(f"AUC: {auc_pca:.4f}, Gini: {gini_pca:.4f}")

AUC: 0.6829, Gini: 0.3657


## From 0.1672 to 0.1696

In [42]:
pipeline_knn = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=5))
pipeline_knn.fit(X_train, y_train)
y_pred_knn = pipeline_knn.predict_proba(X_valid)[:, 1]
auc_knn = roc_auc_score(y_valid, y_pred_knn)
gini_knn = gini_score(auc_knn)
print(f"AUC: {auc_knn:.4f}, Gini: {gini_knn:.4f}")

AUC: 0.5848, Gini: 0.1696


In [43]:
pipeline = make_pipeline(StandardScaler(), LogisticRegression(max_iter=2000, class_weight="balanced"))
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict_proba(X_valid)[:, 1]
auc = roc_auc_score(y_valid, y_pred)
gini = gini_score(auc)
print(f"AUC: {auc:.4f}, Gini: {gini:.4f}")

AUC: 0.6681, Gini: 0.3361


In [44]:
X_train.sort_values(by=["MMRAcquisitionAuctionAveragePrice"])

Unnamed: 0,VehYear,VehicleAge,WheelTypeID,VehOdo,MMRAcquisitionAuctionAveragePrice,MMRAcquisitionAuctionCleanPrice,MMRAcquisitionRetailAveragePrice,MMRAcquisitonRetailCleanPrice,MMRCurrentAuctionAveragePrice,MMRCurrentAuctionCleanPrice,...,VNST_OR,VNST_PA,VNST_SC,VNST_TN,VNST_TX,VNST_UT,VNST_VA,VNST_WA,VNST_WV,age_to_price
22736,2005.0,4.0,1.0,68570.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3680,2005.0,4.0,2.0,61248.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17888,2007.0,2.0,2.0,90151.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
57388,2008.0,2.0,2.0,46058.0,0.0,0.0,0.0,0.0,6114.0,7266.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18684,2006.0,3.0,2.0,55839.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32808,2008.0,1.0,1.0,13445.0,19546.0,20809.0,23361.0,24870.0,20817.0,21601.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1000.0
44238,2006.0,4.0,1.0,42686.0,20156.0,21558.0,24797.0,27371.0,19241.0,21202.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1000.0
35554,2006.0,4.0,1.0,47227.0,21599.0,23422.0,26351.0,28819.0,20129.0,22590.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1000.0
48954,2007.0,3.0,1.0,61001.0,23031.0,25681.0,27295.0,29981.0,21940.0,24293.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1000.0


In [45]:
model = pipeline.named_steps['logisticregression']
coefs = model.coef_[0]
feat_names = X_train.columns

for name, coef in sorted(zip(feat_names, coefs), key=lambda x: abs(x[1]), reverse=True):
    print(f"{name}: {coef:.4f}")

VehicleAge: 0.7821
MMRCurrentAuctionAveragePrice: 0.6268
Size_MEDIUM SUV: -0.6126
Make_KIA: -0.5962
VNZIP1: -0.5696
TopThreeAmericanName_OTHER: 0.5540
SubModel_4D CUV 3.0L XLT: -0.5054
VehYear: 0.4910
Make_SUZUKI: -0.4828
Make_HYUNDAI: -0.4821
WheelType_Covers: -0.4663
Make_MITSUBISHI: -0.4646
Trim_SLE: -0.4530
Nationality_OTHER ASIAN: 0.4517
SubModel_4D SUV CX: 0.4177
Size_SMALL SUV: -0.4150
VNST_TX: 0.4114
Size_MEDIUM: -0.4111
Make_NISSAN: -0.4090
Size_SPECIALTY: -0.3968
Size_SMALL TRUCK: -0.3949
Trim_CX: -0.3833
VNST_CA: 0.3753
Model_ESCAPE 2WD V6: 0.3740
VehBCost: -0.3735
Make_TOYOTA: -0.3655
Make_MAZDA: -0.3507
MMRCurrentRetailCleanPrice: -0.3494
Model_DURANGO 2WD V8: 0.3420
SubModel_4D SUV: 0.3405
Trim_CE: -0.3339
SubModel_4D SUV 4.7L SLT: -0.3274
VNST_WA: -0.3189
Model_ESCAPE 2WD V6 3.0L V: 0.3186
Model_DURANGO 4WD V8: 0.3184
VNST_AZ: 0.3105
Model_UPLANDER FWD V6: 0.3058
SubModel_4D SUV 4.7L: -0.3011
SubModel_4D UTILITY 4.2L SLE: 0.2981
Model_DURANGO 2WD V8 4.7L: 0.2973
National

In [46]:
hand_saved_features = [name for name, coef in zip(feat_names, coefs) if abs(coef) > 0.3]

X_train, X_valid, X_test = [X[hand_saved_features] for X in [X_train, X_valid, X_test]]

In [47]:
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict_proba(X_valid)[:, 1]
auc = roc_auc_score(y_valid, y_pred)
gini = gini_score(auc)
print(f"AUC: {auc:.4f}, Gini: {gini:.4f}")

AUC: 0.6742, Gini: 0.3484


#### With L1 regulation. Hand removal wins(by the way i changed abs(coef) about 5 time in range of 0.1 to 0.4(0.4 gini 0.27))

In [48]:
from sklearn.feature_selection import SelectFromModel

pipeline = make_pipeline(StandardScaler(),SelectFromModel(LogisticRegression(penalty='l1', solver='liblinear', C=1.0)), LogisticRegression(penalty=None))
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict_proba(X_valid)[:, 1]
auc = roc_auc_score(y_valid, y_pred)
gini = gini_score(auc)
print(f"AUC: {auc:.4f}, Gini: {gini:.4f}")

AUC: 0.6737, Gini: 0.3475


In [49]:
from sklearn.model_selection import GridSearchCV

In [50]:
param_grid = {
    'logisticregression__C': [0.1, 1, 10],
    'logisticregression__penalty': ['l1', 'l2'],
    'logisticregression__solver': ['liblinear', 'saga'],
}

pipeline = make_pipeline(StandardScaler(), LogisticRegression(max_iter=5000, class_weight="balanced"))

grid = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring='roc_auc')
grid.fit(X_train, y_train)

print(f"Best parameters: {grid.best_params_}")
print(f"Best AUC: {grid.best_score_:.4f}")



Best parameters: {'logisticregression__C': 0.1, 'logisticregression__penalty': 'l1', 'logisticregression__solver': 'saga'}
Best AUC: 0.6815


#### между трейн и тест джини примерно равный скор => модель устойчивая

In [51]:
y_train_pred = grid.best_estimator_.predict_proba(X_train)[:, 1]
y_valid_pred = grid.best_estimator_.predict_proba(X_valid)[:, 1]
y_test_pred = grid.best_estimator_.predict_proba(X_test)[:, 1]

gini_train = gini_score(roc_auc_score(y_train, y_train_pred))
gini_valid = gini_score(roc_auc_score(y_valid, y_valid_pred))
gini_test = gini_score(roc_auc_score(y_test, y_test_pred))
print(f"Train Gini: {gini_train:.4f}, Valid Gini: {gini_valid:.4f}, Test Gini: {gini_test:.4f}")

Train Gini: 0.3738, Valid Gini: 0.3481, Test Gini: 0.3619


In [52]:
def precision_score_loqu(y_true, y_pred):
    tp = ((y_true == 1) & (y_pred == 1)).sum()
    fp = ((y_true == 0) & (y_pred == 1)).sum()
    return tp / (tp + fp) if tp + fp > 0 else 0

def recall_score_loqu(y_true, y_pred):
    tp = ((y_true == 1) & (y_pred == 1)).sum()
    fn = ((y_true == 1) & (y_pred == 0)).sum()
    return tp / (tp + fn) if tp + fn > 0 else 0

def f1_score_loqu(y_true, y_pred):
    prec = precision_score_loqu(y_true, y_pred)
    rec = recall_score_loqu(y_true, y_pred)
    return 2 * (prec * rec) / (prec + rec) if prec + rec > 0 else 0

In [53]:
from sklearn.metrics import precision_score, recall_score, f1_score

In [54]:
y_test_pred = (y_test_pred >= 0.3).astype(int)

precision_score_loqu(y_test, y_test_pred) == precision_score(y_test, y_test_pred)

True

In [55]:
recall_score_loqu(y_test, y_test_pred) == recall_score(y_test, y_test_pred)

True

In [56]:
f1_score_loqu(y_test, y_test_pred) == f1_score(y_test, y_test_pred)

False

## Better to use recall as it shows how much we find lemon cars. We found 95 lemons for each 100 lemons. For sure we can recognize good cars as lemons, but in that case the generalistic task is to find as much lemon as we can

In [57]:
recall_score_loqu(y_test, y_test_pred)

0.9460906818993217

In [58]:
f1_score(y_test, y_test_pred)

0.24491682070240295

In [59]:
f1_score_loqu(y_test, y_test_pred)

0.24491682070240298