In [None]:
# %conda install -c pytorch faiss-cpu
%conda install -c conda-forge faiss-gpu

In [None]:
%conda install -c "conda-forge/label/broken" faiss-gpu

In [1]:
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import plotly.express as px
import faiss
import sweetviz as sv
import numpy as np
from sklearn.preprocessing import RobustScaler



In [2]:
train = pd.read_csv('./Data/train.csv', index_col=0)
valid = pd.read_csv('./Data/validation.csv', index_col=0)
valid_awr = pd.read_csv('./Data/validation_answer.csv', index_col=0)
base = pd.read_csv('./Data/base.csv', index_col=0)

In [None]:
train.describe().T\
    .style.bar(subset=['mean'], color=px.colors.qualitative.G10[2])\
    .background_gradient(subset=['std'], cmap='Blues')\
    .background_gradient(subset=['50%'], cmap='BuGn')

In [None]:
base.describe().T\
    .style.bar(subset=['mean'], color=px.colors.qualitative.G10[2])\
    .background_gradient(subset=['std'], cmap='Blues')\
    .background_gradient(subset=['50%'], cmap='BuGn')

In [None]:
# train_report = sv.analyze(train)

In [None]:
# train_report.show_html('Train_report.html')

In [None]:
d = base.shape[1]
d

In [3]:
ngpu = 1

## L2

In [None]:
index = faiss.IndexFlatL2(d)
index.add(base)

In [None]:
resources = [faiss.StandardGpuResources() for i in range(ngpu)]
index_gpu = faiss.index_cpu_to_gpu_multiple_py(resources, index)

In [13]:
base_index = {k: v for k, v in enumerate(base.index.to_list())}

In [14]:
targets = train["Target"]

In [None]:
%%time
D, I = index_gpu.search(train.drop('Target', axis=1), 5)
acc = 0
for target, el in zip(targets.values.tolist(), I.tolist()):
    acc += int(target in [base_index[r] for r in el])

print(100 * acc / len(I))

## IVF

In [None]:
nlist = 144

quant = faiss.IndexFlatIP(d)
index = faiss.IndexIVFFlat(quant, d, nlist)
index.train(base)
index.add(base)
index.nprobe = 8
resources = [faiss.StandardGpuResources() for i in range(ngpu)]
index_gpu = faiss.index_cpu_to_gpu_multiple_py(resources, index)

In [None]:
%%time
D, I = index_gpu.search(train.drop('Target', axis=1), 5)
acc = 0
for target, el in zip(targets.values.tolist(), I.tolist()):
    acc += int(target in [base_index[r] for r in el])

print(100 * acc / len(I))

In [None]:
scaler = RobustScaler()
base2 = scaler.fit_transform(base)
train2 = scaler.transform(train.drop('Target', axis=1))

In [None]:
index = faiss.IndexFlatL2(d)
index.add(base2)

In [None]:
resources = [faiss.StandardGpuResources() for i in range(ngpu)]
index_gpu = faiss.index_cpu_to_gpu_multiple_py(resources, index)

In [None]:
%%time
D, I = index_gpu.search(train2, 5)
acc = 0
for target, el in zip(targets.values.tolist(), I.tolist()):
    acc += int(target in [base_index[r] for r in el])

print(100 * acc / len(I))

In [None]:
nlist = 144

quant = faiss.IndexFlatIP(d)
index = faiss.IndexIVFFlat(quant, d, nlist)
index.train(base2)
index.add(base2)
index.nprobe = 8
resources = [faiss.StandardGpuResources() for i in range(ngpu)]
index_gpu = faiss.index_cpu_to_gpu_multiple_py(resources, index)

In [None]:
%%time
D, I = index_gpu.search(train2, 5)
acc = 0
for target, el in zip(targets.values.tolist(), I.tolist()):
    acc += int(target in [base_index[r] for r in el])

print(100 * acc / len(I))

In [None]:
plt.figure(figsize=(150, 130))
sb.heatmap(train.corr(method='spearman'), annot=True, cmap=sb.diverging_palette(220, 10, as_cmap=True));
plt.show()

In [4]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [5]:
vif_data = pd.DataFrame()
vif_data["feature"] = train.drop('Target', axis=1).columns

# вычисление VIF для каждого признака
vif_data["VIF"] = [variance_inflation_factor(train.drop('Target', axis=1).values, i)
                          for i in range(len(train.drop('Target', axis=1).columns))]
  
print(vif_data)

   feature        VIF
0        0  12.153339
1        1   3.451413
2        2   2.283375
3        3  52.603988
4        4   6.629906
..     ...        ...
67      67   1.194555
68      68   2.439916
69      69  29.866330
70      70   4.127809
71      71   2.428688

[72 rows x 2 columns]


In [6]:
vif_data.sort_values(by='VIF')

Unnamed: 0,feature,VIF
65,65,1.024089
16,16,1.025868
22,22,1.027528
13,13,1.046378
17,17,1.069599
...,...,...
56,56,277.061764
54,54,333.049631
8,8,351.987728
66,66,1260.625727


In [7]:
VIF_features_drop = vif_data.query('VIF >= 8.0')['feature']
VIF_features_drop

0      0
3      3
8      8
10    10
14    14
18    18
19    19
20    20
21    21
25    25
27    27
29    29
31    31
32    32
35    35
37    37
38    38
44    44
52    52
54    54
55    55
56    56
61    61
62    62
64    64
66    66
69    69
Name: feature, dtype: object

In [8]:
base3 = base.drop(VIF_features_drop.values, axis=1)
train3 = train.drop(VIF_features_drop.values, axis=1)

In [None]:
plt.figure(figsize=(150, 130))
sb.heatmap(train3.corr(method='spearman'), annot=True, cmap=sb.diverging_palette(220, 10, as_cmap=True));
plt.show()

In [9]:
d_base3 = base3.shape[1]
d_base3

45

In [10]:
base3_index = {k: v for k, v in enumerate(base3.index.to_list())}

In [12]:
scaler = RobustScaler()
base3 = scaler.fit_transform(base3)
train3 = scaler.transform(train3.drop('Target', axis=1))

In [15]:
index = faiss.IndexFlatL2(d_base3)
index.add(base3)

In [16]:
resources = [faiss.StandardGpuResources() for i in range(ngpu)]
index_gpu = faiss.index_cpu_to_gpu_multiple_py(resources, index)

In [None]:
%%time
D, I = index_gpu.search(train3, 5)
acc = 0
for target, el in zip(targets.values.tolist(), I.tolist()):
    acc += int(target in [base3_index[r] for r in el])

print(100 * acc / len(I))

In [17]:
nlist = 144

quant = faiss.IndexFlatIP(d_base3)
index = faiss.IndexIVFFlat(quant, d_base3, nlist)
index.train(base3)
index.add(base3)
index.nprobe = 8
resources = [faiss.StandardGpuResources() for i in range(ngpu)]
index_gpu = faiss.index_cpu_to_gpu_multiple_py(resources, index)

In [18]:
%%time
D, I = index_gpu.search(train3, 5)
acc = 0
for target, el in zip(targets.values.tolist(), I.tolist()):
    acc += int(target in [base3_index[r] for r in el])

print(100 * acc / len(I))

65.434
CPU times: total: 5.27 s
Wall time: 12.5 s


In [19]:
# set HNSW index parameters
M = 64  # number of connections each vertex will have
ef_search = 32  # depth of layers explored during search
ef_construction = 64  # depth of layers explored during index construction

# initialize index (d == 128)
index = faiss.IndexHNSWFlat(d_base3, M)
# set efConstruction and efSearch parameters
index.hnsw.efConstruction = ef_construction
index.hnsw.efSearch = ef_search
# add data to index
index.add(base3)

In [20]:
resources = [faiss.StandardGpuResources() for i in range(ngpu)]
index_gpu = faiss.index_cpu_to_gpu_multiple_py(resources, index)

In [21]:
%%time
D, I = index_gpu.search(train3, 5)
acc = 0
for target, el in zip(targets.values.tolist(), I.tolist()):
    acc += int(target in [base3_index[r] for r in el])

print(100 * acc / len(I))

64.616
CPU times: total: 56 s
Wall time: 5.16 s
