In [1]:
import os
from urllib.parse import quote

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine

def create_connection():
    """Create a SQLAlchemy engine for the destination PostgreSQL database.

    ``load_dotenv()`` is called first, then the connection settings are read
    from the ``DB_DESTINATION_HOST``, ``DB_DESTINATION_PORT``,
    ``DB_DESTINATION_NAME``, ``DB_DESTINATION_USER`` and
    ``DB_DESTINATION_PASSWORD`` environment variables.

    Returns:
        The object returned by ``create_engine`` for the assembled
        ``postgresql://`` URL (an engine, despite the local name ``conn``).

    Raises:
        ValueError: if any of the required environment variables is unset.
    """
    load_dotenv()

    env_names = {
        'host': 'DB_DESTINATION_HOST',
        'port': 'DB_DESTINATION_PORT',
        'db': 'DB_DESTINATION_NAME',
        'username': 'DB_DESTINATION_USER',
        'password': 'DB_DESTINATION_PASSWORD',
    }
    settings = {key: os.environ.get(name) for key, name in env_names.items()}

    # Fail early with a readable message: an unset variable would otherwise be
    # formatted into the URL as the literal text "None".
    missing = [env_names[key] for key, value in settings.items() if value is None]
    if missing:
        raise ValueError(
            'Missing required environment variables: ' + ', '.join(missing)
        )

    # Percent-encode the credentials so that characters such as "@", "/" or ":"
    # inside them cannot be mistaken for URL delimiters.
    username = quote(settings['username'], safe='')
    password = quote(settings['password'], safe='')
    host = settings['host']
    port = settings['port']
    db = settings['db']

    conn = create_engine(f'postgresql://{username}:{password}@{host}:{port}/{db}')
    return conn

# Set up the database connection.
conn = create_connection()

# Load every flat together with the attributes of its building
# (inner join on flats.building_id = buildings.id) into one DataFrame.
data = pd.read_sql(
    sql='''
                   select f.id,f.floor,f.is_apartment,f.kitchen_area,f.living_area,f.rooms,
                   f.studio,f.total_area,f.price,f.building_id,
                   b.build_year,b.building_type_int,b.latitude,b.longitude,
                   b.ceiling_height,b.flats_count,b.floors_total,b.has_elevator 
                   from flats as f
                   join buildings as b on f.building_id=b.id
                   ''',
    con=conn,
)
data

Unnamed: 0,id,floor,is_apartment,kitchen_area,living_area,rooms,studio,total_area,price,building_id,build_year,building_type_int,latitude,longitude,ceiling_height,flats_count,floors_total,has_elevator
0,0,9,False,9.90,19.900000,1,False,35.099998,9500000,6220,1965,6,55.717113,37.781120,2.64,84,12,True
1,1,7,False,0.00,16.600000,1,False,43.000000,13500000,18012,2001,2,55.794849,37.608013,3.00,97,10,True
2,2,9,False,9.00,32.000000,2,False,56.000000,13500000,17821,2000,4,55.740040,37.761742,2.70,80,10,True
3,3,1,False,10.10,43.099998,3,False,76.000000,20000000,18579,2002,4,55.672016,37.570877,2.64,771,17,True
4,4,3,False,3.00,14.000000,1,False,24.000000,5200000,9293,1971,1,55.808807,37.707306,2.60,208,9,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141357,141357,16,False,11.00,18.000000,1,False,42.000000,10500000,22455,2013,4,55.626579,37.313503,2.64,672,25,True
141358,141358,5,False,5.28,28.330000,2,False,41.110001,7400000,3162,1960,1,55.727470,37.768677,2.48,80,5,False
141359,141359,7,False,5.30,20.000000,1,False,31.500000,9700000,6513,1966,4,55.704315,37.506584,2.64,72,9,True
141360,141360,15,False,13.80,33.700001,2,False,65.300003,11750000,23952,2017,4,55.699863,37.939564,2.70,480,25,True


In [8]:
# keep=False flags every row of a repeated id - the first occurrence as well
# as its later copies - so the total below counts all affected rows.
is_duplicated_id = data.duplicated(subset=['id'], keep=False)
print(is_duplicated_id.sum())

0


In [9]:
# Every column except `id`. Index.difference returns the labels sorted by
# default, so feature_cols is not in the frame's original column order.
feature_cols = list(data.columns.difference(['id']))

# keep=False marks all rows of each duplicate group, not only the repeats.
is_duplicated_features = data.duplicated(subset=feature_cols, keep=False)

# Number of rows whose feature values all match at least one other row.
print(int(is_duplicated_features.sum()))

17425


In [10]:
# Sort by the feature columns so rows with identical feature values are adjacent.
print(data.loc[is_duplicated_features].sort_values(by=feature_cols))

            id  floor  is_apartment  kitchen_area  living_area  rooms  studio  \
52640    52640      3         False           0.0          0.0      4   False   
54009    54009      3         False           0.0          0.0      4   False   
90882    90882      4         False          11.0         48.0      2   False   
122183  122183      4         False          11.0         48.0      2   False   
63568    63568      5         False          10.0         74.0      3   False   
...        ...    ...           ...           ...          ...    ...     ...   
100793  100793      2         False          18.0          0.0      3   False   
100937  100937      2         False          18.0          0.0      3   False   
132927  132927      2         False          18.0          0.0      3   False   
32684    32684     14         False           0.0          0.0      1   False   
128245  128245     14         False           0.0          0.0      1   False   

        total_area     pric