## Запуск семантического парсера в интерактивной среде

In [114]:
# Instantiate the parser with 'config.json' and run it on a short sample listing.
from semantic_parser import SemanticParser

parser = SemanticParser('config.json')
sample_listing = 'предлагаю нежилое помещение на 1 этаже, общий вход со двора'
print(parser.parse(sample_listing))

{'entrance_placement': 'yard', 'entrance_type': 'joint', 'purpose': 'non_residential', 'floor': 1}


In [17]:
# Listing text that additionally mentions the area and the number of bathrooms.
listing_with_area = 'предлагаю нежилое помещение площадью 34 кв метра 2 сан узла  на 1 этаже, общий вход со двора'
print(parser.parse(listing_with_area))

{'entrance_placement': 'yard', 'entrance_type': 'joint', 'purpose': 'non_residential', 'floor': 1}


In [18]:
# Listing text that names the 4th floor instead of the 1st.
listing_fourth_floor = 'предлагаю нежилое помещение на 4 этаже, общий вход со двора'
print(parser.parse(listing_fourth_floor))

{'entrance_type': 'joint', 'purpose': 'non_residential'}


In [21]:
# Long sample listing text; it is passed to parser.parse() in the following cell.
# NOTE(review): the literal appears to contain two different listings with a run of
# label/percentage tokens ("first 78.77% not_included 89.87% ...") pasted between
# them, and it ends on an unclosed parenthesis — confirm this input is intentional.
text = "7225 Сдается помещение свободного назначения с отдельным входом. Пешая доступность от станции метро. Идеально под общепит полного цикла, бар, кальянную. Большие витринные окна. Рекламный потенциал. Электрическая мощность 50 +кВТ. Мокрые точки. Вытяжка. Коммунальные и эксплуатационные платежи по счетчикам. Парковка стихийная. Каникулы по условиям договора. Аренда прямая. Предоставление юр. адреса обсуждается. БЕЗ КОМИССИИ. first 78.77% not_included 89.87% display_window 99.74% unknown 99.79% separate 99.99% non_residential Павильон  на Мельнице в аренду.Сдается в аренду павильон на ТОГК Славянский мир ( Строительная ярмарка Мельница).. Павильон расположен  на второй линии  от центрального въезда со МКАД на ТОГК Славянский мир (строительный рынок  Мельница"

In [23]:
# Run the parser on the long sample text and print the resulting attribute dict.
parsed = parser.parse(text)
print(parsed)

{'entrance_placement': 'street', 'entrance_type': 'separate', 'display_window': 'display_window', 'purpose': 'non_residential', 'communal_included': 'not_included'}


In [2]:
# Combine PCA and TruncatedSVD with make_union and display the resulting object.
from sklearn.pipeline import make_pipeline, make_union
from sklearn.decomposition import PCA, TruncatedSVD

decomposition_union = make_union(PCA(), TruncatedSVD())
decomposition_union

FeatureUnion(n_jobs=None,
             transformer_list=[('pca',
                                PCA(copy=True, iterated_power='auto',
                                    n_components=None, random_state=None,
                                    svd_solver='auto', tol=0.0, whiten=False)),
                               ('truncatedsvd',
                                TruncatedSVD(algorithm='randomized',
                                             n_components=2, n_iter=5,
                                             random_state=None, tol=0.0))],
             transformer_weights=None, verbose=False)

## Работа с базой данных

In [1]:
import psycopg2
import codecs

In [23]:
# Open the PostgreSQL connection and the cursor used by the cells below.
#
# The password is deliberately NOT stored in the notebook: it is read from the
# SAS_DB_PASSWORD environment variable, or asked for interactively when that
# variable is unset or empty. The remaining settings can be overridden through
# SAS_DB_NAME / SAS_DB_USER / SAS_DB_HOST / SAS_DB_PORT and otherwise keep the
# values this notebook has always used.
# NOTE(review): the password that used to be hardcoded here should be rotated,
# because it remains in the notebook's version history.
import os
from getpass import getpass

connection = psycopg2.connect(
    dbname=os.environ.get('SAS_DB_NAME', 'sas_db'),
    user=os.environ.get('SAS_DB_USER', 'sas'),
    password=os.environ.get('SAS_DB_PASSWORD') or getpass('Password for sas_db: '),
    host=os.environ.get('SAS_DB_HOST', '185.98.83.27'),
    port=os.environ.get('SAS_DB_PORT', '5100'),
)
cursor = connection.cursor()

In [44]:
# Ask the server for its version; the row is kept in `record` but not printed.
cursor.execute("SELECT version();")
record = cursor.fetchone()

# Query the size of the 'sas_db' database; the fetched row is discarded.
cursor.execute("SELECT pg_database_size('sas_db')")
cursor.fetchone()

# Print the names of all tables outside the two system schemas.
tables_sql = (
    "SELECT table_name FROM information_schema.tables "
    "WHERE table_schema NOT IN ('information_schema','pg_catalog')"
)
cursor.execute(tables_sql)
print(cursor.fetchall())

# Print what pg_relation_size reports for the buildings_lease table.
cursor.execute("SELECT pg_relation_size('buildings_lease')")
print(cursor.fetchmany(10))

print("Done!")

[('buildings',), ('buildings_lease',), ('businesses',), ('placements5',), ('businesses_lease',), ('garages',), ('garages_lease',), ('industrials',), ('industrials_lease',), ('livings',), ('offices',), ('offices_lease',), ('livings_mo',), ('commercial_requests_histrory',), ('placements_clean',), ('placements_depr',), ('placements_depr_lease',), ('shoppings_lease',), ('zkh',), ('shoppings',), ('warehouses_lease',), ('warehouses',), ('residential_real_estate_history',), ('alembic_version',)]
[(2170880,)]
Done!


In [41]:
# Dump up to `size` rows of buildings_lease into the tab-separated file f1.tsv:
# one header line with the column names, then one line per row.
size = 25239552     # upper bound on the number of exported rows (SQL LIMIT)
batch_size = 1024   # rows taken from the cursor per fetchmany() call

# The limit is passed as a bound parameter rather than formatted into the SQL text.
cursor.execute("SELECT * FROM buildings_lease LIMIT %s", (size,))
col_names = [desc[0] for desc in cursor.description]

with codecs.open("f1.tsv", mode='w', encoding='utf-8') as f:
    f.write("\t".join(col_names))
    f.write("\n")
    f.flush()
    # Pull the result in batch_size chunks until the cursor is exhausted, instead of
    # building one Python list with every row and slicing it afterwards.
    # NOTE(review): with a default (client-side) psycopg2 cursor the raw result is
    # presumably still buffered by the driver; a named cursor would be required for
    # true server-side streaming — confirm if memory is a concern.
    while True:
        batch = cursor.fetchmany(batch_size)
        if not batch:
            break
        for line in batch:
            # Values are written via str(): None becomes "None", and tabs or newlines
            # inside a value are not escaped.
            f.write("\t".join(str(i) for i in line))
            f.write("\n")
        f.flush()

In [42]:
# Run a small query and read the column names off cursor.description.
size = 3
query = "SELECT * FROM buildings_lease LIMIT " + str(size)
cursor.execute(query)
col_names = [column[0] for column in cursor.description]
print(col_names)

['id', 'source', 'externalid', 'site', 'url', 'screenpath', 'adtype', 'category', 'created_at', 'updated_at', 'parsed_at', 'description', 'agentphone', 'agentcategory', 'agentcompany', 'agentname', 'fulladdress', 'county', 'fedokrug', 'region', 'district', 'localityname', 'subdistrict', 'sublocalityname', 'street', 'house', 'building', 'postalcode', 'apartment', 'latitude', 'longitude', 'dirtyaddress', 'precision', 'yandexfile', 'metroname', 'metrodistance', 'metrotransport', 'metrotime', 'propertytype', 'conditiontype', 'layout', 'totalarea', 'hasfurniture', 'entrance', 'availablefrom', 'taxnumber', 'isinhiddenbase', 'buildingname', 'buildingtype', 'houselinetype', 'buildingclass', 'developer', 'managementcompany', 'ventilationtype', 'conditioningtype', 'extinguishingsystemtype', 'statustype', 'landarea', 'landunit', 'landtype', 'liftscount', 'lifttype1', 'loadcapacity1', 'lifttype2', 'loadcapacity2', 'lifttype3', 'loadcapacity3', 'floorscount', 'ceilingheight', 'heatingtype', 'parkin

In [43]:
# Columns whose positional index in col_names should be reported.
columns_of_interest = {
    'houselinetype',
    'hasshopwindows',
    'entrance',
    'floorscount',
    'conditiontype',
    'vattype',
    'isbuildingliving',
    'description',
}

# Walk the columns in their original order and print "<name>: <index>" for each
# one that is of interest; names absent from col_names simply produce no line.
for index, name in enumerate(col_names):
    if name in columns_of_interest:
        print(name + ':', index)
        

description: 11
conditiontype: 39
entrance: 43
houselinetype: 49
floorscount: 67
vattype: 81


In [66]:
# Print the cursor's description attribute.
column_metadata = cursor.description
print(column_metadata)

None


In [7]:
# Close the cursor first, then the connection it was created from.
for resource in (cursor, connection):
    resource.close()

In [11]:
# Print the first line of f.tsv.
with codecs.open("f.tsv", mode='r', encoding='utf-8') as tsv_file:
    first_line = tsv_file.readline()
    print(first_line)

id	RoomsNum	Storey	StoreysNum	BuildingType	BuildingSeries	BuildingPeriod	Price	TotalArea	LivingSpaceArea	KitchenArea	RawAddress	Region	City	Street	House	Building	FlatNumber	SubwayStation	SubwayTime	SubwayTimeType	PhoneNumber	Url	DateOfAnnouncement	Additional	SourceId	LastSeen	Archive	Hash	Precision	Address	ScreenshotFilePath	created_at	modified_at	Studio	PriceStrict	SubwayDistance	Comment	SubwayStation2	SubwayStation3	SubwayDistance2	SubwayDistance3	Version	RegionDistrict	District	MicroDistrict	IsNewBuilding	HandOverDateRaw	RoomsAreaRaw	SaleType	IsPremium	ElevatorsNum	ServiceElevatorsNum	BalconiesNum	WCsNum	WindowView	LocalPhoneExists	InternetExists	GasSupplyExists	SecurityExists	RefuseChuteExists	RepairRaw	BuildingId	DistrictId	AgentName	AgentId	AgentPhonenums	Source	SaleTypeRaw	IsPayed	LoggiasNum	WCsJointNum	PrevId	Lat	Lon	OpenLayout	Room	Part	BuildingYear	PriceUsd	HeatSupply	WaterSupply	CeilingHeight	MultiRoom	ApartmentType	AgentType	AgentAdditionalInfo	AgentUrl	GdeEtotDomHouseId	Da

In [40]:
# 25239552 rows split into chunks of 1024.
25239552 / 1024

24648.0

In [None]:
# 128*2

## Работа с кусками кода

In [1]:
import numpy as np

In [2]:
# Two separate integer vectors holding the same values.
a = np.array([1, 2, 3])
b = a.copy()
print(a, b)

[1 2 3] [1 2 3]


In [3]:
# Element-wise product of the two vectors.
print(np.multiply(a, b))

[1 4 9]
