In [53]:
import os
import pandas as pd
import numpy as np
import base64
from PIL import Image
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import os
from collections import defaultdict
import hashlib

import utils

In [54]:
curr_dir = os.getcwd()

### For raw data

In [66]:
# Data folder path
data_path = os.path.join(curr_dir, 'clear_data')
all_data_files = os.listdir(data_path)
city_files = {}

for file_name in all_data_files:
    parts = file_name.split('_')
    city_key = parts[0]
    if city_key not in city_files:
        city_files[city_key] = {}
    if 'images' in file_name and 'pil' not in file_name:
        city_files[city_key]['images'] = os.path.join(data_path, file_name)
    elif 'places' in file_name:
        city_files[city_key]['places'] = os.path.join(data_path, file_name)

for city, files in city_files.items():
    print(f"City: {city}")
    print(f"Images: {files.get('images', 'Не найден')}")
    print(f"Places: {files.get('places', 'Не найден')}")
    print()

City: EKB
Images: c:\Users\alex\Desktop\NTO\clear_data\EKB_images.csv
Places: c:\Users\alex\Desktop\NTO\clear_data\EKB_places.csv

City: NN
Images: c:\Users\alex\Desktop\NTO\clear_data\NN_images.csv
Places: c:\Users\alex\Desktop\NTO\clear_data\NN_places.csv

City: Vladimir
Images: c:\Users\alex\Desktop\NTO\clear_data\Vladimir_images.csv
Places: c:\Users\alex\Desktop\NTO\clear_data\Vladimir_places.csv

City: Yaroslavl
Images: c:\Users\alex\Desktop\NTO\clear_data\Yaroslavl_images.csv
Places: c:\Users\alex\Desktop\NTO\clear_data\Yaroslavl_places.csv



In [69]:
def prosess_data(city_files):
    merged_data_all = None

    for city, files in city_files.items():
        places = pd.read_csv(files.get('places'), sep = ',')
        images = pd.read_csv(files.get('images'), sep = ',')

        aggregated_places = places.groupby('WikiData').agg({'Name': utils.aggregate_names,
                                                            'Kind': 'first',
                                                            'City': 'first',
                                                            'Rate': 'first',
                                                            'Lon': 'mean',
                                                            'Lat': 'mean'}).reset_index()

        aggregated_places_exploded = aggregated_places.explode('Name')
        images_exploded = images.explode('name')

        merged_data = pd.merge(aggregated_places_exploded, 
                               images_exploded, 
                               left_on='Name', right_on='name', how='right')
        
        # Вывод записей, которым не удалось найти пару
        unmatched_records = merged_data[merged_data['name'].isnull()]
        if not unmatched_records.empty:
            print(unmatched_records) # Should be empty

        merged_data_grouped = merged_data.groupby('WikiData')['Name'].agg(utils.choose_name).reset_index()
        merged_data = pd.merge(merged_data_grouped, 
                               merged_data, 
                               on='WikiData', how='left')
        merged_data.rename(columns={'Name_x': 'Name'}, inplace=True)
        merged_data.drop(columns=['Name_y', 'name'], inplace=True)

        output_dir = os.path.join(curr_dir, 'clear_data_by_name')
        os.makedirs(output_dir, exist_ok=True)
        image_paths = []

        for index, row in merged_data.iterrows():
            # with open("success.txt", "a") as out:
                # print to file 
                # print(row['Name'], file=out)
            image_data = base64.b64decode(row['image'])
            class_dir = os.path.join(output_dir, utils.replace_forbidden_chars(row['Name']))
            os.makedirs(class_dir, exist_ok=True)
            image_path = os.path.join(class_dir, f"image_{index}.jpg")
            with open(image_path, "wb") as file:
                file.write(image_data)
            image_paths.append(image_path)

        merged_data['image_path'] = image_paths
        # merged_data.drop(columns=['image'], inplace=True)

        if merged_data_all is not None:
            merged_data_all = pd.concat([merged_data_all, merged_data], ignore_index=True)
        else:
            merged_data_all = merged_data

    csv_file_path = 'clear_processed_data.csv'
    merged_data_all.to_csv(csv_file_path, index=False)
    return merged_data_all

In [70]:
merged_data_all = None
merged_data_all = prosess_data(city_files)

In [71]:
merged_data_all.shape

(11800, 10)

In [29]:
output_dir = os.path.join(curr_dir)

for index, row in merged_data_all.iterrows():
    index
    image_data = base64.b64decode(row['image'])
    class_dir = os.path.join(output_dir, utils.replace_forbidden_chars(row['Name']))
    os.makedirs(class_dir, exist_ok=True)
    image_path = os.path.join(class_dir, f"image_{index}.jpg")
    with open(image_path, "wb") as file:
            file.write(image_data)
    break

### Data cleaner

In [51]:
import pandas as pd
import os
# from PIL import Image
from IPython.display import display, clear_output
from ipywidgets import Button, HBox, VBox, Image, Output, Layout, Label

# df = pd.read_csv('processed_data.csv')

df = pd.read_csv('data/Vladimir_images.csv') # df[df['City'] == 'Владимир']
df['index_copy'] = df.index

output = Output()

current_index = 0

def display_current_record(current_index):
    with output:
        clear_output(wait=True)
        if current_index >= 0 and current_index < len(df):
            row = df.iloc[current_index]

            try:
                image_data = base64.b64decode(row['image'])
                image_widget = Image(value=image_data, format='jpg', width=250, height=250)
            except FileNotFoundError:
                print(f"Не удалось найти изображение по пути: {row['image']}. Удаляю запись.")
                df.drop(df.index[current_index], inplace=True)
                df.reset_index(drop=True, inplace=True)
                display_current_record(current_index)
                return

            delete_button = Button(description="Удалить запись", 
                                   layout=Layout(width='150px'))
            next_button = Button(description="Следующая запись", 
                                 layout=Layout(width='150px'))
            prev_button = Button(description="Предыдущая запись", 
                                 layout=Layout(width='150px'))
            info_label = Label(f"Текущий размер DataFrame: {len(df)}, Шаг: {current_index + 1} из {len(df)}")

            def delete_record(b):
                global current_index
                img_path = row['image']
                df.drop(df.index[current_index], inplace=True)
                df.reset_index(drop=True, inplace=True)
                if os.path.exists(img_path):
                    os.remove(img_path)
                display_current_record(current_index)

            def display_next_record(b=None):
                global current_index
                current_index += 1
                display_current_record(current_index)

            def display_prev_record(b=None):
                global current_index
                current_index -= 1
                display_current_record(current_index)

            delete_button.on_click(delete_record)
            next_button.on_click(display_next_record)
            prev_button.on_click(display_prev_record)

            display(VBox([HBox([image_widget, VBox([delete_button, next_button, prev_button])]), info_label]))
        else:
            print("Больше записей нет.")

display_current_record(current_index)
display(output)

Output()

### Clear data processing

In [64]:
# Data folder path
clear_data_path = os.path.join(curr_dir, 'clear_data')
all_data_files = os.listdir(clear_data_path)
city_files = {}

for file_name in all_data_files:
    parts = file_name.split('_')
    city_key = parts[0]
    if city_key not in city_files:
        city_files[city_key] = {}
    if 'images' in file_name and 'pil' not in file_name:
        city_files[city_key]['images'] = os.path.join(data_path, file_name)
    elif 'data' in file_name:
        city_files[city_key]['data'] = os.path.join(data_path, file_name)

for city, files in city_files.items():
    print(f"City: {city}")
    print(f"Images: {files.get('images', 'Not found')}")
    print(f"Data: {files.get('data', 'Not found')}")

City: EKB
Images: c:\Users\alex\Desktop\NTO\data\EKB_images.csv
Data: c:\Users\alex\Desktop\NTO\data\EKB_processed_data.csv
City: NN
Images: c:\Users\alex\Desktop\NTO\data\NN_images.csv
Data: c:\Users\alex\Desktop\NTO\data\NN_processed_data.csv
City: Vladimir
Images: c:\Users\alex\Desktop\NTO\data\Vladimir_images.csv
Data: c:\Users\alex\Desktop\NTO\data\Vladimir_processed_data.csv
City: Yaroslavl
Images: c:\Users\alex\Desktop\NTO\data\Yaroslavl_images.csv
Data: c:\Users\alex\Desktop\NTO\data\Yaroslavl_processed_data.csv


In [None]:
def clear_prosess_data(city_files):
    merged_data_all = None

    for city, files in city_files.items():
        data = pd.read_csv(files.get('places'), sep = ',')
        images = pd.read_csv(files.get('images'), sep = ',')

        data_exploded = data.explode('Name')
        images_exploded = images.explode('name')

        merged_data = pd.merge(data_exploded, 
                               images_exploded, 
                               left_on='Name', right_on='name', how='left')
        
        unmatched_records = merged_data[merged_data['name'].isnull()]
        if not unmatched_records.empty:
            print(unmatched_records) # Should be empty

        merged_data_grouped = merged_data.groupby('WikiData')['Name'].agg(utils.choose_name).reset_index()
        merged_data = pd.merge(merged_data_grouped, 
                               merged_data, 
                               on='WikiData', how='left')
        merged_data.rename(columns={'Name_x': 'Name'}, inplace=True)
        merged_data.drop(columns=['Name_y', 'name'], inplace=True)

        output_dir = os.path.join(curr_dir, 'data_by_name')
        os.makedirs(output_dir, exist_ok=True)
        image_paths = []

        for index, row in merged_data.iterrows():
            # with open("success.txt", "a") as out:
                # print to file 
                # print(row['Name'], file=out)
            image_data = base64.b64decode(row['image'])
            class_dir = os.path.join(output_dir, utils.replace_forbidden_chars(row['Name']))
            os.makedirs(class_dir, exist_ok=True)
            image_path = os.path.join(class_dir, f"image_{index}.jpg")
            with open(image_path, "wb") as file:
                file.write(image_data)
            image_paths.append(image_path)

        merged_data['image_path'] = image_paths
        # merged_data.drop(columns=['image'], inplace=True)

        if merged_data_all is not None:
            merged_data_all = pd.concat([merged_data_all, merged_data], ignore_index=True)
        else:
            merged_data_all = merged_data

    csv_file_path = 'processed_data.csv'
    merged_data_all.to_csv(csv_file_path, index=False)
    return merged_data_all

### Tags analysis

In [78]:
# Get sample data
sample = merged_data_all.sample(n=10, random_state=42)
sample.head()

Unnamed: 0,WikiData,Name,Kind,City,Rate,Lon,Lat,image,image_path,index_copy
7819,Q62560931,Музей ложки,"cultural,museums,children_museums,interesting_...",Владимир,3,40.40321,56.127506,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAUEBAUEAwUFBA...,c:\Users\alex\Desktop\NTO\clear_data_by_name\М...,847.0
533,Q190779,Центральный стадион,"sport,architecture,historic_architecture,inter...",Екатеринбург,3h,60.573956,56.832283,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAUEBAUEAwUFBA...,c:\Users\alex\Desktop\NTO\clear_data_by_name\Ц...,
8777,Q19909049,Парк на Стрелке,"gardens_and_parks,cultural,urban_environment,i...",Ярославль,3,39.902702,57.622063,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAUEBAUEAwUFBA...,c:\Users\alex\Desktop\NTO\clear_data_by_name\П...,
5025,Q84609804,Доходный дом купчихи П. Е. Кубаревой,"architecture,historic_architecture,interesting...",Нижний Новгород,3h,43.977024,56.324112,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAUEBAUEAwUFBA...,c:\Users\alex\Desktop\NTO\clear_data_by_name\Д...,
9889,Q4151348,Губернаторский дом (Ярославль),"historic_architecture,architecture,interesting...",Ярославль,3h,39.897221,57.628334,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAUEBAUEAwUFBA...,c:\Users\alex\Desktop\NTO\clear_data_by_name\Г...,


In [77]:
def top_tags_distribution(df, top_n=5, probability=False):
    """
    Function to build the distribution of tags by the probability of occurrence and output the top N tags.

    Parameters:
        df (DataFrame): DataFrame containing records with tags.
        top_n (int): Number of tags to output in the top. Default is 5.
        probability (bool): Flag indicating whether to output probabilities in probability form.

    Returns:
        Series: Series with the top N tags and their probabilities of occurrence.

    """

    all_tags = []
    for tags in df['Kind']:
        if isinstance(tags, str):
            all_tags.extend(tags.split(','))

    tag_counts = pd.Series(all_tags).value_counts()

    if probability:
        total_tags = tag_counts.sum()
        tag_probs = tag_counts / total_tags
    else:
        tag_probs = tag_counts

    return tag_probs.head(top_n)

top_tags = top_tags_distribution(sample, top_n=5, probability=True)
print(top_tags)

interesting_places                0.217391
cultural                          0.086957
architecture                      0.086957
historic_architecture             0.086957
other_buildings_and_structures    0.086957
Name: count, dtype: float64
