In [1]:
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt 
import numpy as np
import seaborn as sns

In [None]:
# загрузим датасет
processed_data = pd.read_csv('1/Books.csv')

df = pd.DataFrame(processed_data)

In [147]:
# from ydata_profiling import ProfileReport
# profile = ProfileReport(df,)
# profile.to_file('report1.html');

In [4]:
pd.set_option('display.max_columns', None)  # Показать все столбцы
pd.set_option('display.width', 1900)        # Установить ширину области отображения
pd.set_option('display.max_colwidth', None)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6810 entries, 0 to 6809
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   isbn13          6810 non-null   int64  
 1   isbn10          6810 non-null   object 
 2   title           6810 non-null   object 
 3   subtitle        2381 non-null   object 
 4   authors         6738 non-null   object 
 5   categories      6711 non-null   object 
 6   thumbnail       6481 non-null   object 
 7   description     6548 non-null   object 
 8   published_year  6804 non-null   float64
 9   average_rating  6767 non-null   float64
 10  num_pages       6767 non-null   float64
 11  ratings_count   6767 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 638.6+ KB


In [6]:
# Удалим лишние столбцы:
df = df.drop(columns=['isbn13', 'isbn10', 'subtitle', 'thumbnail', 'published_year', 'num_pages', 'ratings_count'])
df.sample()

Unnamed: 0,title,authors,categories,description,average_rating
1588,Pacific Edge,Kim Stanley Robinson,Fiction,"Set at the end of the 21st century in California, this story revolves around a seemingly perfect society. At first, bio-architect Kevin Claiborne thinks he has indeed found Utopia, but gradually events lead him to discover the corruption beneath the surface.",3.78


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6810 entries, 0 to 6809
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           6810 non-null   object 
 1   authors         6738 non-null   object 
 2   categories      6711 non-null   object 
 3   description     6548 non-null   object 
 4   average_rating  6767 non-null   float64
dtypes: float64(1), object(4)
memory usage: 266.1+ KB


In [8]:
# маска. Количество пропусков
null_d=df.isnull().sum()
print(null_d[null_d>0])

authors            72
categories         99
description       262
average_rating     43
dtype: int64


In [9]:
# Удаляем строки, имеющие пропуски в столбце description
df = df.dropna(subset=['description'])
df.info()
# маска. Количество пропусков
null_d=df.isnull().sum()
print(null_d[null_d>0])

<class 'pandas.core.frame.DataFrame'>
Index: 6548 entries, 0 to 6809
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           6548 non-null   object 
 1   authors         6483 non-null   object 
 2   categories      6511 non-null   object 
 3   description     6548 non-null   object 
 4   average_rating  6511 non-null   float64
dtypes: float64(1), object(4)
memory usage: 306.9+ KB
authors           65
categories        37
average_rating    37
dtype: int64


In [10]:
# Проверка на дубликаты по столбцам 'title' и 'authors'
duplicates = df[df.duplicated(subset=['title', 'authors'], keep=False)]

# Вывод всех найденных дубликатов
print("Найдено дубликатов:", duplicates.shape[0])
print(duplicates)

Найдено дубликатов: 319
                          title                      authors                              categories                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            

In [11]:
# Удалим дубликаты по столбцам 'title', 'authors'
df = df.drop_duplicates(subset=['title', 'authors'])

In [12]:
# маска. Количество пропусков
null_d=df.isnull().sum()
print(null_d[null_d>0])

authors           65
categories        35
average_rating    34
dtype: int64


In [13]:
# Статистика числовых признаков (average_rating)
df.describe()

Unnamed: 0,average_rating
count,6344.0
mean,3.930372
std,0.322904
min,0.0
25%,3.76
50%,3.95
75%,4.13
max,5.0


In [14]:
# Заполним пропуски в столбце average_rating случайными значениями от 3 до 5

# Количество пропусков
num_missing = df['average_rating'].isnull().sum()

# Генерация случайных значений с округлением до сотых
random_ratings = np.round(np.random.uniform(3, 5, size=num_missing), 2)
print(random_ratings)

# Заполнение пропусков
df.loc[df['average_rating'].isnull(), 'average_rating'] = random_ratings


[3.04 3.75 3.52 3.56 3.82 3.68 3.01 4.34 3.33 3.05 4.44 3.92 3.54 3.31
 4.32 3.78 3.12 3.77 3.93 4.27 4.73 4.44 3.5  4.98 3.06 4.73 4.99 4.11
 3.39 4.2  3.31 4.37 3.63 3.29]


In [15]:
# маска. Количество пропусков. Избавились ли от пропусков в столбце average_rating?
null_d=df.isnull().sum()
print(null_d[null_d>0])

authors       65
categories    35
dtype: int64


In [16]:
# Пожертвуем 65 строками с отсутствующими авторами
# Удаляем строки, имеющие пропуски в столбце authors       
df = df.dropna(subset=['authors'])

# маска. Количество пропусков
null_d=df.isnull().sum()
print(null_d[null_d>0])

categories    35
dtype: int64


### Сведем 567 уникальных категорий книг к 19 крупным категориям 

- Фантастика (Fantasy)
- Ужасы (Horror)
- Роман (Romance)
- Детектив (Mystery)
- Приключения (Adventure)
- Триллеры (Thriller)
- Мистика (Paranormal)
- Драма (Drama)
- Поэзия (Poetry)
- Биография (Biography)
- Саморазвитие (Self-help)
- Психология (Psychology)
- Экономика (Economics)
- История (History)
- Политика (Politics)
- Бизнес (Business)
- Кулинария (Cookbook)
- Технологии (Technology)
- Искусство и культура (Art & Culture)

In [18]:
df.sample()

Unnamed: 0,title,authors,categories,description,average_rating
2335,A Map of the World,Jane Hamilton,Fiction,"While under the care of Alice Goodwin, a neighbor's child drowns in the Goodwins' pond, a devastating accident that has profound repercussions for the entire Goodwin family, in a story set in a small Midwestern farm town",3.8


In [73]:
import torch
print(torch.cuda.is_available())     # должно быть True
# print(torch.cuda.get_device_name(0)) # название видеокарты

True


In [20]:
device = 0 if torch.cuda.is_available() else -1
print(device)

0


In [21]:
# Создадим словарь категорий
major_categories = [
    "Fantasy",
    "Horror",
    "Romance",
    "Mystery",
    "Adventure",
    "Thriller",
    "Paranormal",
    "Drama",
    "Poetry",
    "Biography",
    "Self-help",
    "Psychology",
    "Economics",
    "History",
    "Politics",
    "Business",
    "Cookbook",
    "Technology",
    "Art & Culture"
]

In [None]:
# Инициализируем zero-shot классификатор. 
# "facebook/bart-large-mnli" - предварительно обученная модель от Meta AI, основанная на архитектуре BART.
from transformers import pipeline

# # Тяжелая модель 1 батч (128) = 43сек:
# classifier = pipeline(
#     "zero-shot-classification",
#     model="facebook/bart-large-mnli",
#     device=device
# )


# # Легче модель 1 батч (128) = 22сек:
# classifier = pipeline(
#     "zero-shot-classification",
#     model="valhalla/distilbart-mnli-12-1",
#     device=device 
# )


# # # Легче модель 1 батч (128) = 38 сек:
# classifier = pipeline(
#     "zero-shot-classification",
#     model="MoritzLaurer/deberta-v3-base-zeroshot-v1",
#     device=device  # GPU
# )



# Тяжелая модель 1 батч (128) = 59 сек:
classifier = pipeline(
    "zero-shot-classification",
    model="MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli",
    device=device
)

config.json:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/870M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/395 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.65M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/18.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

In [None]:
# Подбор категории по двум столбцам:


def map_category_with_description(row):
    # Объединяем информацию из двух столбцов
    input_text = f"{row['categories']}. {row['description']}"
    
    # Классифицируем объединённый текст
    result = classifier(input_text, major_categories)
    
    # Возвращаем наиболее вероятный жанр
    return result['labels'][0]


In [None]:
# # Проверка скорости на 1 батче

# batch_size = 32
# texts = [f"{row['categories']}. {row['description']}" for _, row in df.iterrows()]
# ## texts = [f"{row['categories']}" for _, row in df.iterrows()]

# # Берём только первый батч
# batch = texts[:batch_size]

# # Классифицируем первый батч
# results = classifier(batch, candidate_labels=major_categories)

# # Пример вывода первой метки из результатов
# print(results[0]['labels'][0])




Art & Culture




In [None]:
# Подготовка текстов
texts = [f"{row['categories']}. {row['description']}" for _, row in df.iterrows()]

In [28]:
# Обработка батчами
batch_size = 32
results = []

In [29]:
# Для 2 столбцов

from tqdm import tqdm


for i in tqdm(range(0, len(texts), batch_size), desc="Выполняется..."):
    batch = texts[i:i + batch_size]
    outputs = classifier(batch, candidate_labels=major_categories)
    
    # Вариант для batched pipeline: список результатов
    if isinstance(outputs, list):
        for out in outputs:
            results.append(out["labels"][0])
    else:  # Вариант на 1 строку
        results.append(outputs["labels"][0])

Выполняется...:   5%|▍         | 9/198 [02:04<43:51, 13.92s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Выполняется...: 100%|██████████| 198/198 [44:17<00:00, 13.42s/it]


In [30]:
# Сохраняем результат
df["categories_new"] = results

In [31]:
df.to_csv("Book_recom_small+new_categ.csv", index=False)

In [32]:
df.head(1)

Unnamed: 0,title,authors,categories,description,average_rating,categories_new
0,Gilead,Marilynne Robinson,Fiction,"A NOVEL THAT READERS and critics have been eagerly anticipating for over a decade, Gilead is an astonishingly imagined story of remarkable lives. John Ames is a preacher, the son of a preacher and the grandson (both maternal and paternal) of preachers. It’s 1956 in Gilead, Iowa, towards the end of the Reverend Ames’s life, and he is absorbed in recording his family’s story, a legacy for the young son he will never see grow up. Haunted by his grandfather’s presence, John tells of the rift between his grandfather and his father: the elder, an angry visionary who fought for the abolitionist cause, and his son, an ardent pacifist. He is troubled, too, by his prodigal namesake, Jack (John Ames) Boughton, his best friend’s lost son who returns to Gilead searching for forgiveness and redemption. Told in John Ames’s joyous, rambling voice that finds beauty, humour and truth in the smallest of life’s details, Gilead is a song of celebration and acceptance of the best and the worst the world has to offer. At its heart is a tale of the sacred bonds between fathers and sons, pitch-perfect in style and story, set to dazzle critics and readers alike.",3.85,Art & Culture


In [33]:
# Сохраняем как Excel
df.to_excel("Book_recom_small+new_categ.xlsx", index=False)

In [None]:
# Удалим столбец со старыми жанрами

df = df.drop(columns=["categories"])

### Вернем колонку со ссылками на картинки

In [48]:
# Загрузим изначальный датасет
processed_data = pd.read_csv('1/Books.csv')

df_2 = pd.DataFrame(processed_data)

In [49]:
df_2.sample(1)

Unnamed: 0,isbn13,isbn10,title,subtitle,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count
5435,9780872201682,872201686,"On Justice, Power, and Human Nature",The Essence of Thucydides' History of the Peloponnesian War,Thucydides,History,http://books.google.com/books/content?id=j2VgQgAACAAJ&printsec=frontcover&img=1&zoom=1&source=gbs_api,"Designed for students with little or no background in ancient Greek language and culture, this collection of extracts from The History of the Peloponnesian War includes those passages that shed most light on Thucydides' political theory--famous as well as important but lesser-known pieces frequently overlooked by nonspecialists. Newly translated into spare, vigorous English, and situated within a connective narrative framework, Woodruff's selections will be of special interest to instructors in political theory and Greek civilization. Includes maps, notes, glossary.",1993.0,3.72,172.0,567.0


In [50]:
df.sample(1)

Unnamed: 0,title,authors,description,average_rating,categories_new
1975,Dream Makers,Nora Roberts,"Lion trainer Jo Wilder finds herself falling for Keane Prescott, a lawyer who inherits ownership of the circus; and, Megan Miller finds herself falling for David Katcherton, a sexy stranger who offers to buy her grandfather's amusement park.",3.81,Fantasy


In [51]:
# Добавим столбец с ссылками на изображения по столбам title и authors
df_small_finish = df.merge(
    df_2[["title", "authors", "description", "thumbnail"]],
    on=["title", "authors", "description"],
    how="left"
)


In [52]:
df_small_finish.sample(1)

Unnamed: 0,title,authors,description,average_rating,categories_new,thumbnail
562,The Elephant Vanishes,Haruki Murakami,"When a man's favourite elephant vanishes, the balance of his whole life is subtly upset; a couple's midnight hunger pangs drive them to hold up a McDonald's; a woman finds she is irresistible to a small green monster that burrows through her front garden; an insomniac wife wakes up to a twilight world of semi-consciousness in which anything seems possible - even death. In every one of the stories that make up The Elephant Vanishes, Murakami makes a determined assault on the normal. He has a deadpan genius for dislocating realities to uncover the surreal in the everyday, the extraordinary in the ordinary.",3.86,Fantasy,http://books.google.com/books/content?id=GzP6Dzk-f8sC&printsec=frontcover&img=1&zoom=1&source=gbs_api


In [None]:
df_small_finish.to_csv("Book_recom_small_FINISH.csv", index=False)

In [53]:
df_small_finish.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6323 entries, 0 to 6322
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           6323 non-null   object 
 1   authors         6323 non-null   object 
 2   description     6323 non-null   object 
 3   average_rating  6323 non-null   float64
 4   categories_new  6323 non-null   object 
 5   thumbnail       6124 non-null   object 
dtypes: float64(1), object(5)
memory usage: 296.5+ KB


In [54]:
# маска. Количество пропусков
null_d=df_small_finish.isnull().sum()
print(null_d[null_d>0])

thumbnail    199
dtype: int64


In [55]:
# Проверка на дубликаты по столбцам 'title' и 'authors'
duplicates = df_small_finish[df_small_finish.duplicated(subset=['title', 'authors'], keep=False)]

# Вывод всех найденных дубликатов
print("Найдено дубликатов:", duplicates.shape[0])
print(duplicates)

Найдено дубликатов: 20
                             title                      authors                                                                                                                                                                                                                                                                                                                          description  average_rating categories_new                                                                                                        thumbnail
971              A Little Princess      Frances Hodgson Burnett                                                                                                                                                                                        Sara Crewe, a pupil at Miss Minchin's London school, is left in poverty when her father dies but is later rescued by a mysterious benefactor.            4.19        Mystery            http://books.

In [56]:
# Удалим дубликаты по столбцам 'title', 'authors'
df_small_finish = df_small_finish.drop_duplicates(subset=['title', 'authors'])

In [57]:
df_small_finish.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6313 entries, 0 to 6322
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           6313 non-null   object 
 1   authors         6313 non-null   object 
 2   description     6313 non-null   object 
 3   average_rating  6313 non-null   float64
 4   categories_new  6313 non-null   object 
 5   thumbnail       6114 non-null   object 
dtypes: float64(1), object(5)
memory usage: 345.2+ KB


In [58]:
# маска. Количество пропусков
null_d=df_small_finish.isnull().sum()
print(null_d[null_d>0])

thumbnail    199
dtype: int64


In [None]:
# Добавим картинки к книгам, где отстутсвуют ссылки в соответствии с жанром:

# создадим словарь
category_to_image = {
    "Fantasy": "https://raw.github.com/Kosty1703/pictures/main/Pictures/Fantasy.jpg",
    "Horror": "https://raw.github.com/Kosty1703/pictures/main/Pictures/Fantasy.jpg",
    "Romance": "https://raw.github.com/Kosty1703/pictures/main/Pictures/Romance.jpg",
    "Mystery": "https://raw.github.com/Kosty1703/pictures/main/Pictures/Mystery.jpg",
    "Adventure": "https://raw.github.com/Kosty1703/pictures/main/Pictures/Adventure.jpg",
    "Thriller": "https://raw.github.com/Kosty1703/pictures/main/Pictures/Thriller.jpg",
    "Paranormal": "https://raw.github.com/Kosty1703/pictures/main/Pictures/Paranormal.jpg",
    "Drama": "https://raw.github.com/Kosty1703/pictures/main/Pictures/Drama.jpg",
    "Poetry": "https://raw.github.com/Kosty1703/pictures/main/Pictures/Poetry.jpg",
    "Biography": "https://raw.github.com/Kosty1703/pictures/main/Pictures/Biography.jpg",
    "Self-help": "https://raw.github.com/Kosty1703/pictures/main/Pictures/Self-help.jpg",
    "Psychology": "https://raw.github.com/Kosty1703/pictures/main/Pictures/Psychology.jpg",
    "Economics": "https://raw.github.com/Kosty1703/pictures/main/Pictures/Economics.jpg",
    "History": "https://raw.github.com/Kosty1703/pictures/main/Pictures/History.jpg",
    "Politics": "https://raw.github.com/Kosty1703/pictures/main/Pictures/Politics.jpg",
    "Business": "https://raw.github.com/Kosty1703/pictures/main/Pictures/Business.jpg",
    "Cookbook": "https://raw.github.com/Kosty1703/pictures/main/Pictures/Cookbook.jpg",
    "Technology": "https://raw.github.com/Kosty1703/pictures/main/Pictures/Technology.jpg",
    "Art & Culture": "https://raw.github.com/Kosty1703/pictures/main/Pictures/Art%20%26%20Culture.jpg"
}

In [None]:
# Подставляем ссылки из словаря только на пустые места в столбце thumbnail
df_small_finish["thumbnail"] = df_small_finish.apply(
    lambda row: category_to_image[row["categories_new"]] if pd.isna(row["thumbnail"]) else row["thumbnail"],
    axis=1
)


In [62]:
df_small_finish.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6313 entries, 0 to 6322
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           6313 non-null   object 
 1   authors         6313 non-null   object 
 2   description     6313 non-null   object 
 3   average_rating  6313 non-null   float64
 4   categories_new  6313 non-null   object 
 5   thumbnail       6313 non-null   object 
dtypes: float64(1), object(5)
memory usage: 345.2+ KB


In [63]:
# маска. Количество пропусков
null_d=df_small_finish.isnull().sum()
print(null_d[null_d>0])

Series([], dtype: int64)


In [70]:
df_small_finish.sample(1)

Unnamed: 0,title,authors,description,average_rating,categories_new,thumbnail
4965,Wonder of the World,David Lindsay-Abaire,THE STORY: Nothing will prepare you for the dirty little secret Cass discovers in her husband's sweater drawer. It is so shocking that our heroine has no choice but to flee to the honeymoon capital of the world in a frantic search for the life she,3.94,Drama,http://books.google.com/books/content?id=t1eMMBLXGd0C&printsec=frontcover&img=1&zoom=1&source=gbs_api


In [None]:
# Переименуем столбцы в один формат со вторым датасетом для последующего слияния.
df_small_finish = df_small_finish.rename(columns={"title": "Title"})
df_small_finish = df_small_finish.rename(columns={"authors": "Authors"})
df_small_finish = df_small_finish.rename(columns={"description": "Description"})
df_small_finish = df_small_finish.rename(columns={"categories_new": "Category"})
df_small_finish = df_small_finish.rename(columns={"average_rating": "Rating"})
df_small_finish = df_small_finish.rename(columns={"thumbnail": "Link"})
df_small_finish.sample(1)

Unnamed: 0,Title,Authors,Description,Rating,Category,Link
3533,Two of Us,Peter Smith,"The author of A Good Family offers poignant, entertaining account of how his and his son's mutual love for the music of the Beatles sparked a closer relationship, describing how they used the songs and exploits of the Fab Four to spark discussions of such topics as friendship, teamwork, art, sorrow, failure, and mortalitiy.",3.53,Biography,http://books.google.com/books/content?id=GhRKYgEACAAJ&printsec=frontcover&img=1&zoom=1&source=gbs_api


In [72]:
df_small_finish.to_csv("Book_recom_small_FINISH.csv", index=False)