In [19]:
import pandas as pd # загружаем необходимые библиотеки
import numpy as np
import matplotlib.pyplot as plt
import copy
import seaborn as sns

from sklearn import ensemble
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from keras import models
from keras import layers
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import mean_absolute_error

import warnings
warnings.filterwarnings("ignore") # Отключаем некритические уведомления

In [37]:
dfmovies = pd.read_csv('movies.csv')

# **ИЗУЧИМ ДАННЫЕ**

In [38]:
dfmovies.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62423 entries, 0 to 62422
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  62423 non-null  int64 
 1   title    62423 non-null  object
 2   genres   62423 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.4+ MB


# ПОСМОТРИМ НА СПИСОК ФИЛЬМОВ

In [39]:
dfmovies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


***Посмотрим на уникальные жанры...***

In [40]:
genresList = []
for i in range (0,len(dfmovies)):
  genresList += (dfmovies['genres'][i].split('|'))
genresList = list(set(genresList))

print(genresList, len(genresList))

['Crime', 'Musical', 'Film-Noir', 'Comedy', 'Thriller', 'Mystery', 'Horror', 'Documentary', 'IMAX', 'Western', 'Romance', 'Drama', 'War', '(no genres listed)', 'Children', 'Animation', 'Sci-Fi', 'Fantasy', 'Adventure', 'Action'] 20


***В этом датасете присутствуют строчки с фильмами, к которым не указаны жанры...***

In [41]:
count = len(dfmovies[dfmovies['genres']=='(no genres listed)'])
percent = "{:0.2f}".format(len(dfmovies[dfmovies['genres']=='(no genres listed)'])/dfmovies.shape[0]*100)
print("Вот к стольким фильмам не указаны жанры: ",count," = ",percent,"%")

Вот к стольким фильмам не указаны жанры:  5062  =  8.11 %


# СЧИТАЕМ РЕЙТИНГИ ФИЛЬМОВ

In [57]:
dfratings = pd.read_csv('ratings.csv')
dfratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000095 entries, 0 to 25000094
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 762.9 MB


In [58]:
dfratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


***Посмотрим, скольким фильмам поставлены оценки...***

In [59]:
movIds = dfratings['movieId']
movIds = movIds.drop_duplicates()
count = movIds.shape[0]
percent = "{:0.2f}".format(count/dfmovies.shape[0]*100)
print("Вот стольким фильмам поставлены оценки: ",count," = ",percent,"%")

Вот стольким фильмам поставлены оценки:  59047  =  94.59 %


***В этом датасете оценены не все фильмы...***

***Выведем среднюю оценку для каждого пользователя и количество фильмов, которые он оценил...***

In [60]:
userIds = dfratings['userId']
userIds = userIds.drop_duplicates()
userIds

Unnamed: 0,userId
0,1
70,2
254,3
910,4
1152,5
...,...
24999523,162537
24999624,162538
24999778,162539
24999825,162540


In [48]:
df_meanUserRate = pd.DataFrame({
    'userId':[],
    'meanRating':[],
    'numOfFilms':[]
})
df_meanUserRate

i = 0
for usId in userIds:
  meanRate = dfratings[dfratings['userId'] == usId]['rating'].mean()
  count = dfratings[dfratings['userId'] == usId].shape[0]
  df_meanUserRate.loc[i] = [str(usId), "{:0.2f}".format(meanRate),count]
  i += 1

df_meanUserRate['meanRating'] = df_meanUserRate['meanRating'].astype('float')
df_meanUserRate['userId'] = df_meanUserRate['userId'].astype('int')

df_meanUserRate

Unnamed: 0,userId,meanRating,numOfFilms
0,1,3.81,70
1,2,3.63,184
2,3,3.70,656
3,4,3.38,242
4,5,3.75,101
...,...,...,...
40157,40158,4.60,164
40158,40159,4.23,172
40159,40160,4.29,39
40160,40161,3.27,46


***Будем считать, что хороших фильмов больше, чем плохих, потому что плохие фильмы априори никто не хочет спонсировать***

In [53]:
right1 = df_meanUserRate[df_meanUserRate['meanRating']<1.5]
left1 =  df_meanUserRate[df_meanUserRate['numOfFilms']>50]
res1 = pd.merge(right1,left1, on='userId', how='inner')
res1 # Пользователи, которые ставят заниженные оценки

Unnamed: 0,userId,meanRating_x,numOfFilms_x,meanRating_y,numOfFilms_y
0,1484,1.05,869,1.05,869
1,8901,0.84,257,0.84,257
2,10231,1.43,450,1.43,450
3,13619,1.38,60,1.38,60
4,13740,1.37,112,1.37,112
5,13838,0.89,353,0.89,353
6,18751,1.2,992,1.2,992
7,26097,1.18,538,1.18,538
8,27458,1.47,199,1.47,199


In [51]:
right2 = df_meanUserRate[df_meanUserRate['meanRating']>4.5]
left2 =  df_meanUserRate[df_meanUserRate['numOfFilms']>250]
res2 = pd.merge(right2,left2, on='userId', how='inner')
res2 # Пользователи, которые ставят завышенные оценки

Unnamed: 0,userId,meanRating_x,numOfFilms_x,meanRating_y,numOfFilms_y
0,865,4.66,439,4.66,439
1,1869,4.61,452,4.61,452
2,2847,4.64,290,4.64,290
3,4332,4.55,331,4.55,331
4,4807,4.59,311,4.59,311
5,6184,4.98,875,4.98,875
6,6833,4.59,312,4.59,312
7,8154,4.63,366,4.63,366
8,8470,4.75,537,4.75,537
9,8527,4.59,288,4.59,288


***Удалим оценки этих пользователей из исходного датасета...***

In [52]:
dfratings = pd.read_csv('ratings.csv')
count1 = dfratings.shape[0]

ids = res1['userId']
for id in ids:
  dfratings = dfratings.loc[dfratings['userId'] != id]
ids = res2['userId']
for id in ids:
  dfratings = dfratings.loc[dfratings['userId'] != id]

count2 = dfratings.shape[0]
percent = "{:0.2f}".format(100 - count2/count1*100)
print("Столько оценок было удалено: ",count1-count2," = ",percent,"%")

Столько оценок было удалено:  18943  =  0.11 %


# В датасет к фильмам добавим столбец с годом создания фильма

In [307]:
df_filmYear = pd.DataFrame({
    'movieId':[],
    'year':[]
})

for i in range (0,len(dfmovies)):
  year = []
  year = dfmovies['title'][i].split(' ')
  for w in year:
    if (")" or "(") in w:
      y = w[1:-1]
      #print(y)
      #print(dfmovies['movieId'][i])
      try:
        df_filmYear.loc[i] = [dfmovies['movieId'][i], int(y)]
      except:
        y=0
        df_filmYear.loc[i] = [dfmovies['movieId'][i], int(y)]
    else:
      y=0
      df_filmYear.loc[i] = [dfmovies['movieId'][i], int(y)]


In [317]:
percent = df_filmYear[df_filmYear['year']==0].shape[0]/df_filmYear.shape[0]*100
fp = "{:0.2f}".format(percent)
print("Год создания фильма не удалось определить для", df_filmYear[df_filmYear['year']==0].shape[0],"=", fp, "% записей")

Год создания фильма не удалось определить для 572 = 0.92 % записей


***Не для всех фильмов удалось определить год создания...***

In [324]:
dfmovies1 = pd.merge(dfmovies,df_filmYear, on='movieId', how='inner')

In [325]:
dfmovies1

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II (1995),Comedy,1995
...,...,...,...,...
62418,209157,We (2018),Drama,2018
62419,209159,Window of the Soul (2001),Documentary,2001
62420,209163,Bad Poems (2018),Comedy|Drama,2018
62421,209169,A Girl Thing (2001),(no genres listed),2001


In [344]:
maxYear = dfmovies1['year'].max(axis=0)
print("Самые свежие фильмы в таблице относятся к следующему году:",maxYear)

Самые свежие фильмы в таблице относятся к следующему году: 2019


# **СОЗДАДИМ НЕЙРОСЕТЬ ДЛЯ ГЕНЕРАЦИИ РЕКОМЕНДАЦИЙ НА ОСНОВЕ ПРЕДПОЧТЕНИЙ КОНКРЕТНОГО ПОЛЬЗОВАТЕЛЯ (*ЗАДАЧА БИНАРНОЙ КЛАССИФИКАЦИИ*)**

# СОЕДИНИМ ТАБЛИЦЫ С ФИЛЬМАМИ И РЕЙТИНГОМ

In [327]:
mergeddfs = pd.merge(dfmovies1, dfratings, left_on='movieId', right_on='movieId', how='inner')
#print(len(mergeddfs[mergeddfs['rating'].isna() == True]))
mergeddfs

Unnamed: 0,movieId,title,genres,year,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,2,3.5,1.141416e+09
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,3,4.0,1.439472e+09
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,4,3.0,1.573944e+09
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,5,4.0,8.586259e+08
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,8,4.0,8.904925e+08
...,...,...,...,...,...,...,...
162134,206499,Between Two Ferns: The Movie (2019),Comedy,2019,973,3.0,1.569265e+09
162135,206805,In the Shadow of the Moon (2019),Crime|Mystery|Sci-Fi,2019,1068,3.0,1.570500e+09
162136,207309,Fractured (2019),Thriller,2019,973,3.5,1.571933e+09
162137,207309,Fractured (2019),Thriller,2019,1068,3.0,1.571610e+09


In [328]:
del mergeddfs['userId']
del mergeddfs['timestamp']

In [329]:
mergeddfs

Unnamed: 0,movieId,title,genres,year,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,3.5
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,4.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,3.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,4.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,4.0
...,...,...,...,...,...
162134,206499,Between Two Ferns: The Movie (2019),Comedy,2019,3.0
162135,206805,In the Shadow of the Moon (2019),Crime|Mystery|Sci-Fi,2019,3.0
162136,207309,Fractured (2019),Thriller,2019,3.5
162137,207309,Fractured (2019),Thriller,2019,3.0


# ВЫЧИСЛИМ СРЕДНИЙ РЕЙТИНГ ДЛЯ КАЖДОГО ФИЛЬМА

In [330]:
movIds = dfratings['movieId']
movIds = movIds.drop_duplicates()
print(len(movIds))


11827


In [331]:
df = pd.DataFrame({
    'movieId':[],
    'rating':[]
})
df

Unnamed: 0,movieId,rating


In [332]:
i = 0
for movId in movIds:
  meanRate = mergeddfs[mergeddfs['movieId'] == movId]['rating'].mean()
  df.loc[i] = [str(movId), "{:0.4f}".format(meanRate)]
  i += 1

df

Unnamed: 0,movieId,rating
0,296,4.1942
1,306,4.1154
2,307,4.1705
3,665,4.0000
4,899,4.0321
...,...,...
11822,43923,3.0000
11823,50158,4.0000
11824,51082,4.0000
11825,51520,2.0000


In [333]:
df_movRate = mergeddfs
del df_movRate['rating']
df_movRate

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
...,...,...,...,...
162134,206499,Between Two Ferns: The Movie (2019),Comedy,2019
162135,206805,In the Shadow of the Moon (2019),Crime|Mystery|Sci-Fi,2019
162136,207309,Fractured (2019),Thriller,2019
162137,207309,Fractured (2019),Thriller,2019


In [334]:
df['movieId'] = df['movieId'].astype('float64')

df_movRate = pd.merge(df_movRate, df, left_on='movieId', right_on='movieId', how='outer')
df_movRate


Unnamed: 0,movieId,title,genres,year,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,3.9344
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,3.9344
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,3.9344
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,3.9344
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,3.9344
...,...,...,...,...,...
162134,206499,Between Two Ferns: The Movie (2019),Comedy,2019,2.5000
162135,206805,In the Shadow of the Moon (2019),Crime|Mystery|Sci-Fi,2019,3.0000
162136,207309,Fractured (2019),Thriller,2019,3.2500
162137,207309,Fractured (2019),Thriller,2019,3.2500


In [335]:
df_movRate['movieId'] = df_movRate['movieId'].astype('int')
df_movRate.info()
df_movRate

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162139 entries, 0 to 162138
Data columns (total 5 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   movieId  162139 non-null  int64 
 1   title    162139 non-null  object
 2   genres   162139 non-null  object
 3   year     162139 non-null  int64 
 4   rating   162139 non-null  object
dtypes: int64(2), object(3)
memory usage: 6.2+ MB


Unnamed: 0,movieId,title,genres,year,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,3.9344
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,3.9344
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,3.9344
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,3.9344
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,3.9344
...,...,...,...,...,...
162134,206499,Between Two Ferns: The Movie (2019),Comedy,2019,2.5000
162135,206805,In the Shadow of the Moon (2019),Crime|Mystery|Sci-Fi,2019,3.0000
162136,207309,Fractured (2019),Thriller,2019,3.2500
162137,207309,Fractured (2019),Thriller,2019,3.2500


# УДАЛИМ ДУПЛИКАТЫ ИЗ ТАБЛИЦЫ

In [336]:
df_movRate = df_movRate.drop_duplicates(subset=['movieId'])

In [337]:
df_movRate = df_movRate.reset_index()
df_movRate

Unnamed: 0,index,movieId,title,genres,year,rating
0,0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,3.9344
1,404,2,Jumanji (1995),Adventure|Children|Fantasy,1995,3.3688
2,545,3,Grumpier Old Men (1995),Comedy|Romance,1995,3.2606
3,639,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995,2.9231
4,652,5,Father of the Bride Part II (1995),Comedy,1995,3.2181
...,...,...,...,...,...,...
11822,162132,206272,Haunt (2019),Horror|Thriller,2019,2.5000
11823,162133,206499,Between Two Ferns: The Movie (2019),Comedy,2019,2.5000
11824,162135,206805,In the Shadow of the Moon (2019),Crime|Mystery|Sci-Fi,2019,3.0000
11825,162136,207309,Fractured (2019),Thriller,2019,3.2500


In [154]:
del df_movRate['index']
df_movRate

Unnamed: 0,movieId,title,genres,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.9344
1,2,Jumanji (1995),Adventure|Children|Fantasy,3.3688
2,3,Grumpier Old Men (1995),Comedy|Romance,3.2606
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,2.9231
4,5,Father of the Bride Part II (1995),Comedy,3.2181
...,...,...,...,...
11822,206272,Haunt (2019),Horror|Thriller,2.5000
11823,206499,Between Two Ferns: The Movie (2019),Comedy,2.5000
11824,206805,In the Shadow of the Moon (2019),Crime|Mystery|Sci-Fi,3.0000
11825,207309,Fractured (2019),Thriller,3.2500


# СОБЕРЁМ ДАТАСЕТ
21 входных нейронов. 19 отвечают за жанр. Последние отвечают за нормализованный рейтинг и нормализованный год создания

In [427]:
#Mystery = 0 - ЭТО НОМЕР НЕЙРОНА. ЕСЛИ К ФИЛЬМУ ЖАНРЫ НЕ УКАЗАНЫ, ТО ВСЕ НЕЙРОНЫ БУДУТ НЕАКТИВНЫ
#War = 1
#Sci_Fi = 2
#Western = 3
#IMAX = 4
#Animation = 5
#Musical = 6
#Horror = 7
#Fantasy = 8
#Crime = 9
#Drama = 10
#Action = 11
#Adventure = 12
#Comedy = 13
#Children = 14
#Romance = 15
#Film_Noir = 16
#Documentary = 17
#Thriller = 18

dataset = []

for i in range(0,len(df_movRate)):
  genresList = []
  params_of_film = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]

  genresList = df_movRate['genres'][i].split('|')
  if 'Mystery' in genresList:
    params_of_film[0] = 1
  else: pass
  if 'War' in genresList:
    params_of_film[1] = 1
  else: pass
  if 'Sci-Fi' in genresList:
    params_of_film[2] = 1
  else: pass
  if 'Western' in genresList:
    params_of_film[3] = 1
  else: pass
  if 'IMAX' in genresList:
    params_of_film[4] = 1
  else: pass
  if 'Animation' in genresList:
    params_of_film[5] = 1
  else: pass
  if 'Musical' in genresList:
    params_of_film[6] = 1
  else: pass
  if 'Horror' in genresList:
    params_of_film[7] = 1
  else: pass
  if 'Fantasy' in genresList:
    params_of_film[8] = 1
  else: pass
  if 'Crime' in genresList:
    params_of_film[9] = 1
  else: pass
  if 'Drama' in genresList:
    params_of_film[10] = 1
  else: pass
  if 'Action' in genresList:
    params_of_film[11] = 1
  else: pass
  if 'Adventure' in genresList:
    params_of_film[12] = 1
  else: pass
  if 'Comedy' in genresList:
    params_of_film[13] = 1
  else: pass
  if 'Children' in genresList:
    params_of_film[14] = 1
  else: pass
  if 'Romance' in genresList:
    params_of_film[15] = 1
  else: pass
  if 'Film_Noir' in genresList:
    params_of_film[16] = 1
  else: pass
  if 'Documentary' in genresList:
    params_of_film[17] = 1
  else: pass
  if 'Thriller' in genresList:
    params_of_film[18] = 1
  else: pass
  params_of_film[20] = float("{:0.4f}".format(float(df_movRate['rating'][i])/5)) # рейтинг
  params_of_film[19] = float(df_movRate['year'][i]/maxYear) # год
  dataset.append(params_of_film)

In [428]:
dataset

[[0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0.9881129271916791,
  0.7869],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0.9881129271916791,
  0.6738],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0.9881129271916791,
  0.6521],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0.9881129271916791,
  0.5846],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0.9881129271916791,
  0.6436],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0.9881129271916791,
  0.7568],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0.9881129271916791,
  0.6892],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0.98811

# "НАЛАЙКАЕМ" СЕБЕ ФИЛЬМОВ ЖАНРА "ADVENTURE"/"COMEDY"/"FANTASY", ВЫПУЩЕННЫХ ПОСЛЕ 1983 ГОДА, С РЕЙТИНГОМ БОЛЬШЕ 0.6, ЧТОБЫ ОБУЧИТЬ НА НИХ НЕЙРОСЕТЬ ***(РАЗМЕТИМ ДАННЫЕ)***

In [441]:
import copy
small_dataset=copy.deepcopy(dataset)
len(small_dataset)

11827

In [442]:
my_prefernce = []
for data in small_dataset:
  if ((data[12] == 1 or data[13] == 1 or data[8] == 1) and float(data[20]) >= 0.6 and data[19]>1983/maxYear):
    data.append(1)
    my_prefernce.append(data)
  else:
    data.append(0)
    my_prefernce.append(data)

In [443]:
my_prefernce

[[0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0.9881129271916791,
  0.7869,
  1],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0.9881129271916791,
  0.6738,
  1],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0.9881129271916791,
  0.6521,
  1],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0.9881129271916791,
  0.5846,
  0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0.9881129271916791,
  0.6436,
  1],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0.9881129271916791,
  0.7568,
  0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0.9881129271916791,
  0.6892,
  1],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,

# СОЗДАДИМ НЕЙРОСЕТЬ

In [451]:
points = []
values = []

for dat in my_prefernce:
  points.append(dat[:-1])
  values += (dat[-1:])

dffff = pd.DataFrame(points)
dffff[21] = values
dffff

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,0,0,0,0,0,1,0,0,1,0,...,1,1,1,0,0,0,0,0.988113,0.7869,1
1,0,0,0,0,0,0,0,0,1,0,...,1,0,1,0,0,0,0,0.988113,0.6738,1
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0.988113,0.6521,1
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0.988113,0.5846,0
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0.988113,0.6436,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11822,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,1.000000,0.5000,0
11823,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1.000000,0.5000,0
11824,1,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1.000000,0.6000,0
11825,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1.000000,0.6500,0


In [459]:
values = dffff[21]
points = dffff.drop([21], axis=1)
train_points, test_points, train_values, test_values = train_test_split(points, values, test_size = 0.2)

nn_model = Sequential()
nn_model.add(Dense(21, activation='relu'))
nn_model.add(Dense(8, activation='relu'))
nn_model.add(Dense(1, activation='relu'))

nn_model.compile(loss='mean_absolute_error', optimizer='adam')

results = nn_model.fit(
 train_points, train_values,
 epochs= 10,
 batch_size = 50,
 validation_data = (test_points, test_values)
)

nn_predict = nn_model.predict(test_points)
print("mean_absolute_error: ", mean_absolute_error(test_values, nn_predict))

########## ИЗМЕРЯЕМ "ACCURACY" ##########

int_predict = np.array([])

for el in nn_predict:
  if el > 0.5:
    int_predict = np.append(int_predict, 1)
  else:
    int_predict = np.append(int_predict, 0)
#test_values
int_predict = pd.DataFrame(int_predict)
int_predict[0] = int_predict[0].astype('int')
#int_predict

test_values = test_values.reset_index()
del test_values['index']

count = 0
leng = len(int_predict)
for i in range(0, leng):
  if test_values[21][i] == int_predict[0][i]:
    count += 1
  else: pass
print("accuracy: ",round(count/leng*100),"%")

Epoch 1/10
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 0.2485 - val_loss: 0.2582
Epoch 2/10
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.2488 - val_loss: 0.2583
Epoch 3/10
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.2537 - val_loss: 0.2581
Epoch 4/10
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.2522 - val_loss: 0.2094
Epoch 5/10
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.2006 - val_loss: 0.1850
Epoch 6/10
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.1754 - val_loss: 0.1722
Epoch 7/10
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.1581 - val_loss: 0.1605
Epoch 8/10
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.1494 - val_loss: 0.1532
Epoch 9/10
[1m190/190[0m [32m━━━━━━━━

***Данная нейросеть будет рекомендовать пользователю фильмы, похожие по характеристикам на те, которые он "лайкнул"***

# **СОЗДАДИМ МОДЕЛЬ ДЛЯ ПРЕДСКАЗАНИЯ РЕЙТИНГА ФИЛЬМА ПО ЕГО ХАРАКТЕРИСТИКАМ *(ЗАДАЧА РЕГРЕССИИ)***

*В разработке...*