Carregando Bibliotecas

In [1]:
# importando módulos
import pandas as pd
import numpy as np
from datetime import date
import json
import pprint

# importando funções auxiliares
from genre_code import genre_code
from trees import id3

Definindo os Dataframes

In [2]:
# gerando os dataframes users, movies, ratings
users = pd.read_csv('users.csv', sep=',')
movies = pd.read_csv('movies.csv', sep=';')
ratings = pd.read_csv('ratings.csv', sep=';')


Tratando o dataframe: users

In [3]:
# deletando colunas inúteis para a análise
users = users.drop(["name", "Zip-code"], axis=1, errors='ignore')

# printando o dataframe users
print(users)


      UserID Gender  Occupation    birthday
0          1      F          10    4/1/2012
1          2      M          16   9/21/1964
2          3      M          15    4/3/1995
3          4      M           7    5/8/1974
4          5      M          20   6/18/1996
...      ...    ...         ...         ...
6035    6036      F          15  10/17/1995
6036    6037      F           1   6/12/1975
6037    6038      F           1   1/17/1963
6038    6039      F           0   10/6/1977
6039    6040      M           6    4/8/1988

[6040 rows x 4 columns]


Extraindo o ano das datas de nascimento para facilitar a obtenção da idade


In [4]:
def splitIntoNumbers(string):
    # a 3° posição dos string gerado retornará o ano de nascimento
    return string.split('/')[2] 

# veja que não importa muito uma diferença de 1 ano nas datas de nascimento, que seria ocasionada pelo fato de a pessoa fazer aniversário antes ou depois de hoje. Pulamos essa parte porque não adicionaria muito à análise e apenas dificultaria o código.
users['birthday'] = users['birthday'].apply(splitIntoNumbers)

print(users)

      UserID Gender  Occupation birthday
0          1      F          10     2012
1          2      M          16     1964
2          3      M          15     1995
3          4      M           7     1974
4          5      M          20     1996
...      ...    ...         ...      ...
6035    6036      F          15     1995
6036    6037      F           1     1975
6037    6038      F           1     1963
6038    6039      F           0     1977
6039    6040      M           6     1988

[6040 rows x 4 columns]


Calculando a idade de cada usuário baseado na data de nascimento

In [5]:
def calculateAge(born):
    today = date.today()
    return today.year - int(born)

users['age'] = users['birthday'].apply(calculateAge)

users = users.drop('birthday', axis = 1, errors='ignore')

Imprimindo o dataframe users tratado:

In [6]:
print(users)

      UserID Gender  Occupation  age
0          1      F          10   10
1          2      M          16   58
2          3      M          15   27
3          4      M           7   48
4          5      M          20   26
...      ...    ...         ...  ...
6035    6036      F          15   27
6036    6037      F           1   47
6037    6038      F           1   59
6038    6039      F           0   45
6039    6040      M           6   34

[6040 rows x 4 columns]


Trocando as idades para 'categorias', segundo como descrito no pdf

In [7]:
def classifyAges(age):
    if(age<18): age = 1
    elif (age<25): age = 18
    elif (age<35): age = 25
    elif (age<45): age = 35
    elif (age<50): age = 45
    elif (age<56): age = 50
    else: age = 56

    return age

users['age'] = users['age'].apply(classifyAges)

In [8]:
print(users)

      UserID Gender  Occupation  age
0          1      F          10    1
1          2      M          16   56
2          3      M          15   25
3          4      M           7   45
4          5      M          20   25
...      ...    ...         ...  ...
6035    6036      F          15   25
6036    6037      F           1   45
6037    6038      F           1   56
6038    6039      F           0   45
6039    6040      M           6   25

[6040 rows x 4 columns]


Tratando o dataframe: movies

In [9]:
# printando o dataframe movies
print(movies)

      MovieID                               Title  \
0           1                    Toy Story (1995)   
1           2                      Jumanji (1995)   
2           3             Grumpier Old Men (1995)   
3           4            Waiting to Exhale (1995)   
4           5  Father of the Bride Part II (1995)   
...       ...                                 ...   
3878     3948             Meet the Parents (2000)   
3879     3949          Requiem for a Dream (2000)   
3880     3950                    Tigerland (2000)   
3881     3951             Two Family House (2000)   
3882     3952               Contender, The (2000)   

                            Genres Unnamed: 3  
0      Animation|Children's|Comedy        NaN  
1     Adventure|Children's|Fantasy        NaN  
2                   Comedy|Romance        NaN  
3                     Comedy|Drama        NaN  
4                           Comedy        NaN  
...                            ...        ...  
3878                       

Excluindo a coluna 'Unnamed: 3'

In [10]:
# essa coluna foi gerada porque havia um ';' a mais no arquivo csv, que fazia o programa associar a uma coluna nova
movies = movies.drop('Unnamed: 3', axis = 1, errors = 'ignore')

print(movies)

      MovieID                               Title  \
0           1                    Toy Story (1995)   
1           2                      Jumanji (1995)   
2           3             Grumpier Old Men (1995)   
3           4            Waiting to Exhale (1995)   
4           5  Father of the Bride Part II (1995)   
...       ...                                 ...   
3878     3948             Meet the Parents (2000)   
3879     3949          Requiem for a Dream (2000)   
3880     3950                    Tigerland (2000)   
3881     3951             Two Family House (2000)   
3882     3952               Contender, The (2000)   

                            Genres  
0      Animation|Children's|Comedy  
1     Adventure|Children's|Fantasy  
2                   Comedy|Romance  
3                     Comedy|Drama  
4                           Comedy  
...                            ...  
3878                        Comedy  
3879                         Drama  
3880                         D

Separando title e year:

In [11]:
movies[['title', 'year', 'ph1', 'ph2']] = movies['Title'].str.split('(', expand=True)

movies = movies.drop(['Title', 'ph1', 'ph2'], axis = 1, errors='ignore')

movies['year'] = movies['year'].str.replace(')','') # remove o caractere ')' que sobra

print(movies)

      MovieID                        Genres                         title  \
0           1   Animation|Children's|Comedy                    Toy Story    
1           2  Adventure|Children's|Fantasy                      Jumanji    
2           3                Comedy|Romance             Grumpier Old Men    
3           4                  Comedy|Drama            Waiting to Exhale    
4           5                        Comedy  Father of the Bride Part II    
...       ...                           ...                           ...   
3878     3948                        Comedy             Meet the Parents    
3879     3949                         Drama          Requiem for a Dream    
3880     3950                         Drama                    Tigerland    
3881     3951                         Drama             Two Family House    
3882     3952                Drama|Thriller               Contender, The    

      year  
0     1995  
1     1995  
2     1995  
3     1995  
4     1995

  movies['year'] = movies['year'].str.replace(')','') # remove o caractere ')' que sobra


Agora, vamos atribuir a cada combinação de gêneros um valor. Atribuiremos potências de dois, pois a soma de 3 potências diferentes 2^n nunca pode ser igual.

In [12]:
# # PH1, PH2 e PH3 são Placeholders 
# movies[['Genre 1','Genre 2','Genre 3','PH1','PH2','PH3']] = movies['Genres'].str.split('|', expand = True)

# movies = movies.drop(['Genres','PH1','PH2','PH3'], axis=1, errors='ignore')

movies['Genres'] = movies['Genres'].apply(lambda x: x.split('|'))
movies['Genres'] = movies['Genres'].apply(lambda x: genre_code(x))

print(movies)


      MovieID  Genres                         title  year
0           1      28                    Toy Story   1995
1           2     266                      Jumanji   1995
2           3    8208             Grumpier Old Men   1995
3           4     144            Waiting to Exhale   1995
4           5      16  Father of the Bride Part II   1995
...       ...     ...                           ...   ...
3878     3948      16             Meet the Parents   2000
3879     3949     128          Requiem for a Dream   2000
3880     3950     128                    Tigerland   2000
3881     3951     128             Two Family House   2000
3882     3952   32896               Contender, The   2000

[3883 rows x 4 columns]


Tratando o dataframe: ratings

In [13]:
# printando o dataframe ratings
print(ratings)

         UserID  MovieID  Rating  Timestamp
0             1     1193       5  978300760
1             1      661       3  978302109
2             1      914       3  978301968
3             1     3408       4  978300275
4             1     2355       5  978824291
...         ...      ...     ...        ...
1000204    6040     1091       1  956716541
1000205    6040     1094       5  956704887
1000206    6040      562       5  956704746
1000207    6040     1096       4  956715648
1000208    6040     1097       4  956715569

[1000209 rows x 4 columns]


Retirando o dado Timestamp, que não é útil para nossa análise

In [14]:
ratings = ratings.drop('Timestamp', axis=1, errors='ignore')

print(ratings)

         UserID  MovieID  Rating
0             1     1193       5
1             1      661       3
2             1      914       3
3             1     3408       4
4             1     2355       5
...         ...      ...     ...
1000204    6040     1091       1
1000205    6040     1094       5
1000206    6040      562       5
1000207    6040     1096       4
1000208    6040     1097       4

[1000209 rows x 3 columns]


Agora, vamos fundir os dataframes 'ratings' e 'users'

In [15]:
ru = pd.merge(users, ratings, on=['UserID'])    # ru de 'ratings' e 'users'
print(ru)

         UserID Gender  Occupation  age  MovieID  Rating
0             1      F          10    1     1193       5
1             1      F          10    1      661       3
2             1      F          10    1      914       3
3             1      F          10    1     3408       4
4             1      F          10    1     2355       5
...         ...    ...         ...  ...      ...     ...
1000204    6040      M           6   25     1091       1
1000205    6040      M           6   25     1094       5
1000206    6040      M           6   25      562       5
1000207    6040      M           6   25     1096       4
1000208    6040      M           6   25     1097       4

[1000209 rows x 6 columns]


Fundindo 'ru' e 'movies':

In [16]:
rum = pd.merge(ru, movies, on=['MovieID'])

rum = rum.drop('UserID', axis = 1, errors = 'ignore')

print(rum)

        Gender  Occupation  age  MovieID  Rating  Genres  \
0            F          10    1     1193       5     128   
1            M          16   56     1193       5     128   
2            M          12   25     1193       4     128   
3            M           7   25     1193       4     128   
4            M           1   50     1193       5     128   
...        ...         ...  ...      ...     ...     ...   
1000204      M          17   18     2198       5      64   
1000205      M          14   35     2703       3     128   
1000206      M          17   18     2845       1     128   
1000207      F          20   18     3607       5  131216   
1000208      M           1   25     2909       4      64   

                                         title  year  
0             One Flew Over the Cuckoo's Nest   1975  
1             One Flew Over the Cuckoo's Nest   1975  
2             One Flew Over the Cuckoo's Nest   1975  
3             One Flew Over the Cuckoo's Nest   1975  
4   

In [18]:
tree = id3(rum.head(100), 'Rating')
pprint.pprint(tree)

[5 4 3 2 1]


  total_class_entr = - (total_class_count/total_row)*np.log2(total_class_count/total_row) #entropy of the class
  total_class_entr = - (total_class_count/total_row)*np.log2(total_class_count/total_row) #entropy of the class


KeyError: None