In [None]:
# Data Mining and Machine Learning Techniques
# K-Nearest Neighbors - concepts (KNN)
# dimensionality reduction
# Principal Component Analysis (PCA)
#
# We'll then walk through the concept of data warehousing
# and compare the ELT process with the ETL process
# Reinforcement Learning
#
# ----------------------------------------------------------
# The concept of k-nearest neighbors
# Implementation of KNN to predict the rating of a movie
# Dimensionality reduction and principal component analysis
# Example of PCA with the Iris dataset
# Data warehousing and ETL versus ELT
# What is reinforcement learning
# The working behind the intelligent Pac-Man game
# Some fancy words used for reinforcement learning


#
# p. 242 (p. 257 of 415)
# KNN.ipynb

In [1]:
import pandas as pd

# u.data is read as tab-separated text. Column names are supplied here rather
# than taken from the file, and only the first three columns are loaded.
r_cols = ['user_id', 'movie_id', 'rating']
ratings = pd.read_csv(
    'C:/Users/pcpow/OneDrive/Desktop/DataScience_Udemy_20230321/u.data',
    sep='\t',
    names=r_cols,
    usecols=[0, 1, 2],
)
ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,0,50,5
1,0,172,5
2,0,133,1
3,196,242,3
4,186,302,3


In [3]:
import numpy as np

# For every movie: how many ratings it received ('size') and their average ('mean').
# The aggregations are given by name instead of as NumPy callables: recent
# pandas emits a FutureWarning when np.mean is passed to agg(), and np.size is
# evaluated as a per-group Python call. The resulting column labels
# ('rating', 'size') and ('rating', 'mean') are the same as before.
movieProperties = ratings.groupby('movie_id').agg({'rating': ['size', 'mean']})
movieProperties.head()

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
movie_id,Unnamed: 1_level_2,Unnamed: 2_level_2
1,452,3.878319
2,131,3.206107
3,90,3.033333
4,209,3.550239
5,86,3.302326


In [4]:
# Dump the ratings table to an Excel workbook; index=False leaves the
# DataFrame index out of the sheet.
ratings.to_excel(
    r'C:\Users\pcpow\OneDrive\Desktop\Ratings.xlsx',
    index=False,
)

In [6]:
# Popularity score per movie: the rating count min-max scaled into [0, 1]
# (the least-rated movie maps to 0, the most-rated movie maps to 1).
movieNumRatings = movieProperties['rating']['size'].to_frame()
movieNormalizedNumRatings = (
    (movieNumRatings - movieNumRatings.min())
    / (movieNumRatings.max() - movieNumRatings.min())
)
movieNormalizedNumRatings.head()

Unnamed: 0_level_0,size
movie_id,Unnamed: 1_level_1
1,0.773585
2,0.222985
3,0.152659
4,0.356775
5,0.145798


In [19]:
# Build movieDict: movie_id -> (title, genre flag array, normalized rating count, mean rating).
movieDict = {}
# The encoding is passed explicitly (this notebook's other references to u.item
# use ISO-8859-1) so the load does not depend on the platform's default text
# encoding.
with open(r'C:/Users/pcpow/OneDrive/Desktop/DataScience_Udemy_20230321/u.item', encoding='ISO-8859-1') as f:
    for line in f:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        fields = line.rstrip('\n').split('|')
        movieID = int(fields[0])
        name = fields[1]
        # Fields from index 5 onward are parsed as integer genre flags
        # (presumably 0/1 indicators - confirm against the dataset README).
        genres = np.array([int(flag) for flag in fields[5:25]])
        movieDict[movieID] = (
            name,
            genres,
            movieNormalizedNumRatings.loc[movieID].get('size'),
            movieProperties.loc[movieID].rating.get('mean'),
        )

In [11]:
# m_cols = ['movie_id', 'title', 'genre']
# movieNamesRatings = pd.read_csv('C:/Users/pcpow/OneDrive/Desktop/DataScience_Udemy_20230321/u.item', sep='|', names=m_cols, usecols=range(3), encoding="ISO-8859-1")
# movieNamesRatings.to_excel(r'C:\Users\pcpow\OneDrive\Desktop\movieNamesRatings.xlsx', index=False)

In [24]:
# Inspect the entry for movie 1: (title, genre flag array, normalized rating count, mean rating).
movieDict[1]

('Toy Story (1995)',
 array([0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 0.7735849056603774,
 3.8783185840707963)

In [26]:
from scipy import spatial

def ComputeDistance(a, b):
    """Return a dissimilarity score between two movie entries.

    Both arguments are indexable movie records in which position 1 holds the
    genre vector and position 2 holds the normalized popularity value.

    The score is the cosine distance between the two genre vectors plus the
    absolute difference of the two popularity values; a larger score means
    the movies are less alike.
    """
    genre_gap = spatial.distance.cosine(a[1], b[1])
    popularity_gap = abs(a[2] - b[2])
    return genre_gap + popularity_gap

# Example: distance between movie 2 and movie 4.
ComputeDistance(movieDict[2], movieDict[4])


0.8004574042309892

In [27]:
# Remember, a larger distance means the movies are less similar. The genre
# (cosine) term and the popularity term each contribute 0 to 1 for these 0/1
# genre flags and normalized popularities, so the combined distance ranges
# from 0 to 2.
print(movieDict[2])
print(movieDict[4])

('GoldenEye (1995)', array([0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]), 0.22298456260720412, 3.2061068702290076)
('Get Shorty (1995)', array([0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 0.3567753001715266, 3.550239234449761)


In [32]:
import operator

def getNeighbors(movieID, K):
    """Return the IDs of the K movies closest to `movieID`.

    Every other movie in `movieDict` is scored with `ComputeDistance` and the
    candidates are ordered by ascending distance (ties keep dictionary order).

    Parameters:
        movieID: key of the reference movie in `movieDict`.
        K: number of neighbors wanted.

    Returns:
        A list of movie IDs, nearest first. If K exceeds the number of other
        movies, all of them are returned instead of raising IndexError; a K
        of zero or less yields an empty list.

    Raises:
        KeyError: if `movieID` is not present in `movieDict`.
    """
    # Looked up once: the reference entry is the same for every comparison.
    target = movieDict[movieID]
    distances = [
        (movie, ComputeDistance(target, movieDict[movie]))
        for movie in movieDict
        if movie != movieID
    ]
    distances.sort(key=operator.itemgetter(1))
    # Slicing (unlike indexing over range(K)) tolerates K > len(distances);
    # max() keeps a negative K from being read as "all but the last |K|".
    return [movie for movie, _ in distances[:max(K, 0)]]

#=========================================================================

K = 10
neighbors = getNeighbors(1, K)

# List every neighbor of movie 1 as "<title> <mean rating>".
for neighbor in neighbors:
    print (" ".join([movieDict[neighbor][0], str(movieDict[neighbor][3])]))

# Predicted rating for movie 1: the neighbors' mean ratings summed, divided by K.
avgRating = sum(movieDict[neighbor][3] for neighbor in neighbors) / float(K)
    
    
    
    
    

Liar Liar (1997) 3.156701030927835
Aladdin (1992) 3.8127853881278537
Willy Wonka and the Chocolate Factory (1971) 3.6319018404907975
Monty Python and the Holy Grail (1974) 4.0664556962025316
Full Monty, The (1997) 3.926984126984127
George of the Jungle (1997) 2.685185185185185
Beavis and Butt-head Do America (1996) 2.7884615384615383
Birdcage, The (1996) 3.4436860068259385
Home Alone (1990) 3.0875912408759123
Aladdin and the King of Thieves (1996) 2.8461538461538463


In [34]:
# Display the average of the neighbors' mean ratings computed by the last KNN cell that ran.
avgRating

3.3445905900235564

In [35]:
# Display the neighbor count currently held in K.
K

10

In [40]:
# https://www.tutorialspoint.com/python/assignment_operators_example.htm

# Scratch check of the in-place division operator: `c /= a` rebinds c to c / a,
# which is a float in Python 3 even when both operands are ints.
a, b = 21, 10
c = 1092

c /= a
"Line 4 - Value of c is ", c

('Line 4 - Value of c is ', 52.0)

In [41]:
K = 5
neighbors = getNeighbors(1, K)

# List every neighbor of movie 1 as "<title> <mean rating>".
for neighbor in neighbors:
    print (" ".join([movieDict[neighbor][0], str(movieDict[neighbor][3])]))

# Predicted rating for movie 1: the neighbors' mean ratings summed, divided by K.
avgRating = sum(movieDict[neighbor][3] for neighbor in neighbors) / float(K)

Liar Liar (1997) 3.156701030927835
Aladdin (1992) 3.8127853881278537
Willy Wonka and the Chocolate Factory (1971) 3.6319018404907975
Monty Python and the Holy Grail (1974) 4.0664556962025316
Full Monty, The (1997) 3.926984126984127


In [42]:
# Display the average of the neighbors' mean ratings computed by the last KNN cell that ran.
avgRating

3.7189656165466287

In [43]:
K = 20
neighbors = getNeighbors(1, K)

# List every neighbor of movie 1 as "<title> <mean rating>".
for neighbor in neighbors:
    print (" ".join([movieDict[neighbor][0], str(movieDict[neighbor][3])]))

# Predicted rating for movie 1: the neighbors' mean ratings summed, divided by K.
avgRating = sum(movieDict[neighbor][3] for neighbor in neighbors) / float(K)

Liar Liar (1997) 3.156701030927835
Aladdin (1992) 3.8127853881278537
Willy Wonka and the Chocolate Factory (1971) 3.6319018404907975
Monty Python and the Holy Grail (1974) 4.0664556962025316
Full Monty, The (1997) 3.926984126984127
George of the Jungle (1997) 2.685185185185185
Beavis and Butt-head Do America (1996) 2.7884615384615383
Birdcage, The (1996) 3.4436860068259385
Home Alone (1990) 3.0875912408759123
Aladdin and the King of Thieves (1996) 2.8461538461538463
Lion King, The (1994) 3.7818181818181817
Jungle2Jungle (1997) 2.4393939393939394
Babe (1995) 3.9954337899543377
Wrong Trousers, The (1993) 4.466101694915254
Raising Arizona (1987) 3.875
Beauty and the Beast (1991) 3.792079207920792
Back to the Future (1985) 3.834285714285714
101 Dalmatians (1996) 2.908256880733945
Fish Called Wanda, A (1988) 3.785425101214575
Pinocchio (1940) 3.6732673267326734


In [44]:
# Display the average of the neighbors' mean ratings computed by the last KNN cell that ran.
avgRating

3.4998483868602484