In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.colors as colors

from sklearn.cluster import KMeans

In [2]:
# Movie catalogue, read from a CSV file given by relative path
movies_df = pd.read_csv(filepath_or_buffer='movies.csv')

# Ratings table, read from a CSV file given by relative path
rating_df = pd.read_csv(filepath_or_buffer='rating.csv')

In [3]:
# Preview the first rows of the ratings table.
rating_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [4]:
# Preview the first rows of the movies table.
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
# Combine ratings with the movie table by matching rows on movieId.
# how="inner" is the merge default, spelled out here for the reader.
df_ratings_movies = rating_df.merge(movies_df, how="inner", on="movieId")
df_ratings_movies.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,296,5.0,1147880044,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
1,3,296,5.0,1439474476,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
2,4,296,4.0,1573938898,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
3,5,296,4.0,830786155,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
4,7,296,4.0,835444730,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller


In [6]:
# Narrow the merged frame to the three columns used from here on.
df = df_ratings_movies.loc[:, ['userId', 'rating', 'genres']]
df.head()

Unnamed: 0,userId,rating,genres
0,1,5.0,Comedy|Crime|Drama|Thriller
1,3,5.0,Comedy|Crime|Drama|Thriller
2,4,4.0,Comedy|Crime|Drama|Thriller
3,5,4.0,Comedy|Crime|Drama|Thriller
4,7,4.0,Comedy|Crime|Drama|Thriller


In [7]:
# Distinct genre labels, in order of first appearance: split each
# '|'-separated genres string, flatten to one label per row, drop missing
# entries, then de-duplicate.
genres_list = (
    df['genres']
    .str.split('|')
    .explode()
    .dropna()
    .unique()
    .tolist()
)
# Value previously recorded for this dataset:
# genres_list = ['Comedy', 'Crime', 'Drama', 'Thriller', 'War', 'Musical', 'Romance', 'Adventure', 'Film-Noir', 'Sci-Fi', 'Western', 'Fantasy', 'Mystery', 'Children', 'Action', 'Documentary', 'Animation', 'Horror', 'IMAX', '(no genres listed)']

In [8]:
def generate_binary_genres(genre, source_df=None):
  """Build a one-column 0/1 indicator frame for a single genre.

  Parameters
  ----------
  genre : str
      Genre label to flag; also used as the name of the returned column.
  source_df : pandas.DataFrame, optional
      Frame whose third column (position 2) holds '|'-separated genre
      strings. When omitted, the notebook-level ``df`` is used.

  Returns
  -------
  pandas.DataFrame
      A single integer column named ``genre`` with the same index as the
      source frame: 1 where ``genre`` appears as a whole '|'-delimited token
      in the row's genre string, 0 otherwise (missing strings count as 0).
  """
  if source_df is None:
    source_df = df
  # Surround both the genre strings and the searched label with the delimiter
  # so that only complete tokens match (e.g. 'War' must not match 'Warfare').
  delimited_genres = '|' + source_df.iloc[:, 2] + '|'
  is_member = delimited_genres.str.contains('|' + genre + '|', regex=False, na=False)
  # Integer flags keep the column numeric instead of mixing '1' strings with 0.
  return is_member.astype(int).to_frame(name=genre)

# Build one indicator frame per genre, then attach them all to the base
# frame in a single concat. Joining inside the loop re-copied the growing
# frame once per genre.
genre_indicator_frames = [
  generate_binary_genres(genre_name) for genre_name in genres_list
]

# Every indicator frame shares df's index, so column-wise concat lines the
# rows up exactly as the repeated index joins did.
df_all_genres = pd.concat([df] + genre_indicator_frames, axis=1)

df_all_genres.head()

Unnamed: 0,userId,rating,genres,Comedy,Crime,Drama,Thriller,War,Musical,Romance,...,Western,Fantasy,Mystery,Children,Action,Documentary,Animation,Horror,IMAX,(no genres listed)
0,1,5.0,Comedy|Crime|Drama|Thriller,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3,5.0,Comedy|Crime|Drama|Thriller,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4,4.0,Comedy|Crime|Drama|Thriller,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,5,4.0,Comedy|Crime|Drama|Thriller,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,7,4.0,Comedy|Crime|Drama|Thriller,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# The genres are now split and one-hot encoded, so the original 'genres'
# column is removed; 'userId' is dropped as well, and any missing values are
# replaced with 0.
df_for_k_means = (
    df_all_genres
    .drop(['genres', 'userId'], axis=1)
    .fillna(0)
)
df_for_k_means.head()

Unnamed: 0,rating,Comedy,Crime,Drama,Thriller,War,Musical,Romance,Adventure,Film-Noir,...,Western,Fantasy,Mystery,Children,Action,Documentary,Animation,Horror,IMAX,(no genres listed)
0,5.0,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5.0,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4.0,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4.0,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4.0,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
def k_means_optimizer(data, maximum_iterations, random_state=None):
    """Plot KMeans inertia against the number of clusters (elbow plot).

    Fits one KMeans model for every cluster count in
    ``range(1, maximum_iterations)``, i.e. k = 1 .. maximum_iterations - 1
    (the upper bound is exclusive), and plots each model's ``inertia_``.

    Parameters
    ----------
    data : array-like
        Feature matrix passed unchanged to ``KMeans.fit``.
    maximum_iterations : int
        Exclusive upper bound on the number of clusters to try.
    random_state : int or None, optional
        Forwarded to ``KMeans`` so the curve can be reproduced. The default
        ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    cluster_counts = list(range(1, maximum_iterations))
    inertias = []

    for n_clusters in cluster_counts:
        model = KMeans(n_clusters=n_clusters, n_init="auto", random_state=random_state)
        model.fit(data)
        inertias.append(model.inertia_)

    # plt.subplots returns a (figure, axes) pair; draw on the axes explicitly.
    fig, ax = plt.subplots(figsize=(9, 6))
    ax.plot(cluster_counts, inertias, 'o-')
    ax.set_xlabel("Number of Clusters")
    # inertia_ is a sum of squared distances to the closest centre, not a mean.
    ax.set_ylabel("Inertia (sum of squared distances)")
    ax.grid(True)
    plt.show()

In [12]:
# Fit on the feature columns only. The labels are written back into
# df_for_k_means further down, so when this cell is re-run the frame already
# holds a 'KMeans' column; it is excluded here so that old labels are never
# used as an input feature.
kmeans_features = df_for_k_means.drop(columns=['KMeans'], errors='ignore')

# Create model for KMeans; the fixed random_state makes the labels repeatable
kmeans = KMeans(n_clusters=20, n_init="auto", random_state=42)

# Use dataset to fit the model
kmeans.fit(kmeans_features)

# Assign the cluster labels back to the input frame to clearly see the groups
df_for_k_means['KMeans'] = kmeans.labels_

df_for_k_means.head()

Unnamed: 0,rating,Comedy,Crime,Drama,Thriller,War,Musical,Romance,Adventure,Film-Noir,...,Fantasy,Mystery,Children,Action,Documentary,Animation,Horror,IMAX,(no genres listed),KMeans
0,5.0,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,14
1,5.0,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,14
2,4.0,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,14
3,4.0,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,14
4,4.0,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,14


In [13]:
# Attach the cluster labels to the userId/rating/genres frame. The column is
# selected by name, not by position, so the join does not depend on
# 'KMeans' being the last column of df_for_k_means.
k_means_df = df.join(df_for_k_means['KMeans'])
k_means_df.head()

Unnamed: 0,userId,rating,genres,KMeans
0,1,5.0,Comedy|Crime|Drama|Thriller,14
1,3,5.0,Comedy|Crime|Drama|Thriller,14
2,4,4.0,Comedy|Crime|Drama|Thriller,14
3,5,4.0,Comedy|Crime|Drama|Thriller,14
4,7,4.0,Comedy|Crime|Drama|Thriller,14


In [14]:
# Bring movieId back from the merged ratings/movies frame, aligned on the
# shared row index.
final_df = k_means_df.join(df_ratings_movies[["movieId"]])
final_df.head()

Unnamed: 0,userId,rating,genres,KMeans,movieId
0,1,5.0,Comedy|Crime|Drama|Thriller,14,296
1,3,5.0,Comedy|Crime|Drama|Thriller,14,296
2,4,4.0,Comedy|Crime|Drama|Thriller,14,296
3,5,4.0,Comedy|Crime|Drama|Thriller,14,296
4,7,4.0,Comedy|Crime|Drama|Thriller,14,296


In [15]:
# Put the columns in their export order.
final_df = final_df.loc[:, ['userId', 'movieId', 'rating', 'genres', 'KMeans']]
final_df.head()

Unnamed: 0,userId,movieId,rating,genres,KMeans
0,1,296,5.0,Comedy|Crime|Drama|Thriller,14
1,3,296,5.0,Comedy|Crime|Drama|Thriller,14
2,4,296,4.0,Comedy|Crime|Drama|Thriller,14
3,5,296,4.0,Comedy|Crime|Drama|Thriller,14
4,7,296,4.0,Comedy|Crime|Drama|Thriller,14


In [16]:
# Write the labelled ratings to disk, leaving out the row index.
final_df.to_csv(path_or_buf="k_means_ratings.csv", index=False)

In [17]:
# Display the final frame; the rendered output elides the middle rows.
final_df

Unnamed: 0,userId,movieId,rating,genres,KMeans
0,1,296,5.0,Comedy|Crime|Drama|Thriller,14
1,3,296,5.0,Comedy|Crime|Drama|Thriller,14
2,4,296,4.0,Comedy|Crime|Drama|Thriller,14
3,5,296,4.0,Comedy|Crime|Drama|Thriller,14
4,7,296,4.0,Comedy|Crime|Drama|Thriller,14
...,...,...,...,...,...
999995,6729,177617,4.0,Action|Thriller,2
999996,6730,77316,4.5,Drama|War,4
999997,6730,86002,5.0,Drama,4
999998,6730,99708,4.0,Drama,4
