# Movie Ratings Prediction using K-Nearest Neighbors (KNN)

In [2]:
#Import Necessary Libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

# Set visualizations
sns.set(style="whitegrid")

In [6]:
# Load datasets
df_links = pd.read_csv(r'C:\Users\Vishnu\Downloads\KNN\links.csv')
df_movies = pd.read_csv(r'C:\Users\Vishnu\Downloads\KNN\movies.csv')
df_ratings = pd.read_csv(r'C:\Users\Vishnu\Downloads\KNN\ratings.csv')
df_tags = pd.read_csv(r'C:\Users\Vishnu\Downloads\KNN\tags.csv')

# Display the first few rows of each dataframe
print("Links DataFrame:")
print(df_links.head())
print("\nMovies DataFrame:")
print(df_movies.head())
print("\nRatings DataFrame:")
print(df_ratings.head())
print("\nTags DataFrame:")
print(df_tags.head())

Links DataFrame:
   movieId  imdbId   tmdbId
0        1  114709    862.0
1        2  113497   8844.0
2        3  113228  15602.0
3        4  114885  31357.0
4        5  113041  11862.0

Movies DataFrame:
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  

Ratings DataFrame:
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  9

In [8]:
# Merge ratings with movies
df_merged = pd.merge(df_ratings, df_movies, on='movieId')

# Merge tags with the already merged DataFrame
df_merged = pd.merge(df_merged, df_tags, on=['userId', 'movieId'], how='left')

# Display the merged DataFrame
print("\nMerged DataFrame:")
print(df_merged.head())



Merged DataFrame:
   userId  movieId  rating  timestamp_x             title  \
0       1        1     4.0    964982703  Toy Story (1995)   
1       5        1     4.0    847434962  Toy Story (1995)   
2       7        1     4.5   1106635946  Toy Story (1995)   
3      15        1     2.5   1510577970  Toy Story (1995)   
4      17        1     4.5   1305696483  Toy Story (1995)   

                                        genres  tag  timestamp_y  
0  Adventure|Animation|Children|Comedy|Fantasy  NaN          NaN  
1  Adventure|Animation|Children|Comedy|Fantasy  NaN          NaN  
2  Adventure|Animation|Children|Comedy|Fantasy  NaN          NaN  
3  Adventure|Animation|Children|Comedy|Fantasy  NaN          NaN  
4  Adventure|Animation|Children|Comedy|Fantasy  NaN          NaN  


In [10]:
# Check for missing values
print("\nMissing Values:")
print(df_merged.isnull().sum())

# Fill missing tags with an empty string
df_merged['tag'].fillna('', inplace=True)

# Optional: Create a new column combining tags (if needed for KNN)
df_merged['tags_combined'] = df_merged.groupby('movieId')['tag'].transform(lambda x: ' '.join(x))

# Drop unnecessary columns
df_cleaned = df_merged[['userId', 'movieId', 'title', 'genres', 'rating', 'tags_combined']]

# Display the cleaned DataFrame
print("\nCleaned DataFrame:")
print(df_cleaned.head())



Missing Values:
userId             0
movieId            0
rating             0
timestamp_x        0
title              0
genres             0
tag            99201
timestamp_y    99201
dtype: int64

Cleaned DataFrame:
   userId  movieId             title  \
0       1        1  Toy Story (1995)   
1       5        1  Toy Story (1995)   
2       7        1  Toy Story (1995)   
3      15        1  Toy Story (1995)   
4      17        1  Toy Story (1995)   

                                        genres  rating  \
0  Adventure|Animation|Children|Comedy|Fantasy     4.0   
1  Adventure|Animation|Children|Comedy|Fantasy     4.0   
2  Adventure|Animation|Children|Comedy|Fantasy     4.5   
3  Adventure|Animation|Children|Comedy|Fantasy     2.5   
4  Adventure|Animation|Children|Comedy|Fantasy     4.5   

                                       tags_combined  
0                                                ...  
1                                                ...  
2                          

In [12]:
#feature engineering
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

# TF-IDF for tags
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df_cleaned['tags_combined'])

# One-hot encoding for genres
mlb = MultiLabelBinarizer()
genres_encoded = mlb.fit_transform(df_cleaned['genres'].str.split('|'))

# Combine the TF-IDF matrix and genres
import numpy as np
X = np.hstack((tfidf_matrix.toarray(), genres_encoded))

# Target variable (ratings)
y = df_cleaned['rating'].values

# Display the feature matrix and target
print("\nFeature Matrix Shape:", X.shape)
print("Target Variable Shape:", y.shape)



Feature Matrix Shape: (102677, 1728)
Target Variable Shape: (102677,)


In [14]:
#KNN Model Training
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize KNN model
knn_model = KNeighborsRegressor(n_neighbors=5)

# Fit the model
knn_model.fit(X_train, y_train)

# Predict on the test set
y_pred = knn_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print("\nMean Squared Error:", mse)



Mean Squared Error: 1.0908010323334634


In [16]:
# Sample movie information for prediction
new_movie_info = {
    'title': 'Sample Movie',
    'genres': 'Action|Adventure',
    'tags': 'exciting|thrilling|fun'
}

# Function to preprocess input data
def preprocess_input(movie_info):
    # Create a DataFrame
    input_df = pd.DataFrame([movie_info])
    
    # Process the genres
    input_df['genres'] = input_df['genres'].str.split('|')
    genres_encoded = mlb.transform(input_df['genres'])
    
    # Process the tags using TF-IDF
    input_df['tags_combined'] = input_df['tags']
    tfidf_input = tfidf_vectorizer.transform(input_df['tags_combined'])
    
    # Combine the TF-IDF matrix and one-hot encoded genres
    input_features = np.hstack((tfidf_input.toarray(), genres_encoded))
    
    return input_features

# Preprocess the new input
new_input_features = preprocess_input(new_movie_info)

# Predict the rating for the new movie
predicted_rating = knn_model.predict(new_input_features)

# Display the predicted rating
print("Predicted Rating:", predicted_rating[0])

Predicted Rating: 2.9
