# Code Clause Internship task 1: Movie Recommendation system
* datset taken from:https://www.kaggle.com/datasets/parasharmanas/movie-recommendation-system/

# Import the libraries

In [23]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

# Load the datasets

In [2]:
# Load movie data
movies_df = pd.read_csv('movies.csv')

# Load rating data
ratings_df = pd.read_csv('ratings.csv')

* Some basic EDA (Exploratory data analysis)

In [4]:
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [6]:
movies_df.isna().count()

movieId    62423
title      62423
genres     62423
dtype: int64

In [7]:
ratings_df.isna().count()

userId       25000095
movieId      25000095
rating       25000095
timestamp    25000095
dtype: int64

# Make TF-IDF for the datsets

In [8]:
# Merge movie and rating data
merged_df = pd.merge(ratings_df, movies_df, on='movieId')

# Concatenate the 'title' and 'genres' columns to create a text representation of each movie
merged_df['text'] = merged_df['title'] + ' ' + merged_df['genres']

# Split the data into training and testing sets
train_df, test_df = train_test_split(merged_df, test_size=0.2, random_state=42)

# Create a TF-IDF vectorizer and fit it on the movie text data
vectorizer = TfidfVectorizer()
train_vectors = vectorizer.fit_transform(train_df['text'])
test_vectors = vectorizer.transform(test_df['text'])

In [11]:
train_df['rating'].head()

19120594    5.0
15253127    4.0
12584278    3.5
18034326    4.0
18073299    4.0
Name: rating, dtype: float64

# As ratings were in float form so we had to tuen them into integer datatype

In [12]:
# Convert ratings to integers
train_df['rating'] = train_df['rating'].round().astype(int)
test_df['rating'] = test_df['rating'].round().astype(int)

* Train on Logistic Regression

In [13]:
# Train a logistic regression model
logreg = LogisticRegression()
logreg.fit(train_vectors, train_df['rating'])

# Evaluate the model on the test set
accuracy = logreg.score(test_vectors, test_df['rating'])
print(f"Accuracy: {accuracy}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.49162293183285904


# Do Parameter Tunings to maybe try to get better accuracy

In [16]:
# Define the parameter grid for cross-validation
param_grid = {'C': [0.1, 1, 10,100]}


# Perform grid search with cross-validation to find the best hyperparameters
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid_search.fit(train_vectors, train_df['rating'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [17]:
# Get the best model with the tuned hyperparameters
best_model = grid_search.best_estimator_

# Evaluate the best model on the test set
accuracy = best_model.score(test_vectors, test_df['rating'])
print(f"Accuracy: {accuracy}")

Accuracy: 0.4915385321535778


# Make a function fotr the movie recommendation system
* This includes:
* Get all movies not rated by the user.
* Concatenate the 'title' and 'genres' columns to create a text representation of each movie.
* Create TF-IDF vectors for the unrated movies.
* Predict ratings for the unrated movies using the model.
* Get the indices of the top N movies with the highest predicted ratings.
* Get the movie titles corresponding to the top movie indices.

In [21]:
# Function for movie recommendations using the best model
def recommend_movies(user_id, movies_df, model, vectorizer, N):
    # Get all movies not rated by the user
    user_movies = merged_df.loc[merged_df['userId'] == user_id, 'movieId']
    unrated_movies = movies_df[~movies_df['movieId'].isin(user_movies)]

    # Concatenate the 'title' and 'genres' columns to create a text representation of each movie
    unrated_movies['text'] = unrated_movies['title'] + ' ' + unrated_movies['genres']

    # Create TF-IDF vectors for the unrated movies
    unrated_vectors = vectorizer.transform(unrated_movies['text'])

    # Predict ratings for the unrated movies using the model
    ratings_pred = model.predict(unrated_vectors)

    # Get the indices of the top N movies with the highest predicted ratings
    top_movie_indices = ratings_pred.argsort()[::-1][:N]

    # Get the movie titles corresponding to the top movie indices
    top_movie_titles = unrated_movies['title'].iloc[top_movie_indices]

    return top_movie_titles

In [22]:
# Example usage of the recommend_movies function
user_id = 1
N = 5
recommendations = recommend_movies(user_id, movies_df, best_model, vectorizer, N)
print(recommendations)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unrated_movies['text'] = unrated_movies['title'] + ' ' + unrated_movies['genres']


314      Shawshank Redemption, The (1994)
25505          The Black Godfather (1974)
840                 Godfather, The (1972)
62422      Women of Devil's Island (1962)
21526                  Bright Road (1953)
Name: title, dtype: object
