In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
import joblib

In [2]:
df = pd.read_csv('dataset2names.csv', low_memory = False, na_values = '-')

In [3]:
pd.set_option('display.max_columns', None)

In [4]:
print("Dataset Preview:")
df.head()

Dataset Preview:


Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Poster_Link    1000 non-null   object 
 1   Series_Title   1000 non-null   object 
 2   Released_Year  1000 non-null   object 
 3   Certificate    899 non-null    object 
 4   Runtime        1000 non-null   object 
 5   Genre          1000 non-null   object 
 6   IMDB_Rating    1000 non-null   float64
 7   Overview       1000 non-null   object 
 8   Meta_score     843 non-null    float64
 9   Director       1000 non-null   object 
 10  Star1          1000 non-null   object 
 11  Star2          1000 non-null   object 
 12  Star3          1000 non-null   object 
 13  Star4          1000 non-null   object 
 14  No_of_Votes    1000 non-null   int64  
 15  Gross          831 non-null    object 
dtypes: float64(2), int64(1), object(13)
memory usage: 125.1+ KB


In [6]:
imputer = SimpleImputer(strategy='most_frequent')
df['IMDB_Rating'] = imputer.fit_transform(df[['IMDB_Rating']])
df['Meta_score'] = imputer.fit_transform(df[['Meta_score']])
df['Gross'] = df['Gross'].str.replace(',', '').astype(float)
df['Gross'] = imputer.fit_transform(df[['Gross']])

In [7]:
l = []
l_less = []
for i in df.columns:
    if ((df[i].isnull().sum()<(0.3 *(df.shape[0])))):
        l.append(i)
    else:
        l_less.append(i)

In [8]:
df.head()

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469.0
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411.0
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444.0
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000.0
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000.0


In [9]:
encoder = LabelEncoder()
df['Genre'] = encoder.fit_transform(df['Genre'])
df['Certificate'] = encoder.fit_transform(df['Certificate'])
df['Director'] = encoder.fit_transform(df['Director'])
df['Star1'] = encoder.fit_transform(df['Star1'])
df['Star2'] = encoder.fit_transform(df['Star2'])
df['Star3'] = encoder.fit_transform(df['Star3'])
df['Star4'] = encoder.fit_transform(df['Star4'])

In [10]:
# Save the LabelEncoder instances for categorical features
joblib.dump(encoder, 'encoder.joblib')

['encoder.joblib']

In [11]:
df['Released_Year'] = pd.to_numeric(df['Released_Year'], errors='coerce')
df['Released_Year'] = imputer.fit_transform(df[['Released_Year']])

In [12]:
features = ['Genre', 'Released_Year', 'Certificate', 'Director', 'Star1', 'Star2', 'Star3', 'Star4', 'No_of_Votes']
target = 'IMDB_Rating'
X = df[features]
y = df[target]

In [13]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [14]:
# Save the scaler
joblib.dump(scaler, 'scaler.joblib')

['scaler.joblib']

In [15]:
X_train, X_temp, y_train, y_temp = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [16]:
knn = KNeighborsRegressor()

In [17]:
param_grid = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

In [18]:
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

In [19]:
best_knn = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

Best Parameters: {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'distance'}


In [20]:
# Save the best KNN model
joblib.dump(best_knn, 'best_knn_model.joblib')

['best_knn_model.joblib']

In [21]:
y_val_pred = best_knn.predict(X_val)
val_mse = mean_squared_error(y_val, y_val_pred)
print(f"Validation Mean Squared Error: {val_mse}")

Validation Mean Squared Error: 0.06286400134375261


In [22]:
y_test_pred = best_knn.predict(X_test)
test_mse = mean_squared_error(y_test, y_test_pred)
print(f"Test Mean Squared Error: {test_mse}")

Test Mean Squared Error: 0.05400013949411939


In [23]:
def recommend_movies(movie_title, knn_model, n_recommendations=10):
    # Find the index of the movie
    movie_idx = df[df['Series_Title'] == movie_title].index[0]

    # Get the feature vector of the selected movie
    movie_features = X_scaled[movie_idx].reshape(1, -1)

    # Find the K nearest neighbors (excluding the movie itself)
    distances, indices = knn_model.kneighbors(movie_features, n_neighbors=n_recommendations + 1)

    # Get the recommended movie indices (excluding the movie itself)
    recommended_indices = indices.flatten()[1:]

    # Create a DataFrame for recommended movies
    recommendations = df.iloc[recommended_indices][['Series_Title', 'IMDB_Rating']]

    return recommendations

In [24]:
# Get recommendations for a specific movie
selected_movie = 'The Shawshank Redemption'
recommended_movies = recommend_movies(selected_movie, best_knn, n_recommendations=10)
print(f"Movies similar to '{selected_movie}':")
print(recommended_movies)

Movies similar to 'The Shawshank Redemption':
                 Series_Title  IMDB_Rating
182     Judgment at Nuremberg          8.2
383                  Magnolia          8.0
342         Bohemian Rhapsody          8.0
615                     Taken          7.8
578  Kubo and the Two Strings          7.8
91       Miracle in cell NO.7          8.3
406        The Princess Bride          8.0
213                Inside Out          8.1
65           Taare Zameen Par          8.4
74             Apocalypse Now          8.4
