In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import skew
import numpy as np

# Load and check the data

In [2]:
# Read the movie dataset; the path is relative to the working directory.
csv_path = "IMDB-Movie-Data.csv"
movie = pd.read_csv(csv_path)
movie

FileNotFoundError: [Errno 2] No such file or directory: 'IMDB-Movie-Data.csv'

In [None]:
# Preview the first ten rows of the frame.
movie.head(10)

In [None]:
# (number of rows, number of columns)
movie.shape

In [None]:
# Summary of the columns: dtypes and non-null counts.
movie.info()

# 1. Check for missing values

In [None]:
# True as soon as a single cell anywhere in the frame is missing.
has_missing = movie.isnull().values.any()
print('Any missing value==> ', has_missing)

In [None]:
# Number of missing values in each column.
movie.isnull().sum()

In [None]:
# Heatmap of the boolean missing-value mask (rows x columns).
sns.heatmap(movie.isnull())

In [None]:
# Share of missing values per column, expressed in percent.
missing_counts = movie.isnull().sum()
row_count = len(movie)
per_missing = missing_counts * 100 / row_count
per_missing

# 2. Drop missing values

In [None]:
# `dropna` returns a new frame, so the result has to be assigned back;
# calling it on its own left `movie` untouched. The index is reset so the
# remaining rows are labelled 0..n-1 again, which later cells rely on when
# they line rows up with freshly built arrays and positional indices.
movie = movie.dropna(axis=0).reset_index(drop=True)
movie

# 3. Check for duplicate data

In [None]:
# True if at least one row is an exact duplicate of an earlier row.
has_duplicates = movie.duplicated().any()
print('Are there any duplicate values?', has_duplicates)

In [None]:
# `drop_duplicates` returns a new frame; without the assignment the call
# had no effect on `movie`. Reset the index so rows stay labelled 0..n-1.
movie = movie.drop_duplicates().reset_index(drop=True)
movie

# 4. Plot boxplots of the numeric columns

In [None]:
# One boxplot per column on a 2x2 grid; each panel is titled with its column.
fig, axes = plt.subplots(2, 2, figsize=(10, 10))
axes = axes.flatten()
panels = [
    ('Rating', 'blue'),
    ('Votes', 'green'),
    ('Revenue (Millions)', 'red'),
    ('Metascore', 'purple'),
]
for panel_ax, (panel_column, panel_colour) in zip(axes, panels):
    sns.boxplot(data=movie, x=panel_column, color=panel_colour, ax=panel_ax)
    panel_ax.set_title(panel_column)

# 5. Replace outliers

In [None]:
# Cap outliers with the 1.5 * IQR rule: values beyond the limits are replaced
# by the nearest limit instead of being dropped.
for column in ['Rating', 'Votes']:
    q1 = movie[column].quantile(0.25)
    q3 = movie[column].quantile(0.75)
    iqr = q3 - q1
    max_limit = q3 + 1.5 * iqr
    min_limit = q1 - 1.5 * iqr
    # `clip` works on the Series itself and keeps its index. The previous
    # version built a new DataFrame with a default 0..n-1 index and assigned
    # it to the column, which aligns on index labels and silently produces
    # NaN whenever `movie` does not have that default index.
    movie[column] = movie[column].clip(lower=min_limit, upper=max_limit)
        

In [None]:
# Boxplots on a 2x2 grid; panel index 2 gets no plot in this cell, so it
# renders as an empty axes.
fig, axes = plt.subplots(2, 2, figsize=(10, 10))
axes = axes.flatten()
panels = [
    (0, 'Rating', 'blue'),
    (1, 'Votes', 'green'),
    (3, 'Metascore', 'purple'),
]
for slot, panel_column, panel_colour in panels:
    sns.boxplot(data=movie, x=panel_column, color=panel_colour, ax=axes[slot])
    axes[slot].set_title(panel_column)

# 6. Get overall statistics about the data

In [None]:
# Summary statistics for every column, numeric and non-numeric alike.
movie.describe(include='all')

# 7. Which year had the highest average number of votes?

In [None]:
# Average votes per year, highest first.
mean_votes_by_year = movie.groupby('Year')['Votes'].mean()
mean_votes_by_year.sort_values(ascending=False)

In [None]:
# Bar chart of votes by year, titled through the axes seaborn returns.
votes_ax = sns.barplot(data=movie, x='Year', y='Votes')
votes_ax.set_title("Votes By Year")
plt.show()

# 8. Which year had the highest average revenue?

In [None]:
# Average revenue per year, highest first.
mean_revenue_by_year = movie.groupby('Year')['Revenue (Millions)'].mean()
mean_revenue_by_year.sort_values(ascending=False)

In [None]:
# Bar chart of revenue by year, titled through the axes seaborn returns.
revenue_ax = sns.barplot(data=movie, x='Year', y='Revenue (Millions)')
revenue_ax.set_title("Revenue By Year")
plt.show()

# 9. Find the average rating for each Title

In [None]:
# List the column labels available for grouping.
movie.columns

In [None]:
# Average rating per title, best rated first.
mean_rating_by_title = movie.groupby('Title')['Rating'].mean()
mean_rating_by_title.sort_values(ascending=False)

# 10. Display top 10 movies title and rating

In [None]:
# Ten highest-rated rows, reduced to title and rating and indexed by title.
top10_rating = (
    movie.nlargest(10, 'Rating')
    .loc[:, ['Title', 'Rating']]
    .set_index('Title')
)
top10_rating

In [None]:
# Horizontal bars: titles (the frame's index) on the y axis, rating on the x axis.
sns.barplot(x='Rating',y=top10_rating.index,data=top10_rating)

# 11. Count the number of movies per year

In [None]:
# How many rows (movies) fall in each year.
movie['Year'].value_counts()

In [None]:
# Count of movies per year, titled through the axes seaborn returns.
count_ax = sns.countplot(data=movie, x='Year')
count_ax.set_title("The Number of movies for year")
plt.show()

# 12. Find the most popular movie title (highest revenue)

In [None]:
# Title(s) of the row(s) whose revenue equals the column maximum.
revenue = movie['Revenue (Millions)']
movie.loc[revenue == revenue.max(), 'Title']

# 13. Display the 10 highest-rated movie titles and their directors

In [None]:
# Ten highest-rated rows with director and vote count, indexed by title.
top10_title = (
    movie.nlargest(10, 'Rating')
    .loc[:, ['Title', 'Rating', 'Director', 'Votes']]
    .set_index('Title')
)
top10_title

In [None]:
# One bar per title, coloured by director; the legend is anchored just
# outside the right edge of the axes.
director_ax = sns.barplot(
    data=top10_title,
    x='Rating',
    y=top10_title.index,
    hue='Director',
    dodge=False,
)
director_ax.legend(bbox_to_anchor=(1.05, 1), loc=2)

# 14. Find the highest 10 revenue movies

In [None]:
# Ten highest-revenue rows with director and year, indexed by title.
top10_revenue = (
    movie.nlargest(10, 'Revenue (Millions)')
    .loc[:, ['Title', 'Revenue (Millions)', 'Director', 'Year']]
    .set_index('Title')
)
top10_revenue

In [None]:
# One bar per title, coloured by director; the legend is anchored just
# outside the right edge of the axes.
top_revenue_ax = sns.barplot(
    data=top10_revenue,
    x='Revenue (Millions)',
    y=top10_revenue.index,
    hue='Director',
    dodge=False,
)
top_revenue_ax.legend(bbox_to_anchor=(1.05, 1), loc=2)
top_revenue_ax.set_title("The highest revenue movies")
plt.show()

# 15. Classify movies by rating (Excellent, Good, Average)

In [None]:
def rating(rating):
    """Translate a numeric rating into a category label.

    Returns "Excellent" for ratings of 7.0 and above, "Good" for ratings of
    6.0 and above, and "Average" for anything else (including values that
    compare false against both thresholds, such as NaN).
    """
    # Thresholds are checked from highest to lowest; the first one met wins.
    for threshold, label in ((7.0, "Excellent"), (6.0, "Good")):
        if rating >= threshold:
            return label
    return "Average"

In [None]:
# Label every row by applying `rating` to its Rating value.
movie['rating_cat'] = movie['Rating'].apply(rating)

In [None]:
# Display the frame, now including the `rating_cat` column.
movie

# 16. Univariate Analysis

In [None]:
# Histogram of ratings with a kernel density estimate overlaid.
rating_ax = sns.histplot(data=movie['Rating'], kde=True)
rating_ax.set_title("Distribution Rating of movies")
plt.show()

In [None]:
# Sample skewness of the Rating column.
skew(movie["Rating"])

In [None]:
# Histogram of revenue with a kernel density estimate overlaid.
revenue_hist_ax = sns.histplot(data=movie['Revenue (Millions)'], kde=True)
revenue_hist_ax.set_title("Distribution Revenue (Millions) of movies")
plt.show()

In [None]:
# `skew` propagates NaN by default, so a single missing revenue would turn
# the whole result into nan. Drop missing values first so the statistic is
# computed over the revenues that are actually present.
skew(movie["Revenue (Millions)"].dropna())

In [None]:
# Histogram of votes with a kernel density estimate overlaid.
votes_hist_ax = sns.histplot(data=movie['Votes'], kde=True)
votes_hist_ax.set_title("Distribution Votes of movies")
plt.show()

In [None]:
# Sample skewness of the Votes column.
skew(movie["Votes"])

# 17. Find correlation of dataset

In [None]:
# Pairwise correlations of the numeric columns, rounded to two decimals
# and annotated on the heatmap.
corr_matrix = movie.corr(numeric_only=True).round(2)
corr_ax = sns.heatmap(corr_matrix, annot=True)
corr_ax.set_title("Multivariate analysis")
plt.show()

# 18. Apply machine learning to recommend movie titles to customers

In [None]:
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# List the column labels to choose the recommender's input features from.
movie.columns

In [None]:
# Text columns that are combined to describe each movie for the recommender.
select_feature = ['Genre','Description','Director','Actors']  
print(select_feature)

In [None]:
# Replace missing values in the selected text columns with empty strings,
# handling all of the columns in a single assignment.
movie[select_feature] = movie[select_feature].fillna('')

In [None]:
# Concatenate the 4 selected text columns into one space-separated string per movie.
combine_feature = movie['Genre'] +' ' +movie['Description'] +' ' +movie['Director'] +' ' +movie['Actors']
print(combine_feature)

In [None]:
# Convert the combined text into TF-IDF feature vectors: one row per movie,
# in the same order as `combine_feature`.
vectorizer = TfidfVectorizer()
feature_vectors = vectorizer.fit_transform(combine_feature)
print(feature_vectors)

# Cosine Similarity

In [None]:
# Pairwise cosine similarity between all movie vectors; entry [i, j] is the
# similarity between the movies at row positions i and j.
similarity = cosine_similarity(feature_vectors)
print(similarity)
print(similarity.shape)

# Getting the movie title from the user

In [None]:
# Ask for a (possibly misspelled) title; it is fuzzy-matched in a later cell.
movie_title = input("Please Input a title of movie: ")

In [None]:
# Build a plain Python list of every title in the dataset; it is the
# candidate pool for the fuzzy title match.
list_of_all_titles = movie['Title'].tolist()
print(list_of_all_titles)

In [None]:
# Find the dataset titles that most closely resemble what the user typed.
find_close_match = difflib.get_close_matches(movie_title, list_of_all_titles)
print(find_close_match)
# `get_close_matches` returns an empty list when nothing is similar enough;
# fail with a readable message instead of a bare IndexError on `[0]`.
if not find_close_match:
    raise ValueError(
        f"No movie title close to {movie_title!r} was found; try another spelling."
    )
close_match = find_close_match[0]
print(close_match)

In [None]:
# Find the row position of the matched title. `similarity` is indexed by row
# position (0-based), so the position is looked up directly instead of
# reading the `Rank` column, whose values are not row positions
# (presumably a 1-based ranking - TODO confirm against the CSV).
title_matches = (movie['Title'] == close_match).to_numpy()
index_of_the_movie = int(np.flatnonzero(title_matches)[0])
print(index_of_the_movie)

In [None]:
# Pair every movie's row position with its similarity to the chosen movie:
# a list of (position, score) tuples.
similarity_score = list(enumerate(similarity[index_of_the_movie]))
print(similarity_score)

In [None]:
# Order the (position, score) pairs by score, most similar first.
sorted_similar_movies = sorted(similarity_score, key = lambda x:x[1], reverse = True) 
print(sorted_similar_movies)

# Print the names of similar movies based on the index


In [None]:
print('Movies suggested for you:\n')
# Show the ten best matches, the same count as the consolidated
# recommendation cell at the end of the notebook (the old `i >= 10` check
# stopped after nine).
for i, (index, _score) in enumerate(sorted_similar_movies[:10], start=1):
    # `index` is a row position produced by `enumerate`, so it must be
    # resolved with `.iloc`; `.loc` would treat it as an index label.
    title_from_index = movie['Title'].iloc[index]
    print(f"{i}. {title_from_index}")


# Movie recommendation system 

In [None]:
# End-to-end recommendation: read a title, fuzzy-match it against the
# dataset, then list the ten movies whose text features are most similar.
movie_title = input("Please Input a title of movie: ")
list_of_all_titles = movie['Title'].tolist()
find_close_match = difflib.get_close_matches(movie_title, list_of_all_titles)
# An empty result means nothing was similar enough; fail with a readable
# message instead of a bare IndexError on `[0]`.
if not find_close_match:
    raise ValueError(
        f"No movie title close to {movie_title!r} was found; try another spelling."
    )
close_match = find_close_match[0]

# `similarity` is indexed by row position (0-based), so look the position up
# directly rather than reading the `Rank` column, whose values are not row
# positions (presumably a 1-based ranking - TODO confirm against the CSV).
title_matches = (movie['Title'] == close_match).to_numpy()
index_of_the_movie = int(np.flatnonzero(title_matches)[0])

similarity_score = list(enumerate(similarity[index_of_the_movie]))
sorted_similar_movies = sorted(similarity_score, key=lambda x: x[1], reverse=True)

print('Movies suggested for you:\n')

i = 1
for index, _score in sorted_similar_movies:
    # The chosen movie is always its own best match; do not recommend it.
    if index == index_of_the_movie:
        continue
    # `index` is a row position, so resolve it with `.iloc`, not `.loc`.
    title_from_index = movie['Title'].iloc[index]

    print(f"{i}. {title_from_index}")

    i += 1
    if i > 10:
        break