## MOVIES RECOMMENDATION ENGINE

#### Load data in pandas dataframes and show tables

In [None]:
import pandas as pd

# Read the MovieLens metadata CSV from the local dataset folder.
movies = pd.read_csv(
    'D:\\Shubhi\\DataScience\\MoviesRecommendation\\MovieLens_Dataset\\movies_metadata.csv',
)

# Working DataFrame used by every later cell; built from the loaded table.
moviesDF = pd.DataFrame(data=movies)

# Preview the first five rows as the cell output.
moviesDF.head(n=5)


# Different Statistics

### 1. Count of movies grouped by their status

In [None]:
# Summarise how many movies the dataset holds, overall and per status.

# Total no. of movies in the dataset
print(f"Total no. of movies in the dataset: {moviesDF.shape[0]}")

# Helper column of ones that the grouping below (and later cells) count.
moviesDF['count'] = 1

# Number of movies in each status, largest group first. Only the helper
# column is selected before counting, so pandas does not tally every other
# column just to read a single one.
moviesDF.groupby('status')['count'].count().sort_values(ascending=False)

# Alternate and simple way to do it.
# moviesDF['status'].value_counts() # gives the count of different values


### Stat 1 Visualization through Barchart

In [None]:
# Visualize it through barchart
import plotly.express as px

# Keep only the rows whose status is known.
moviesDFNN = moviesDF.loc[moviesDF['status'].notna()]

# One coloured bar per status, built from the helper 'count' column.
fig = px.histogram(
    moviesDFNN,
    x='status',
    y='count',
    color='status',
    text_auto=True,
    title='Fig 1: Bar chart of movies status',
)

# Label the axes; this is the last expression, so its result is the cell output.
fig.update_layout(
    xaxis_title='Status of the movie',
    yaxis_title='Total no. of movies',
)



### Stat 2. Number of movies by status for each release year

In [None]:
# Distinct status values found among the movies that have a release date.
moviesDF.loc[moviesDF['release_date'].notna(), 'status'].unique()

In [None]:
# Showing the no. of movies per release year and status

# Dataset contains two date formats ('dd-mm-YYYY' and 'YYYY-mm-dd').
# Each pass parses one of them; values that do not match are coerced to NaT.
date1 = pd.to_datetime(moviesDF['release_date'], format='%d-%m-%Y', errors='coerce')
date2 = pd.to_datetime(moviesDF['release_date'], format='%Y-%m-%d', errors='coerce')
# Keep the first parse and fall back to the second wherever the first gave NaT.
moviesDF['release_date'] = date1.where(date1.notna(), date2)

# Year of release for the rows that have a date; rows without one receive NaN
# when the result is aligned back onto the full table.
moviesDF['release_year'] = (
    moviesDF.loc[moviesDF['release_date'].notna(), 'release_date']
    .dt.year
    .astype('int', errors='ignore')
)
moviesDF['count'] = 1  # helper column of ones, counted per group below

# Number of movies for every (release year, status) pair.
moviesDF.groupby(['release_year', 'status'])['count'].count()
# Per-year totals across all statuses:
# moviesDF.groupby('release_year')['count'].count().sort_values(ascending=False)



### Stat 2. Visualization through barchart and scatter plot

In [None]:
# Movies that have both a release year and a status.
moviesDF1 = moviesDF[moviesDF['release_year'].notnull() & moviesDF['status'].notnull()]

# Fig 2: movies per release year, one bar group per status.
fig_2 = px.histogram(moviesDF1, x='release_year', color='status', barmode='group', text_auto=True, title='Fig 2: Bar graph')
fig_2.show()

# Fig 3: the same rows as points coloured by status. hover_data is given as a
# list: a set is unordered, so the order of the extra hover fields was not
# fixed from one session to the next.
fig_3 = px.scatter(moviesDF1, x='release_year', color='status', hover_data=['genres', 'title'], title='Fig 3: Scatter Plot')
fig_3.show()

# Simple Recommender

In [None]:
from ast import literal_eval
# movies = pd.read_csv('D:\\Shubhi\\DataScience\\MoviesRecommendation\\MovieLens_Dataset\\movies_metadata.csv')
# moviesDF = pd.DataFrame(movies)

def _genre_names(raw):
    """Return the genre names stored in one cell of the ``genres`` column.

    ``raw`` is either the CSV text (a Python-literal list of dicts carrying a
    ``name`` key) or, when this cell is run a second time, the list of names
    produced by the first run. Anything that does not evaluate to a list
    gives ``[]``.
    """
    parsed = literal_eval(raw) if isinstance(raw, str) else raw
    if not isinstance(parsed, list):
        return []
    # Entries that are already plain names (second run) are passed through.
    return [entry['name'] if isinstance(entry, dict) else entry for entry in parsed]


# Converting genres columns to contain only names like [comedy,thriller].
# Missing values become the text '[]' first, so they end up as empty lists.
moviesDF['genres'] = moviesDF['genres'].fillna('[]').apply(_genre_names)

# Ignoring null values and converting valid entries as int
vote_counts = moviesDF['vote_count'].dropna().astype('int')
# NOTE(review): astype('int') drops the fractional part of every average
# before the mean is taken - confirm that this truncation is intended.
vote_averages = moviesDF['vote_average'].dropna().astype('int')
C = vote_averages.mean()
print(f'Average Vote : {C}')

# Calculating 95 percentile value to know how much votes should a movie get to come into our chart list
m = vote_counts.quantile(0.95)
print(f'A movie should get atleast {m} votes in order to come into top charts')


In [None]:
# Store the release year as whole numbers. Casting only the non-null rows to
# int and assigning them back re-aligns on the index, which re-introduces NaN
# and leaves the column as float; the nullable 'Int64' dtype keeps integers
# and still allows missing years.
moviesDF['release_year'] = moviesDF['release_year'].astype('Int64')

# Movies with at least m votes and a known average rating, restricted to the
# columns needed for the chart. .copy() makes the result an independent frame
# so the casts below do not raise pandas' SettingWithCopyWarning.
qualified = moviesDF.loc[
    (moviesDF['vote_count'] >= m)
    & moviesDF['vote_average'].notnull()
    & moviesDF['vote_count'].notnull(),
    ['title', 'release_year', 'vote_count', 'vote_average', 'popularity', 'genres'],
].copy()
qualified['vote_count'] = qualified['vote_count'].astype('int')
qualified['vote_average'] = qualified['vote_average'].astype('int')

# (rows, columns) of the qualified table, shown as the cell output.
qualified.shape

In [None]:
# Function to calculate weighted rating
def weighted_rating(x):
    """Return the weighted rating of one movie row.

    ``x`` must provide ``'vote_count'`` and ``'vote_average'``. The result
    blends the movie's own average with the notebook-level mean ``C``, using
    the notebook-level vote threshold ``m`` as the weight of the mean.
    """
    votes = x['vote_count']
    rating = x['vote_average']
    own_share = votes / (votes + m)
    mean_share = m / (m + votes)
    return (own_share * rating) + (mean_share * C)

In [None]:
# Score every qualified movie row by row with weighted_rating.
qualified['wr'] = qualified.apply(weighted_rating, axis=1)

# Best weighted rating first, then keep the leading 15 rows for plotting.
qualified = qualified.sort_values(by='wr', ascending=False)
qualified_15 = qualified.iloc[:15]

In [None]:
# Number of rows in the qualified table.
len(qualified)

In [None]:
# Fig 4: the 15 best movies by weighted rating. hover_data is given as a list
# (a set is unordered, so the hover field order was not fixed).
fig_4 = px.bar(qualified_15, x='title', y='wr', text_auto=True, hover_data=['vote_count', 'vote_average'], title='Fig 4: Top 15 movies sorted by weighted rating')
# Axis label spelling corrected ("Weigted" -> "Weighted").
fig_4.update_layout(xaxis_title='Movie', yaxis_title='Weighted Rating')
fig_4.show()

In [None]:
# Re-order moviesDF1 by number of votes, most voted first, and keep 15 rows.
moviesDF1 = moviesDF1.sort_values('vote_count', ascending=False)
moviesDF1_15 = moviesDF1.head(15)

# Fig 5: hover_data is given as a list (a set is unordered, so the hover
# field order was not fixed).
fig_5 = px.bar(moviesDF1_15, x='title', y='vote_count', text_auto=True, hover_data=['vote_count', 'vote_average'], title='Fig 5: Top 15 movies sorted by Vote Count')
fig_5.update_layout(xaxis_title='Movie', yaxis_title='Vote Counts')
fig_5.show()

In [None]:
# Re-order moviesDF1 by average rating, ties broken by number of votes
# (both descending), and keep 15 rows.
moviesDF1 = moviesDF1.sort_values(['vote_average', 'vote_count'], ascending=False)
moviesDF1_15 = moviesDF1.head(15)

# Fig 6: hover_data is given as a list (a set is unordered, so the hover
# field order was not fixed).
fig_6 = px.bar(moviesDF1_15, x='title', y='vote_average', text_auto=True, hover_data=['vote_count', 'vote_average'], title='Fig 6: Top 15 movies sorted by Vote Average')
fig_6.update_layout(xaxis_title='Movie', yaxis_title='Vote Average')
fig_6.show()

##  Build charts for particular genres

In [None]:
# One entry per (movie, genre) pair. explode() gives every element of a genre
# list its own row while repeating the movie's index label; movies with an
# empty list yield NaN and are dropped. This produces the same Series as
# building a pd.Series per row and stacking, without the per-row overhead.
s = moviesDF['genres'].explode().dropna()
s.name = 'genre'

# Movie table without the list column, joined to the genres on the index:
# a movie appears once per genre, and a movie with no genre keeps one row
# whose 'genre' is NaN.
gen_md = moviesDF.drop('genres', axis=1).join(s)

### Stat 3: Number of movies per genre (movies with a known release year)

In [None]:
# Rows that carry both a release year and a genre.
genmd1 = gen_md.dropna(subset=['release_year', 'genre'])

# Number of such rows per genre, most frequent genre first.
genmd1.groupby('genre')['count'].count().sort_values(ascending=False)

# genmd1['genre'].value_counts() # Same result as above code

In [None]:
# Fig 7: number of rows per genre. hover_data is given as a list (a set is
# unordered and is not one of the container types plotly documents).
fig_7 = px.histogram(genmd1, x='genre', text_auto=True, hover_data=['count'], title='Fig 7. Distribution of movies around genres')
fig_7.show()

In [None]:
# BUILD_CHART Function 

def build_chart(genre, percentile=0.85, data=None):
    """Return the top movies of one genre ranked by weighted rating.

    The weighted rating is ``v/(v+m) * R + m/(m+v) * C`` where ``v`` is the
    movie's vote count, ``R`` its vote average, ``m`` the vote count at
    ``percentile`` within the genre and ``C`` the genre's mean vote average.

    Parameters
    ----------
    genre : str
        Value to match in the ``genre`` column.
    percentile : float, default 0.85
        Quantile of the genre's vote counts a movie must reach to qualify.
    data : pandas.DataFrame, optional
        Table holding the columns ``title``, ``release_year``,
        ``vote_count``, ``vote_average``, ``popularity`` and ``genre``.
        Defaults to the notebook-level ``gen_md``.

    Returns
    -------
    pandas.DataFrame
        At most 100 qualifying rows with the columns above plus ``wr``,
        sorted by ``wr`` in descending order. The input table is not modified.
    """
    source = gen_md if data is None else data
    df = source[source['genre'] == genre]

    # NOTE(review): vote averages are truncated to whole numbers both for the
    # mean C and for the per-movie rating - confirm that this is intended.
    vote_counts = df['vote_count'].dropna().astype('int')
    vote_averages = df['vote_average'].dropna().astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)

    # A missing vote count compares False against m, so only the average
    # needs an explicit null check.
    keep = (df['vote_count'] >= m) & df['vote_average'].notnull()
    columns = ['title', 'release_year', 'vote_count', 'vote_average', 'popularity', 'genre']
    # .copy() detaches the selection, so adding columns neither warns nor
    # touches the source table.
    qualified = df.loc[keep, columns].copy()
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')

    # Column-wise arithmetic instead of a row-by-row apply; it also works
    # when no row qualifies.
    v = qualified['vote_count']
    R = qualified['vote_average']
    qualified['wr'] = (v / (v + m) * R) + (m / (m + v) * C)

    return qualified.sort_values('wr', ascending=False).head(100)

In [None]:
# First 20 rows of the Romance chart, shown as the cell output.
build_chart(genre='Romance').head(n=20)