<h1 align="center"> Group 7 Analysis & visualization </h1>

<h3 align="center">10/20/2022</h3>

#### Importing python libraries for analysis and visualization

In [None]:
#data analysis libraries 
import numpy as np
import pandas as pd
import re

#visualization libraries
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
%matplotlib inline
#ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the movies, ratings and links CSV tables from ./DataSets
movie, ratings, links = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./DataSets/movies.csv",
        "./DataSets/ratings.csv",
        "./DataSets/links.csv",
    )
)

In [None]:
# Print the (rows, columns) shape of each loaded table
for table in (movie, ratings, links):
    print(table.shape)

In [None]:
# Preview the first rows of each table (movies, then links, then ratings)
for table in (movie, links, ratings):
    print(table.head())

In [None]:
# List the column names of each table, separated by a blank line
for label, table in (("Movie : ", movie), ("Rating : ", ratings), ("Links : ", links)):
    print(label, table.columns, end="\n\n")

In [None]:
# Print the dtype / non-null summary of each table
for table in (movie, ratings, links):
    table.info()

In [None]:
# Drop the timestamp column from ratings.
# Reassigning with errors='ignore' keeps this cell re-runnable: the previous
# in-place drop raised KeyError when the cell was executed a second time.
ratings = ratings.drop(columns='timestamp', errors='ignore')

In [None]:
# One row per (movie, genre) pair: split the pipe-separated genres string into a
# list and explode it.
# .assign() returns a new frame, so `movie` keeps its original string column;
# writing a column into the `movie[:]` slice was a chained assignment whose
# warning is hidden by the global warnings filter.
movies_df_exploded = movie.assign(genres=movie['genres'].str.split('|')).explode('genres')
px.histogram(movies_df_exploded, x='genres', height=400, title='Movie count by genre').update_xaxes(categoryorder="total descending")

In [None]:
# Preview the first rows of the movie table
movie.head()

In [None]:
# Extract the release year from the title.
# The pattern is a raw string (the non-raw '\(' escapes trigger invalid-escape
# warnings) and only accepts a four-digit group in the trailing parentheses, so
# other parenthesised text such as alternative titles is no longer captured.
# Titles without a trailing "(YYYY)" get NaN.
movie['Year'] = movie['title'].str.extract(r'\((\d{4})\)\s*$', expand=False)
px.histogram(movie, x='Year', height=400, title='Movie count by year').update_xaxes(categoryorder="total descending")

In [None]:
# Number of titles per release year, drawn as a line chart
a = movie.groupby('Year')['title'].count()
plt.plot(a)
plt.show()

# Report the highest yearly count and every year that reaches it
peak_count = a.max()
print('Max No.of Movies Relesed =', peak_count)
for peak_year in a[a == peak_count].index:
    print('Year =', peak_year)
a.describe()

In [None]:
# One-hot encode the pipe-separated genres string: one 0/1 column per genre label
genre_dat = movie.genres.str.get_dummies(sep='|')
genre_dat

In [None]:
# Put the genre indicator columns next to the movie table, then remove the two text columns
mov_dat2 = pd.concat([movie, genre_dat], axis='columns').drop(columns=['title', 'genres'])
mov_dat2

In [None]:
# Count how many movies carry each genre tag and plot the counts.
# The genre columns are taken from genre_dat by name: the previous positional
# slice mov_dat2.columns[4:23] depended on the column order of mov_dat2 and
# silently shifted once the 'Year' column was added to `movie`.
# The '(no genres listed)' placeholder, if present, is not a genre and is skipped.
genre_counts = genre_dat.drop(columns='(no genres listed)', errors='ignore').sum()
for genre_name, n_movies in genre_counts.items():
    print("{}    \t\t\t\t{}".format(genre_name, n_movies))

plt.bar(x=genre_counts.index, height=genre_counts.values)
plt.xticks(rotation=70)
plt.show()

In [None]:
# Add a `rating` column to the movie table holding each movie's mean rating.
mean_rating = ratings.groupby('movieId')['rating'].mean()
# Drop any `rating` column left by a previous run so re-running the cell does
# not produce rating_x / rating_y. A left merge keeps movie's rows and their
# order, so index-aligned frames such as genre_dat still line up afterwards.
movie = movie.drop(columns='rating', errors='ignore').merge(mean_rating, how='left', on='movieId')
# Movies without any rating get a numeric 0. The previous fillna('0') inserted
# a string into a float column (mixed object dtype), and the chained in-place
# fillna is not guaranteed to write back to the frame.
movie['rating'] = movie['rating'].fillna(0)

In [None]:
# Count how many users rated each movie (the count ends up in the `userId` column)
rating_counts = (
    ratings.groupby('movieId', as_index=False)['userId']
    .count()
    .sort_values(by='userId', ascending=False)
)
# Attach the counts to the movie table and list the most-rated, then highest-rated, movies first
movies_with_counts = movie.merge(rating_counts, how='outer', on='movieId')
movies_with_counts.sort_values(by=['userId', 'rating'], ascending=False)

From the table above, Forrest Gump (1994) has the highest number of ratings: 329 users rated it, with an average of 4.16 stars.

It is followed by Shawshank Redemption, The (1994), with 317 user ratings and an average of 4.42 stars, and so on.


In [None]:
# Per-user summary: number of movies rated (column `movieId`) and mean rating (column `rating`)
x = ratings.groupby('userId', as_index=False).agg(
    movieId=('movieId', 'count'),
    rating=('rating', 'mean'),
)
x.describe()

Sorting the per-user table by number of ratings (next cell) shows that userId 414 has given the highest number of ratings, 2698, with an average of 3.39 stars, followed by userId 599 with 2478 ratings and an average of 2.64 stars, and so on.

In [None]:
# Rank users by how many movies they rated (the count is stored in the `movieId` column), most active first
x.sort_values('movieId',ascending=False)

In [None]:
# Preview the exploded table (genres column after explode('genres'))
movies_df_exploded.head()

In [None]:
# Mean rating and number of ratings per genre, sorted by mean rating.
# merge() matches rows on the movieId column. The previous
# ratings.join(movies_df_exploded, on='movieId') looked movieId values up in the
# exploded frame's row index and failed on the overlapping movieId column, and
# the plot referenced a 'genre_' column that never existed.
rating_by_genre_df = (
    ratings.merge(movies_df_exploded[['movieId', 'genres']], on='movieId')
    .groupby('genres')['rating']
    .agg(rating_mean='mean', rating_count='count')
    .sort_values('rating_mean')
    .reset_index()
)
px.bar(rating_by_genre_df, x='genres', y='rating_mean', height=300)

In [None]:
# Build the full analysis frame: movie columns + genre indicators + one row per rating.
# The mean-rating column is removed first so the merge with `ratings` yields a
# single `rating` column. Reassigning with errors='ignore' (instead of an
# in-place drop) lets this cell be re-run without a KeyError.
movie = movie.drop(columns='rating', errors='ignore')
movie_df = pd.concat([movie, genre_dat], axis=1).drop(columns='genres')
# Left merge keeps movies that have no ratings (their rating/userId are NaN)
full_df = pd.merge(movie_df, ratings, how="left", on="movieId")
full_df.head()

In [None]:
# Number of ratings per title, ten largest.
# count() on the rating column ignores the NaN placeholder row that the left
# merge leaves for movies without ratings; size() counted that row as a rating.
top_ten_movies = (
    full_df.groupby("title")["rating"]
    .count()
    .sort_values(ascending=False)
    .head(10)
)

# Horizontal bar chart of the counts
fig, ax = plt.subplots(figsize=(12, 5))
ax.barh(y=top_ten_movies.index, width=top_ten_movies.values)
# barh draws the first entry at the bottom; invert so the most-rated movie is on top
ax.invert_yaxis()
ax.set_title("10 Most Rated Movies in the Data", fontsize=16)
ax.set_ylabel("Movie", fontsize=14)  # fixed label typo ("Moive")
ax.set_xlabel("Count", fontsize=14)
plt.show()

In [None]:
# Number of ratings submitted by each user, largest first
movies_rated = ratings.groupby("userId").size().sort_values(ascending=False)
most_rated, fewest_rated = movies_rated.max(), movies_rated.min()
print(f"Max movies rated by one user: {most_rated}\nMin movies rated by one user: {fewest_rated}")

# Box plot of the per-user rating counts
ratings["userId"].value_counts().plot(kind="box", figsize=(20, 40))
plt.title("Number of Movies rated by a Single user", fontsize=16)
plt.show()

The maximum number of movies rated by a single user in the dataset is 2698 (userId 414, as found above) — whoever that is, is a very loyal movie watcher and rater — and the median number of movies rated by a user is 70. There are plenty of outliers that have rated more than 320 movies, which is what I am approximating to be the extreme value from the plot above.

In [None]:
# List the columns of full_df (the merged movie / genre-indicator / ratings frame)
print("Movie : ", full_df.columns,end="\n\n")

In [None]:
# Genre indicator columns of full_df that get their own density curve
genres = [
    'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'IMAX',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

# Adapted from:
# https://github.com/HarilalOP/movielens-data-exploration/blob/master/src/main/code/exploratory_analysis.ipynb
# Mean rating per movie over the whole dataset, shown as a density-normalised histogram
overall_mean = full_df[['movieId', 'rating']].groupby('movieId').mean()
overall_mean.hist(bins=25, grid=False, edgecolor='b', density=True, label='Overall', figsize=(15, 8))

# Overlay one KDE curve of per-movie mean ratings for each genre
for genre in genres:
    genre_mean = full_df.loc[full_df[genre] == True, ['movieId', 'rating']].groupby('movieId').mean()
    genre_mean['rating'].plot(kind='kde', grid=True, alpha=0.9, label=genre)

plt.legend()
plt.xlim(0, 5)
plt.xlabel('Rating')
plt.title('Rating Density plot')
plt.show()

The plot is predominantly left-skewed for most genres. This could possibly be down to users being more willing to rate movies they enjoyed, since people do not really watch a movie if they aren’t enjoying it. We would have to conduct some research on whether this is the case in our instance.

Ok, the last plot was more complicated. We can simplify things again by looking more specifically at the users.

In [None]:
# Distinct genre labels in order of first appearance, stored as a one-column DataFrame for later use
unique_labels = movie['genres'].str.split('|').explode().unique()
genres_unique = pd.DataFrame({'genre': unique_labels})

# Replace the genres string column with one boolean indicator column per genre
genre_flags = movie['genres'].str.get_dummies().astype(bool)
movie = movie.join(genre_flags).drop(columns='genres')

## WEB SCRAPING

In [None]:
# Import required modules

import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

In [None]:
# Build the page URL for every movie from the URL template.
url_temp = 'https://www.themoviedb.org/movie/{movieId}/'
movie_web = pd.DataFrame({'movieId': movie.movieId})
# Add PageURL column.
# Fixed: the list comprehension iterated over `movie_url`, which is never
# defined (NameError on a fresh kernel). Mapping over the column also removes
# the assumption that the index is exactly 0..n-1.
movie_web['PageURL'] = movie_web['movieId'].map(lambda movie_id: url_temp.format(movieId=movie_id))
# NOTE(review): the template is filled with the dataset's movieId; the site
# presumably keys its pages by its own id (possibly available in `links`) —
# confirm the URLs point at the intended movies before scraping.
movie_web.head()

In [None]:
# Download each movie page and store the raw HTML in a new `HTML` column.
# NOTE: this issues one HTTP request per row of movie_web and can take a long time.
def _fetch_html(session, url, timeout=10):
    """Return the response body of `url` as bytes, or b'' if the request fails.

    The timeout keeps a single stalled request from hanging the whole cell;
    failures are reported and skipped instead of aborting the download.
    """
    try:
        return session.get(url, timeout=timeout).content
    except requests.RequestException as exc:
        print(f"Request failed for {url}: {exc}")
        return b''


# A shared Session reuses connections across requests
with requests.Session() as session:
    movie_web['HTML'] = [_fetch_html(session, url) for url in movie_web['PageURL']]

# Show only a few rows: the HTML column is far too large to display in full
movie_web.head()

In [None]:
def _extract_content_score(html):
    """Return the text of the first <p> inside the <div class="content_score"> of `html`.

    Returns None when the HTML is empty or the div / paragraph is missing,
    instead of raising AttributeError on the chained .find() calls.
    """
    if not html:
        return None
    soup = BeautifulSoup(html, 'html.parser')
    # Same match as before: a div whose class list is exactly ['content_score']
    score_div = soup.find(lambda tag: tag.name == 'div' and tag.get('class') == ['content_score'])
    if score_div is None:
        return None
    paragraph = score_div.find('p')
    return paragraph.text if paragraph is not None else None


# Parse the score text out of every downloaded page
movie_web['Content_score'] = [_extract_content_score(html) for html in movie_web['HTML']]