In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory recursively and print the full path of every file found,
# so the exact dataset file names are visible before loading them.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
# Load the two IMDb tables. low_memory=False makes pandas read each file in one pass
# instead of in chunks, so every column gets a single inferred dtype.
data_rating = pd.read_csv('/kaggle/input/imdb-extensive-dataset/IMDb ratings.csv',low_memory=False) 
movies = pd.read_csv('/kaggle/input/imdb-extensive-dataset/IMDb movies.csv',low_memory=False)

In [None]:
# Left-join the ratings onto the movie metadata using 'imdb_title_id' as the key:
# every row of `movies` is kept, and rating columns are NaN where no match exists.
movies_rate = movies.merge(data_rating, how='left', on='imdb_title_id')

# Only the last expression of a cell is displayed, so a bare `movies_rate.columns`
# in the middle of the cell showed nothing; head() already shows the column names.
movies_rate.head()

In [None]:
# Remove, in place, the rows whose 'year' holds the non-year text 'TV Movie 2019'
# from both the merged frame and the raw movies frame.
bad_year = 'TV Movie 2019'
for frame in (movies_rate, movies):
    rows_to_drop = frame.index[frame['year'] == bad_year]
    frame.drop(rows_to_drop, inplace=True)

### Descriptive Statistics

In [None]:
# Column names, non-null counts and dtypes of the merged frame.
movies_rate.info()

In [None]:
movies_rate.shape  # (rows, columns) of the merged dataframe

In [None]:
movies_rate.describe().T  # summary statistics; .T transposes so each column becomes one row

#### Insights:

1. The description of the data gives the mean and standard deviation for columns such as ratings and votes.
2. The data points are widely spread, as the standard deviations are large relative to the means.
3. From this we can also infer that the data is skewed.

### Checking for missing values and its percentages

In [None]:
# Per-column missing-value summary: absolute count and percentage of rows,
# each sorted from most to least missing.
null_flags = movies_rate.isnull()
missing_per_column = null_flags.sum()
rows_per_column = null_flags.count()

total_missing = missing_per_column.sort_values(ascending=False)
percent = ((missing_per_column / rows_per_column) * 100).sort_values(ascending=False)
missing_data = pd.concat(
    [total_missing, percent],
    axis=1,
    keys=['Missing_Total', 'Percent'],
)
missing_data.head(15)
# Based on the data we take 50% as the threshold for dropping a column:
# imputation is not suitable for columns with a very high share of missing values,
# so those columns can be dropped.

In [None]:
movies_rate.nunique()  # number of distinct (non-null) values in each column

### Analysis on genre

In [None]:
# Movie genres: share of films that carry each individual genre tag.

movie_genres = movies['genre']

# Each 'genre' value is a ', '-separated list of tags. Tally every tag in a dict;
# dicts keep insertion order, so `genres` / `genre_counts` stay in first-seen order.
tag_tally = {}
for genre_list in movies['genre']:
    for genre in genre_list.split(', '):
        tag_tally[genre] = tag_tally.get(genre, 0) + 1
genres = list(tag_tally.keys())
genre_counts = list(tag_tally.values())

# Rank ascending by (count, name) and express each count as a percentage of all films.
ranked_pairs = sorted(zip(genre_counts, genres))
ordered_genres = [name for _, name in ranked_pairs]
sorted_counts = [count / len(movie_genres) * 100 for count in sorted(genre_counts)]

# Horizontal bar chart, smallest share at the bottom.
plt.figure(figsize=(10, 10))
plt.barh(ordered_genres, sorted_counts)
plt.xlabel('Percentage of Films (%)')
plt.show()

#### Insights : 
1. From the bar graph above we can see that Drama, Comedy and Romance are the most common genres.
2. News, Adult and Documentary are the least common genres.

### Analysis on ratings over the years.

In [None]:
# Top-1000-voter rating by release year. seaborn aggregates the many films that
# share a year into a single line with an uncertainty band.
fig = plt.figure(figsize=(18, 7))
sns.lineplot(data=movies_rate, x='year', y='top1000_voters_rating')
plt.xlabel('Years')
# The previously computed `x_ticks` array was never passed to any call, so it is removed.
plt.xticks(rotation=90)  # year labels are dense; rotate them to keep them readable
plt.ylabel('Rating over the years')
plt.show()

#### Insights : 
1. From the line chart above we can see that the ratings fluctuate sharply in the early years (1894 to 1932), then decrease steadily until the 2000s, and show a sudden increase since then.
2. With the rise of social media and the world turning digital, ratings play an important role for the metacritic industry as well as for the viewer.

### Analysis on combined genres.

In [None]:
# Most common genre combinations (the raw, unsplit 'genre' string of each film).
import plotly.express as px  # visualization library used for the interactive charts

# Count each combination and keep the 30 most frequent.
# rename_axis(...).reset_index(name=...) names both output columns explicitly, so the
# result does not depend on how the installed pandas version labels value_counts()
# output (pandas >= 2.0 no longer produces an 'index' column to rename).
genre_count = (
    movies_rate['genre']
    .value_counts()
    .head(30)
    .rename_axis('Genres')
    .reset_index(name='Count')
)
popular_genre = genre_count

# Label each bar with its count: the '.2s' text template is a numeric format,
# so the text column has to be the numeric 'Count', not the genre name.
fig = px.bar(popular_genre, y='Count', x='Genres', text='Count')
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
fig.update_layout(title_text='<b>Common Genres<b>',title_x=0.5)
fig.show()

#### Insights : 
1. Among all the genres, drama, comedy, romance and horror are the most common, as well as combinations of these genres.

### Analysis on countries with most movie releases

In [None]:
# Countries with the most movie releases (top 30 values of the 'country' column).
# rename_axis(...).reset_index(name=...) names both output columns explicitly, so the
# result does not depend on how the installed pandas version labels value_counts()
# output (pandas >= 2.0 no longer produces an 'index' column to rename).
Movie_count = (
    movies['country']
    .value_counts()
    .head(30)
    .rename_axis('Country')
    .reset_index(name='Count')
)

# Label each bar with its count: the '.2s' text template is a numeric format,
# so the text column has to be the numeric 'Count', not the country name.
fig = px.bar(Movie_count, y='Count', x='Country', text='Count')
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
fig.update_layout(title_text='<b>Countries with most movies releases<b>',title_x=0.5)
fig.show()

#### Insights :
1. Here we can see that the country that has released the most movies is the USA.

### Analysis on USA's per year movie release.

In [None]:
# Films whose 'country' value is exactly 'USA', and how many of them fall in each year.
is_usa = movies['country'].eq('USA')
USA = movies[is_usa]
s = USA['year'].value_counts()

In [None]:
# Bar chart of USA releases per year, built from the per-year counts in `s`.
# rename_axis(...).reset_index(name=...) names both output columns explicitly, so the
# result does not depend on how the installed pandas version labels value_counts()
# output (pandas >= 2.0 no longer produces an 'index' column to rename).
USA_Movie_count = s.rename_axis('year').reset_index(name='Count')

# Label each bar with its count: the '.2s' text template is a numeric format,
# so the text column has to be 'Count' rather than the year itself.
fig = px.bar(USA_Movie_count, y='Count', x='year', text='Count')
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
fig.update_layout(title_text='<b>USA per year movies releases<b>',title_x=0.5)
fig.show()

#### Insights :
1. France was the first country to invent and release cinema, in 1891, and cinema arrived in the USA 5 years later (1896).
2. We can see that the number of movies released increases over the years.
3. The major jump in releases is in the years 2015 and 2016.
4. The golden time of the cinema was between 1913 and 1969: before World War 1 the Hollywood industry grew, and then the industry grew in New York, making them the hot spots to hire, produce, shoot and release more films from the USA.

### Analysis per year movie release in INDIA

In [None]:
# Films whose 'country' value is exactly 'India', and how many of them fall in each year.
is_india = movies['country'].eq('India')
IND = movies[is_india]
i = IND['year'].value_counts()
IND

In [None]:
# Bar chart of India releases per year, built from the per-year counts in `i`.
# rename_axis(...).reset_index(name=...) names both output columns explicitly, so the
# result does not depend on how the installed pandas version labels value_counts()
# output (pandas >= 2.0 no longer produces an 'index' column to rename).
IND_Movie_count = i.rename_axis('year').reset_index(name='Count')

# Label each bar with its count: the '.2s' text template is a numeric format,
# so the text column has to be 'Count' rather than the year itself.
fig = px.bar(IND_Movie_count, y='Count', x='year', text='Count')
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
fig.update_layout(title_text='<b>IND per year movies releases<b>',title_x=0.5)
fig.show()

#### Insights: 
1. India is the second largest producer of movies, after the USA.
2. The number of movies produced in a year has significantly increased over the years all over the globe.
3. Between the years 2010 and 2013 we see that more movies were released; this might be due to an increase in industry budgets.

### Analysis on total year release

In [None]:
# Total number of releases per year, across all countries.
import plotly.graph_objects as go

# rename_axis(...).reset_index(name=...) names both output columns explicitly, so the
# result does not depend on how the installed pandas version labels value_counts()
# output (pandas >= 2.0 no longer produces an 'index' column to rename).
# Sorting by 'Year' puts the line in chronological order.
Movie_year = (
    movies['year']
    .value_counts()
    .rename_axis('Year')
    .reset_index(name='Total_Releases')
    .sort_values(by='Year', ascending=True)
)
fig = px.line(Movie_year, y='Total_Releases', x='Year')
fig.update_layout(uniformtext_minsize=10, uniformtext_mode='show')
fig.update_layout(title_text='<b>Yearly Total Releases<b>',title_x=0.5)
fig.show()

1. This trend line shows that there is a steady increase in releases over the years.

### Temporal Analysis

In [None]:
# Convert the date_published column from object to datetime.
# Trailing a/b/c letters (either case) are stripped before parsing; values that
# still cannot be parsed become NaT because of errors='coerce'.
# astype(str) keeps the cell re-runnable: on a second run the column is already
# datetime64, where the `.str` accessor would raise.
movies['date_published'] = pd.to_datetime(
    movies['date_published'].astype(str).str.rstrip('aAbBcC'),
    errors='coerce',
)

In [None]:
# Calendar components of the release date; rows with a missing date get NaN.
movies['Month']=movies['date_published'].dt.month # month number, 1-based: 1 = January ... 12 = December
movies['Weekday']=movies['date_published'].dt.weekday # day of week, 0-based from Monday: 0 = Monday ... 6 = Sunday

In [None]:
# Month names
# 'Month' is produced by `.dt.month`, which is 1-based (1 = January ... 12 = December).
# The lookup therefore starts at 1; treating it as 0-based labels every month as the
# following one and turns December into January.
# Rows without a release date stay missing (NaN) instead of being counted as January,
# and value_counts() ignores them.
# A plain dict lookup also avoids `pd.np`, which was removed in pandas 2.0.
month_names = {
    1: 'January', 2: 'February', 3: 'March', 4: 'April',
    5: 'May', 6: 'June', 7: 'July', 8: 'August',
    9: 'September', 10: 'October', 11: 'November', 12: 'December',
}
movies['Month_Name'] = movies['Month'].map(month_names)

### Analysis on Monthly releases

In [None]:
# Donut chart of the number of releases per month.
colors = ['gold', 'mediumturquoise', 'darkorange', 'lightgreen']

# rename_axis(...).reset_index(name=...) names both output columns explicitly, so the
# result does not depend on how the installed pandas version labels value_counts()
# output (pandas >= 2.0 no longer produces an 'index' column to rename).
x = movies['Month_Name'].value_counts().rename_axis('Month_Name').reset_index(name='Count')

fig = go.Figure(data=[go.Pie(labels=x['Month_Name'],values=x['Count'],hole=0.5)])
fig.update_traces(hoverinfo='label+percent', textinfo='value', textfont_size=20,
                  marker=dict(colors=colors, line=dict(color='#000000', width=2)))
fig.update_layout(title_text='Month-Releases<b>',title_x=0)
fig.show()

It seems that the best months to release movies are in winter. Producers and distributors who are marketing their blockbuster
films for both revenues and awards tend to target the winter months of November and December, which also include
the holiday season. During the season of Thanksgiving, Christmas and New Year’s, the audience seeks epic dramas and
award-worthy films in cinemas and for at-home entertainment.

### Analysis on weekly releases


In [None]:
# Weekday names
# 'Weekday' is produced by `.dt.weekday`, where 0 = Monday and 6 = Sunday.
# The lookup follows that numbering; treating 0 as Sunday labels every day
# as the day before it.
# Rows without a release date stay missing (NaN) instead of becoming a '0' label,
# and value_counts() ignores them.
# A plain dict lookup also avoids `pd.np`, which was removed in pandas 2.0.
day_names = {
    0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday',
    4: 'Friday', 5: 'Saturday', 6: 'Sunday',
}
movies['Day'] = movies['Weekday'].map(day_names)

In [None]:
# Donut chart of the number of releases per day of the week.
colors = ['gold', 'mediumturquoise', 'darkorange', 'lightgreen']

# rename_axis(...).reset_index(name=...) names both output columns explicitly, so the
# result does not depend on how the installed pandas version labels value_counts()
# output (pandas >= 2.0 no longer produces an 'index' column to rename).
x = movies['Day'].value_counts().rename_axis('WeekDay').reset_index(name='Count')

fig = go.Figure(data=[go.Pie(labels=x['WeekDay'],values=x['Count'],hole=0.5)])
fig.update_traces(hoverinfo='label+percent', textinfo='value', textfont_size=20,
                  marker=dict(colors=colors, line=dict(color='#000000', width=2)))
fig.update_layout(title_text='<b>Weekday with most releases<b>',title_x=0)
fig.show()

#### Insights
1. Movies are released on Thursday in hopes of beating the Friday rush. As ticket prices are increasing, most people are not willing to spend more money to watch 2 movies in the theater.
2. This is to boost weekend Box Office numbers; boosting a movies ratings! More money equals more reason for people to watch this movie!
3. From the theater point of view, they do this in order to beat competition; as in other movie theaters. The earlier the movie is released, people will flock to the theater to watch it first. This will guarantee the money for the theater that released the movie first!

### (Duration/Run-Time) of Movies when there were silent Films (Till 1929)

In [None]:
# Smallest value in the 'year' column, i.e. the earliest release year.
# NOTE(review): other cells compare 'year' against string literals, so this is presumably
# a lexicographic minimum over strings rather than a numeric one; confirm the dtype.
movies['year'].min()

In [None]:
# Distribution of movie run-times over the whole dataset.
plt.style.use("fivethirtyeight")  # changes the matplotlib style for all later figures too
plt.figure(figsize=(15,8))
# sns.distplot is deprecated and scheduled for removal; histplot with a KDE overlay on a
# density scale is its documented replacement.
rutime = sns.histplot(movies['duration'], color='darkred', kde=True, stat='density')

#### Insights :
During the silent era:

1. The scope of the study is the silent era and the period after the silent era.

### Analysis on the movies after the 'Silent Era'


In [None]:
# Split the films into "up to and including 1930" and "1931 onwards".
# Comparing numerically (instead of against the strings '1930' / '1931') gives the same
# split for four-digit years and keeps working if 'year' is ever stored as integers;
# values that are not a plain year become NaN and fall into neither group.
release_year = pd.to_numeric(movies['year'], errors='coerce')
bef_1930 = movies.loc[release_year <= 1930]
aft_1930 = movies.loc[release_year >= 1931]

In [None]:
# Run-time distributions of the two eras, overlaid on one axis.
plt.figure(figsize=(15,8))
# sns.distplot is deprecated and scheduled for removal; histplot with a KDE overlay on a
# density scale is its documented replacement. Density (not raw counts) keeps the two
# groups comparable even though they contain very different numbers of films.
rutime_bef1930 = sns.histplot(bef_1930['duration'], color='darkorange',
                              kde=True, stat='density', label='1930 and earlier')
rutime_aft1930 = sns.histplot(aft_1930['duration'], color='firebrick',
                              kde=True, stat='density', label='1931 and later')
plt.legend()  # without a legend the two colours cannot be told apart

#### Insights : 
Finding outliers.
1. Here we can see the actual duration of the time with respect to years before and after 1930, the max duration is 800 which we consider as an outlier.
2. And before 1930 we have 400 as an outlier.

In [None]:
movies['director'].value_counts().head(10)  # the 10 most frequent values of 'director', with their film counts

## Viz on rating vs total releases

In [None]:
# Mean number of votes per film vs. number of releases, by year.
# Relies on `Movie_year` (columns 'Year' and 'Total_Releases') from the yearly-releases cell.
mov_rat_group = movies_rate.groupby("year")["total_votes"].mean()
review_count = pd.DataFrame(mov_rat_group).reset_index()
review_count = review_count.rename(columns={'year': 'Year'})
release_ratings = pd.merge(Movie_year, review_count, how='left', on='Year')

# Draw both series on the same axes. The plotted quantity is the mean of 'total_votes',
# so the title names votes (it previously said "Avg.Rating").
ax = release_ratings.plot("Year", "Total_Releases", kind="line", label='total Release')
release_ratings.plot("Year", "total_votes", ax=ax, kind="line", label='Votes',
                     title='Avg. Votes vs Total Releases', figsize=(14,7))

#### Insights :
1. With the increase in technology we can see that the votes as well as total movie releases have significantly increased over the years.

In [None]:
# Top movie of every year by total votes.
# x1: one row per (year, title, genre, director) with its highest 'total_votes',
# ordered from most to fewest votes.
x1 = (
    movies_rate
    .groupby(["year", "title", "genre", "director"])['total_votes']
    .agg(['max'])
    .sort_values("max", ascending=False)
)
# x1 is already aggregated over exactly these keys and sorted, so grouping by the same
# keys a second time changes nothing. Taking the first row per year (index level 0)
# yields that year's most-voted film.
x2 = x1.groupby(level=0).head(1)
x2

### Duration vs Year

In [None]:
# Movie duration by release year. seaborn aggregates the many films that share a year
# into a single line with an uncertainty band.
fig = plt.figure(figsize=(18, 10))
sns.lineplot(data=movies_rate, x='year', y='duration')

plt.xlabel('Year of the movie')
# The previously computed `x_ticks` array was never passed to any call, so it is removed.
plt.xticks(rotation=90)  # year labels are dense; rotate them to keep them readable
plt.ylabel('Duration of the movie')
plt.show()

### Insights :
1. The average runtime of the movies is 90 - 100 minutes.
2. Since the start of the movie era, the average duration of movies has increased over the years.
3. There is also a study which shows that shots in modern films are shorter than in films from 60 years ago, which suggests that the number of shots contributes to the increase in duration.

In [None]:
# Find pairs of rating columns that are highly correlated with each other.
# Restrict to numeric columns first: on pandas >= 2.0, DataFrame.corr() raises when the
# frame still contains non-numeric columns, and select_dtypes works on every version.
cor_target = data_rating.select_dtypes(include='number').corr().abs()

# One row per (column, column) pair. Keep |r| >= 0.80 and drop r == 1, which removes
# each column's correlation with itself. Each pair appears twice, as (a, b) and (b, a).
Feature_corr = cor_target.unstack().to_frame(name='Correlation')
Feature = (
    Feature_corr[(Feature_corr['Correlation'] >= 0.80) & (Feature_corr['Correlation'] < 1)]
    .sort_values(by='Correlation', ascending=False)
    .reset_index()
)
display(Feature)

In [None]:
# The word cloud is built from 'description' because it is a free-text column.
# Missing descriptions are replaced with an empty string before the cast; otherwise
# astype(str) turns each NaN into the literal word "nan", which the word cloud would count.
movies['description'] = movies['description'].fillna('').astype(str)

In [None]:
# Word cloud of the most frequent words in the 'description' column.
from wordcloud import WordCloud

plt.figure(figsize=(16,8))
# All descriptions are concatenated into one space-separated text.
all_descriptions = ' '.join(movies['description'])
wc = WordCloud(
    background_color="black",
    max_words=200,
    max_font_size=50,
    random_state=42,  # fixed seed so the layout is reproducible
)
wc.generate(all_descriptions)
plt.imshow(wc, interpolation="bilinear")
plt.axis('off')
plt.show()

#### Insights :
1. Based on the movie descriptions, the words that are repeated most often are life, family, friends, father, love, story and find, as we can see from the word cloud above.
2. That means most of the movies that are produced are plotted around these specific themes.