### Import Libraries

In [None]:
# Data handling and static plotting
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Interactive plotting with plotly in offline (notebook) mode
from plotly.offline import init_notebook_mode, iplot
# NOTE(review): connected=True presumably loads plotly.js from a CDN instead of
# embedding it in the notebook — confirm against the plotly offline docs
init_notebook_mode(connected=True)
import plotly.graph_objs as go

# Word cloud rendering of free text
from wordcloud import WordCloud

# Silence every Python warning for the rest of the notebook
# (this also hides deprecation notices raised by the libraries above)
import warnings
warnings.filterwarnings('ignore')

### Read Data

In [None]:
# Load the movie plots CSV from the input directory into a DataFrame
data = pd.read_csv(
    filepath_or_buffer="../input/wiki_movie_plots_deduped.csv",
)

### Statistical Analysis

In [None]:
# Preview the first ten records of the dataset
data.head(n=10)

**Brief description of data:**
The dataset contains descriptions of 34,886 movies from around the world. 

* Release Year: Year in which the movie was released
* Title: Movie title
* Origin/Ethnicity: Origin of movie (i.e. American, Bollywood, Tamil, etc.)
* Director: Director(s)
* Cast: Main actors and actresses
* Genre: Movie Genre(s)
* Wiki Page: URL of the Wikipedia page from which the plot description was scraped
* Plot: Long form description of movie plot

In [None]:
# Shape of data as a (number of rows, number of columns) tuple
data.shape

In [None]:
# Describe the data: summary statistics (count, mean, std, min, quartiles, max);
# with default arguments pandas reports these for the numeric columns only
data.describe()

In [None]:
# Info of data: prints each column's name, non-null count and dtype,
# plus the overall memory usage
data.info()

In [None]:
# Show the column labels of data
data.columns

In [None]:
# Show the data type of each column
data.dtypes

### Visualization

**The frequency of movies by Origin/Ethnicity:**

In [None]:
# Count the movies for each origin/ethnicity, ordered alphabetically by label
origin_counts = data['Origin/Ethnicity'].value_counts().sort_index()

# Draw the counts as a bar chart; pandas creates the figure and returns its Axes
ax = origin_counts.plot(kind='bar', figsize=(10, 5), fontsize=14)

# Annotate through the Axes object rather than the pyplot state machine
ax.set_title("Count of Origin/Ethnicity of Movies", fontsize=16)
ax.set_xlabel('Origin/Ethnicity', fontsize=20)
ax.set_ylabel('Counts', fontsize=20)

# Remove the surrounding spines for a cleaner look
sns.despine(ax=ax, bottom=True, left=True)

American movies far outnumber those of any other origin/ethnicity.

**Count of movies per year**

In [None]:
# Number of movies released in each year, ordered chronologically.
# Stored under a descriptive name instead of `ax`, which the previous cell uses
# for a matplotlib Axes; reusing it for a Series made the name ambiguous.
movies_per_year = data['Release Year'].value_counts().sort_index(ascending=True)

# One line trace: release year on the x axis, movie count on the y axis
Sct = [go.Scatter(x=movies_per_year.index, y=movies_per_year.values, mode='lines', name='lines')]

# Axis titles are set explicitly so the figure can be read on its own
layout = go.Layout(
    title='Movies by year',
    xaxis=dict(title='Release Year'),
    yaxis=dict(title='Number of movies'),
)
fig = go.Figure(data=Sct, layout=layout)
iplot(fig)

With 1021 movies, 2013 has the highest number of releases; there is then a sudden, major drop to only 661 movies in 2015.

**WordCloud**

Word cloud of the movie plots

In [None]:
# Join every plot summary into one corpus. Missing values are dropped and the
# rest coerced to str, because str.join raises TypeError on non-string items.
all_plots_text = " ".join(data.Plot.dropna().astype(str))

# random_state fixes the otherwise random word placement and colouring so that
# re-running the notebook reproduces the same image.
wordcloud = WordCloud(
    width=1000,
    height=600,
    max_font_size=120,
    max_words=50,
    random_state=42,
).generate(all_plots_text)

# Render the cloud as an image with the axes hidden
plt.subplots(figsize=(18,8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

These are some of the most frequent words used across all the movie plots, such as father, mother, tell, return, friend, find, help, and many more.

**Top titles**

In [None]:
# The 15 titles that occur most often in the dataset, most frequent first.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
top_titles = data.Title.value_counts().index[:15]

plt.figure(figsize=(16,8))

# The column is passed as x= by keyword: from seaborn 0.12 on, the first
# positional argument of countplot is `data`, not `x`.
sns.countplot(x=data.Title, order=top_titles, palette=sns.color_palette("plasma", 15))

# Title and labels are set after plotting so that seaborn's automatic axis
# labels cannot replace them.
plt.title('Top titles',fontsize=25)
plt.xlabel('Title', fontsize=30)

plt.xticks(size=16,rotation=90)
plt.yticks(size=16)
sns.despine(bottom=True, left=True)
plt.show()

**Most frequent Genres**

In [None]:
# Removing unknown genres
Gen = data[data.Genre != "unknown"]

# The 15 most frequent genres, most frequent first.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
top_genres = Gen.Genre.value_counts().index[:15]

plt.figure(figsize=(16,8))

# The column is passed as x= by keyword: from seaborn 0.12 on, the first
# positional argument of countplot is `data`, not `x`.
sns.countplot(x=Gen.Genre, order=top_genres, palette=sns.color_palette("copper", 15))

# Title and labels are set after plotting so that seaborn's automatic axis
# labels cannot replace them.
plt.title('Most frequent Genre types',fontsize=30)
plt.xlabel('Genre', fontsize=25)
plt.ylabel('Count', fontsize=25)

plt.xticks(size=16,rotation=90)
plt.yticks(size=16)
sns.despine(bottom=True, left=True)
plt.show()

Drama and Comedy are the two most frequent genres.

In [None]:
# from https://www.kaggle.com/tatianasnwrt/wikipedia-movie-plots-eda

# Getting rid of null values and blank entries. Stripping before comparing
# removes entries made only of whitespace, including non-breaking spaces,
# instead of matching one specific single-character string.
has_cast = data.Cast.notnull() & data.Cast.str.strip().ne("")

# Build the Cast-indexed frame in one chain rather than mutating a filtered
# slice with inplace=True; the 'Three Stooges' label is normalised on the way.
top_cast = (
    data[has_cast]
    .set_index("Cast")
    .rename(index={'Three Stooges':'The Three Stooges'})
)

# The 20 cast entries that appear most often, most frequent first.
# Index.value_counts() replaces the deprecated top-level pd.value_counts().
top_cast_order = top_cast.index.value_counts().index[:20]

plt.figure(figsize=(22,15))
plt.title('Top Cast',fontsize=30)

# The palette size matches the 20 bars drawn (the previous 25-colour palette
# left five colours unused).
sns.countplot(y=top_cast.index,order=top_cast_order,palette=sns.color_palette("rocket", 20))

plt.ylabel('Cast',fontsize=30)
plt.xlabel('Number of movies participated',fontsize=30)
plt.xticks(size=20)
plt.yticks(size=20)
plt.show()

**Top directors**

In [None]:
# Removing Unknown directors to get a clear picture of our data
Dir = data[data.Director != "Unknown"]

# The 20 directors with the most movies, most frequent first.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
top_directors = Dir.Director.value_counts().index[:20]

plt.figure(figsize=(22,8))
plt.title('Top Directors',fontsize=30)

# The column is passed as x= by keyword: from seaborn 0.12 on, the first
# positional argument of countplot is `data`, not `x`.
sns.countplot(x=Dir.Director, order=top_directors, palette=sns.color_palette("seismic", 20))

# Labels are set after plotting so they override seaborn's automatic ones
plt.xlabel('Directors',fontsize=25)
plt.ylabel('Number of movies directed',fontsize=25)
plt.xticks(size=16,rotation=90)
plt.show()

**If you like this kernel, please upvote it, and if you have any suggestions or anything else to say, please leave a comment.**