In [None]:
import re
import sys

import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

In [None]:
dframe = pd.read_csv('../input/marvel-vs-dc-imdb-dataset/Marvel_DC_imdb.csv')

In [None]:
dframe.head()

In [None]:
dframe.isna().sum()

In [None]:
dframe.shape

## Plot missing values 

In [None]:
msno.matrix(dframe) # shows that the Metascore and USA_Gross columns are almost entirely missing

In [None]:
dframe = dframe.drop(columns=['USA_Gross', 'Metascore', 'Unnamed: 0'])

In [None]:
dframe.head()

In [None]:
dframe.info()

In [None]:
dframe.describe()

In [None]:
dframe = dframe.dropna(axis=0) #dropping all missing values from rows of dframe

In [None]:
dframe.isna().sum()

In [None]:
dframe.shape # 1247 rows remain; the 443 rows that contained missing values were dropped

In [None]:
dframe.columns

In [None]:
# Frequency of each distinct value in the Rating column.
dframe.Rating.value_counts()

In [None]:
dframe.dtypes

## Word Cloud on Movies Column

In [None]:
stopwords = set(STOPWORDS)

# Build the word-cloud corpus: every movie title is cast to str (guards against
# non-string cells), split on whitespace, lower-cased token by token and
# re-joined with single spaces, with one trailing space after each title.
# A single join avoids repeatedly re-allocating a growing string with `+=`.
comment_words = "".join(
    " ".join(token.lower() for token in str(title).split()) + " "
    for title in dframe['Movie'].values
)

# random_state pins the layout so the image is reproducible between runs;
# collocations=False counts single words only (no bigrams).
wordcloud = WordCloud(width=3000, height=2000, random_state=1,
                      background_color='black', colormap='Set2',
                      collocations=False, stopwords=stopwords).generate(comment_words)

# Plot the WordCloud image
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.tight_layout(pad=0)

plt.show()

In [None]:
#dframe.plot(x = 'Year', y = 'IMDB_Score', ylim = ([0,10]), linewidth=2.5, figsize = (30,10))

# Pre-Processing

In [None]:
def convert_runtime(val):
    """
    Parse a runtime string into a whole number of minutes.

    Every occurrence of the substring 'min' is removed and the remainder is
    handed to int(), which tolerates surrounding whitespace
    (e.g. '142 min' -> 142).

    Raises ValueError if what is left is not an integer literal.
    """
    return int(val.replace('min', ''))

In [None]:
converted_RunTime = dframe['RunTime'].apply(convert_runtime)

In [None]:
def convert_Year(val):
    """
    Extract the release year from a raw 'Year' cell and return it as an int.

    The raw cells are presumably shaped like '(2017)', '(2017 Video)',
    '(III) (2016 TV Movie)' or a range such as '(2014–2015)' -- inferred from
    the decorations this cleaner has to cope with; verify against the CSV.

    The first run of four digits is taken as the year.  Consequently a range
    yields its starting year (2014) rather than both years glued together
    (20142015), and roman-numeral disambiguators of any length
    ('(I)', '(II)', '(III)', ...) do not break the conversion.

    Raises:
        ValueError: if the value contains no four-digit year.
    """
    # str() lets already-numeric years (e.g. 2017) pass through unchanged.
    match = re.search(r'\d{4}', str(val))
    if match is None:
        raise ValueError("no four-digit year found in {!r}".format(val))
    return int(match.group())

In [None]:
converted_Year = dframe['Year'].apply(convert_Year)

In [None]:
# Number of titles per converted release year.
converted_Year.value_counts()

In [None]:
dframe.columns

In [None]:
def convert_score(val):
    """
    Cast an IMDB score to an int.

    The value goes straight through int(): a float score loses its fractional
    part (7.8 -> 7), and a string is accepted only if it is an integer literal
    ('9' works, '7.8' raises ValueError).

    NOTE(review): truncating the score looks unintentional for a rating that
    is later averaged per category -- confirm whether float(val) was meant.
    """
    score = int(val)
    return score

In [None]:
def convert_votes(val):
    """
    Parse a vote count written with thousands separators into an int.

    All commas are stripped before the cast, e.g. '1,234,567' -> 1234567.

    Raises ValueError if the remainder is not an integer literal.
    """
    return int(val.replace(',', ''))

In [None]:
converted_Votes = dframe['Votes'].apply(convert_votes)

In [None]:
converted_IMDB_Score = dframe['IMDB_Score'].apply(convert_score)

In [None]:
# Add the cleaned columns at fixed positions. The order of the calls matters:
# each insert shifts the positions seen by the calls after it.
# allow_duplicates=True means a second run of this cell would add the columns
# again instead of raising.
dframe.insert(8, "C_IMDB_Score", converted_IMDB_Score, allow_duplicates=True)
dframe.insert(1, "C_Year", converted_Year, allow_duplicates=True)
dframe.insert(3, "C_RunTime", converted_RunTime, allow_duplicates=True)
dframe.insert(9, "C_Votes", converted_Votes, allow_duplicates=True)

In [None]:
dframe = dframe.drop(['IMDB_Score', 'Year', 'RunTime', 'Votes'], axis = 1)

In [None]:
dframe.head()

In [None]:
sns.catplot(x="Category", y="C_IMDB_Score", kind = 'bar', hue='Rating', palette="vlag", data=dframe)

In [None]:
# Boxen plot of the IMDB score for every Rating value, split by Category.
sns.set_style('ticks')
fig, ax = plt.subplots(figsize=(10, 7))
ax.set_ylim(0, 10)  # fix the y axis to the 0-10 range before drawing
ax.set_title("Distribution of Rating by IMDB Score, Separated by Category")
sns.boxenplot(data=dframe, x='Rating', y='C_IMDB_Score', hue='Category',
              palette='hls', linewidth=2.5, ax=ax)
ax.legend(loc='lower right', fancybox=True)
sns.despine()

In [None]:
dframe.head()

In [None]:
group_avg_rating = dframe['C_IMDB_Score'].groupby(dframe['Category'])

In [None]:
group_avg_rating.mean() #average rating for DC is more than Marvel

In [None]:
dframe.groupby(['Category']).size() # the dataset is imbalanced: only 257 Marvel movies remain in the cleaned dataset

In [None]:
group_avg_runtime = dframe['C_RunTime'].groupby(dframe['Category'])

In [None]:
group_avg_runtime.mean()

In [None]:
group_avg_Votes = dframe['C_Votes'].groupby(dframe['Category'])

In [None]:
group_avg_Votes.mean()

In [None]:
dframe_rating = dframe.groupby('C_IMDB_Score')

In [None]:
dframe_rating.describe()

In [None]:
def ranker(df):
    """
    Number the rows of `df` 1..len(df) in their current order.

    The numbers are written into a 'movie_imdb_score_rank' column on the frame
    that was passed in (it is modified in place), and that same frame is
    returned.
    """
    row_count = len(df)
    df['movie_imdb_score_rank'] = np.arange(1, row_count + 1)
    return df

In [None]:
dframe.sort_values('C_IMDB_Score', ascending=False, inplace=True)

In [None]:
# Rank the rows inside each Category in their current order (the frame was
# sorted by C_IMDB_Score, descending, in the previous cell).
# cumcount() numbers the rows of every group from 0, so +1 gives ranks 1..n.
# Unlike groupby(...).apply(...), this keeps the row index and every column
# unchanged regardless of the installed pandas version.
dframe['movie_imdb_score_rank'] = dframe.groupby('Category').cumcount() + 1

In [None]:
dframe.head()

In [None]:
num_score = dframe['C_IMDB_Score'].value_counts()

In [None]:
dframe[dframe.C_IMDB_Score ==9].head(len(num_score))

In [None]:
dframe.info()

## Applying weighted Average 

In [None]:
# Inputs of the IMDb-style weighted rating  WR = (R*v + C*m) / (v + m)
v = dframe['C_Votes']                  # number of votes for the movie
R = dframe['C_IMDB_Score']             # the movie's own IMDB score
C = dframe['C_IMDB_Score'].mean()      # mean score across the whole dataset
m = dframe['C_Votes'].quantile(0.7)    # vote-count weight given to the dataset mean (70th percentile of votes)

In [None]:
dframe['Weighted_average'] = ((R*v)+(C*m))/(v+m)

In [None]:
dframe.head()

In [None]:
movie_rankings = dframe.sort_values('Weighted_average',ascending=False)
movie_rankings[['Movie','C_Votes','C_IMDB_Score', 'movie_imdb_score_rank','Weighted_average']]

In [None]:
# Compare the 10 movies with the highest weighted average

# Sort first, then take the head: plotting straight from `dframe` would show
# whatever 10 rows happen to come first, not the best-ranked ones.
weighted_average = dframe.sort_values('Weighted_average',ascending=False)
plt.figure(figsize=(7,5))
axis1 = sns.barplot(x = weighted_average['Weighted_average'].head(10),
                    y = weighted_average['Movie'].head(10))

plt.title('Best Movies according to Average Vote given by Users',weight='bold')
plt.xlabel('Weighted Average Score',weight='bold')
plt.ylabel('Movie Title',weight='bold')