# Verzeo - Minor Project (ML-JULY-B1)

### Done by : Sanjay Marreddi  
### Email Id  : sanjay.marreddi.19041@iitgoa.ac.in
    

#### First let us import the required Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

#### Let us read the given Dataset

In [None]:
# Load the movies dataset from the CSV file (relative path) into a DataFrame.
movies = pd.read_csv("tmdb-movies.csv")

## Cleaning the Data and Performing EDA

In [None]:
# Preview the first rows of the DataFrame (head() shows five rows by default).
movies.head()

In [None]:
# (number of rows, number of columns) of the loaded DataFrame.
movies.shape

In [None]:
movies.columns.tolist() # Display the column names as a list (the result is not assigned to a variable)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the DataFrame's columns.
movies.describe()

In [None]:
# Print each column's dtype and non-null count.
movies.info()

In [None]:
# Number of missing values in each column of the DataFrame.
movies.isna().sum()

#### Note: From the results, it can be observed that every column containing NaN values has dtype `object`.

## Data Visualisation with seaborn

#### Analysing `popularity` feature of the given Movies Dataset

In [None]:
# Distribution of `popularity`: 50-bin histogram with a Gaussian kernel density estimate overlaid.
# NOTE(review): `distplot` is deprecated in newer seaborn releases; confirm the installed
# version still provides it before re-running, or migrate to `histplot`/`displot`.
sns.distplot(movies['popularity'],kde=True,bins=50,color="Orange",hist_kws=dict(edgecolor="b", linewidth=1.5))

#### Finding the width of each histogram bar from the number of bins

In [None]:
# Report the extremes of `popularity`, the span between them, and the width
# of one histogram bar when that span is divided into 50 equal bins.
pop_min = movies.popularity.min()
pop_max = movies.popularity.max()
pop_span = pop_max - pop_min

print("Minimum popularity in the data is:",pop_min)
print("Maximum popularity in the data is:",pop_max)
print("Range of popularity is from {} to {}, value is {}".format(pop_min,pop_max,pop_span))
print("I used a bin size of 50, so each bar corresponds to the value of:",pop_span/50)

In [None]:
# Histogram only (no density curve) of `popularity`, this time with 20 bins instead of 50.
# NOTE(review): `distplot` is deprecated in newer seaborn releases; confirm availability.
sns.distplot(movies['popularity'],kde=False,hist=True,bins=20,color= "Orange",hist_kws=dict(edgecolor="b", linewidth=1.0))

In [None]:
# Box plot of `popularity`, followed by the quartile statistics behind it.
# The column is passed by keyword (x=...) so that it is always interpreted as the
# x variable; newer seaborn releases treat the first positional argument as `data`.
sns.boxplot(x=movies['popularity'],color ="Orange")

# Quartiles and interquartile range (IQR)
Q1 = movies.popularity.quantile(.25)
Q3 = movies.popularity.quantile(.75)
IQR = Q3 - Q1

# Median, computed once and reused in the report below
Median = movies.popularity.median()

print("Q1 Value:",Q1)
print("Median Value:",Median)
print("Q3 Value:",Q3)
# 1.5 * IQR beyond the quartiles: values outside these limits are drawn as outliers
print("Upper whisker limit:",(Q3 + 1.5*IQR))
print("Lower whisker limit:",(Q1 - 1.5*IQR))

In [None]:
# Violin plot of `popularity` along the x axis, used to judge visually how
# symmetric (or skewed) the distribution of the feature is.
sns.violinplot(x='popularity',data=movies,color ="Orange")

#### Analysing `budget` feature of the given Movies Dataset

In [None]:
# Distribution of `budget`: 20-bin histogram with a Gaussian kernel density estimate overlaid.
# NOTE(review): `distplot` is deprecated in newer seaborn releases; confirm the installed
# version still provides it before re-running, or migrate to `histplot`/`displot`.
sns.distplot(movies['budget'],kde=True,bins=20,color="green")

#### Finding the width of each histogram bar from the number of bins

In [None]:
# Report the extremes of `budget`, the span between them, and the width
# of one histogram bar when that span is divided into 20 equal bins.
budget_min = movies.budget.min()
budget_max = movies.budget.max()
budget_span = budget_max - budget_min

print("Minimum budget in the data is:",budget_min)
print("Maximum budget in the data is:",budget_max)
print("Range of budget is from {} to {}, value is {}".format(budget_min,budget_max,budget_span))
print("I used a bin size of 20, so each bar corresponds to the value of:",budget_span/20)

In [None]:
# Histogram only (no density curve) of `budget`, this time with 10 bins instead of 20.
# NOTE(review): `distplot` is deprecated in newer seaborn releases; confirm availability.
sns.distplot(movies['budget'],kde=False,hist=True,bins=10,color= "Green",hist_kws=dict(edgecolor="g", linewidth=1.0))

In [None]:
# Box plot of `budget`, followed by the quartile statistics behind it.
# The column is passed by keyword (x=...) so that it is always interpreted as the
# x variable; newer seaborn releases treat the first positional argument as `data`.
sns.boxplot(x=movies['budget'],color ="Green")

# Quartiles and interquartile range (IQR)
Q1 = movies.budget.quantile(.25)
Q3 = movies.budget.quantile(.75)
IQR = Q3 - Q1

# Median, computed once and reused in the report below
Median = movies.budget.median()

print("Q1 Value:",Q1)
print("Median Value:",Median)
print("Q3 Value:",Q3)
# 1.5 * IQR beyond the quartiles: values outside these limits are drawn as outliers
print("Upper whisker limit:",(Q3 + 1.5*IQR))
print("Lower whisker limit:",(Q1 - 1.5*IQR))

In [None]:
# Violin plot of `budget` along the y axis (the popularity violin above uses x),
# used to judge visually how symmetric (or skewed) the distribution is.
sns.violinplot(y='budget',data=movies,color="Green")

#### Combined Plots of `budget` and `popularity` Features of the given Movies Dataset

In [None]:
# Scatter of budget vs. popularity with the marginal distribution of each on the axes.
# `height` is the current name of the figure-size argument; the older `size`
# spelling was deprecated and is no longer accepted by recent seaborn releases.
sns.jointplot(x="budget", y="popularity", data=movies, height=10)

In [None]:
# Second-order (quadratic) polynomial regression of popularity on budget.
# x and y are passed by keyword: this works on every seaborn version, whereas
# positional x/y is rejected by releases that made them keyword-only.
sns.lmplot(x='budget', y='popularity', data=movies, order=2)

### Heatmaps
Using **Correlation** to measure how strong a relationship is between two variables. 

In [None]:
# Pairwise correlation of the numeric (and boolean) columns.
# The text columns are excluded explicitly: newer pandas no longer drops them
# silently inside corr() and raises when it meets a non-numeric column.
movies.select_dtypes(include=["number", "bool"]).corr()

In [None]:
# Heatmap of the correlation matrix.
# Only numeric/boolean columns are correlated; newer pandas raises if corr() is
# called on a frame that still contains text columns.
sns.heatmap(movies.select_dtypes(include=["number", "bool"]).corr())

In [None]:
# Grid of pairwise plots across the DataFrame's columns.
# NOTE(review): likely slow with this many rows and columns; confirm it is still wanted.
sns.pairplot(movies)

#### Preprocessing the Outliers in `budget` feature of the given DataSet

  #### Using IQR

In [None]:
# Median budget before any outlier handling, for comparison with the values computed below.
movies.budget.median()

In [None]:
# Box plot of `budget` before the outliers are replaced.
sns.boxplot(x = movies['budget'])

In [None]:
# Quartiles, IQR and the 1.5 * IQR outlier fences of `budget`.
# Q1, Q3 and IQR stay bound at notebook level for the imputation cells below.
Q1, Q3 = movies.budget.quantile(0.25), movies.budget.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - (1.5 * IQR)
upper_fence = Q3 + (1.5 * IQR)

# Printed in the order: Q1, Q3, IQR, lower fence, upper fence
for value in (Q1, Q3, IQR, lower_fence, upper_fence):
    print(value)

#### Imputation

In [None]:
# Median budget of the rows that lie inside the closed interval
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR], i.e. with the outliers left out.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
movies.budget[movies.budget.between(lower_fence, upper_fence)].median()

In [None]:
# Median budget of every row that is not above the upper fence.
# `<=` keeps a budget exactly equal to the fence in the sample: such a value is
# not an outlier (only values strictly above the fence are), so it must count.
movies.loc[movies['budget'] <= (Q3 + (1.5 * IQR)), 'budget'].median()

In [None]:
# Replace every budget above the upper fence with the median of the remaining budgets.
upper_fence = Q3 + (1.5 * IQR)

# `<=` makes the median sample the exact complement of the rows replaced below
# (which are selected with `>`), so a value equal to the fence is not lost.
median = movies.loc[movies['budget'] <= upper_fence, 'budget'].median()
movies.loc[movies.budget > upper_fence, 'budget'] = median

In [None]:
# Box plot of `budget` after the values above the upper fence were replaced by the median.
sns.boxplot(x = movies['budget'])

#### Removing the Less Important Features


In [None]:
# Reload the dataset from disk. This rebinds `movies`, so the in-memory budget
# replacement performed in the outlier section is not carried into the questions below.
movies = pd.read_csv("tmdb-movies.csv")
# Independent copy of the freshly loaded frame, taken before `movies` is
# reduced to fewer columns and rows.
movies_copy= movies.copy()

In [None]:
# Keep only the columns that the questions below rely on.
kept_columns = [
    'release_year', 'cast', 'budget', 'revenue',
    'genres', 'runtime', 'original_title', 'production_companies',
]
movies = movies[kept_columns]

In [None]:
# Drop the rows whose budget or revenue is recorded as 0.
has_budget = movies['budget'] != 0
has_revenue = movies['revenue'] != 0
movies = movies[has_budget & has_revenue]

#### Removing the rows with NaN Values

In [None]:
# Discard every row that has a missing value in any of the retained columns,
# then display the resulting (rows, columns) shape.
movies = movies.dropna(axis=0, how="any")
movies.shape

## Solving the Given Questions

### 1) Which are the movies with the third lowest and third highest budget?

In [None]:

# Rank by DISTINCT budget values: picking the third row of the sorted frame
# would return the lowest budget again whenever several movies share it.
# The budgets are sorted once and reused for both ends of the ranking.
distinct_budgets = movies['budget'].drop_duplicates().sort_values()
third_lowest = distinct_budgets.iloc[2]
third_highest = distinct_budgets.iloc[-3]


def _titles_with_budget(amount):
    """Return the titles of all movies whose budget equals *amount*, comma-separated."""
    return ", ".join(movies.loc[movies['budget'] == amount, 'original_title'])


print("The third lowest budget is ",third_lowest)
print("The movie with the third lowest budget is :- ", _titles_with_budget(third_lowest),".")
print("-"*120,"\n")
print("The third highest budget is ",third_highest)
print("The movie with the third highest budget is :- ",_titles_with_budget(third_highest),".")



### 2) What is the average number of words in movie titles between the year 2000-2005?

In [None]:
# Movies released from 2000 to 2005, both years included.
movies_req = movies[movies['release_year'].between(2000, 2005)]

# Total number of words over all selected titles.
# split() without an argument splits on any run of whitespace, so doubled,
# leading or trailing spaces do not create empty "words" (split(' ') would).
words = int(movies_req['original_title'].str.split().str.len().sum())

# Average words per title, rounded to the nearest whole number.
# An empty selection yields 0 instead of a division-by-zero error.
avg = round(words / movies_req.shape[0]) if movies_req.shape[0] else 0

In [None]:
# Report the rounded average computed in the previous cell.
print("The average number of words in movie titles between the year 2000-2005 are ", avg,".")


### 3) What is the most common Genre for Vin Diesel & Emma Watson movies?

In [None]:
def _combined_genre_counts(frame, actor):
    """Count the '|'-joined genre strings of the movies in *frame* that list *actor*.

    A movie matches when *actor* occurs as a literal, case-sensitive substring
    of its ``cast`` value. Rows whose ``cast`` or ``genres`` value is missing
    are skipped. Keys keep the order in which they are first met while walking
    the rows, so later tie-breaking is unaffected by this rewrite.
    """
    lists_actor = frame['cast'].str.contains(actor, regex=False, na=False)
    counts = {}
    for combo in frame.loc[lists_actor, 'genres'].dropna():
        counts[combo] = counts.get(combo, 0) + 1
    return counts


def _single_genre_counts(combined):
    """Split every '|'-joined key of *combined* and total the counts per single genre."""
    totals = {}
    for combo, n_movies in combined.items():
        for genre in combo.split("|"):  # "|" is the delimiter between genres
            totals[genre] = totals.get(genre, 0) + n_movies
    return totals


# Counts of the combined genre strings for the movies featuring each actor.
# The unfiltered copy of the data is used so that no movie is lost to the
# budget/revenue/NaN filtering applied to `movies`.
vd = _combined_genre_counts(movies_copy, "Vin Diesel")
em = _combined_genre_counts(movies_copy, "Emma Watson")

# Counts per individual genre for "Vin Diesel" movies
V = _single_genre_counts(vd)

# Counts per individual genre for "Emma Watson" movies
E = _single_genre_counts(em)

In [None]:
def _top_genres(counts):
    """Return every genre that reaches the highest count in *counts*.

    Ties are all returned, in the dictionary's own order; an empty
    dictionary gives an empty list.
    """
    best = max(counts.values(), default=None)
    return [genre for genre, n_movies in counts.items() if n_movies == best]


# All genres tied for the highest count are computed from the data rather than
# appending a hand-written second genre to the printed answer.
V_top = _top_genres(V)
E_top = _top_genres(E)

# First genre reaching the maximum (None when the actor has no movies)
Vmax = V_top[0] if V_top else None
Emax = E_top[0] if E_top else None

print(V)
print("The most common Genre for Vin Diesel :"," and ".join(V_top))
print(E)
print("The most common Genre for Emma Watson :"," and ".join(E_top))

### 4) Which are the movies with most and least earned revenue?

In [None]:
# Sort by revenue once and reuse the result instead of re-sorting the whole
# frame for every lookup.
by_revenue = movies.sort_values(['revenue'])

least = by_revenue.iloc[0]["revenue"]
print("The least earned revenue value is",least)

print("The movies with the least earned revenue are :-")
# Every movie tied at the minimum is listed; the scan is not limited to an
# arbitrary number of leading rows.
for title in by_revenue.loc[by_revenue['revenue'] == least, 'original_title']:
    print(title)

print("-"*120,"\n")

most = by_revenue.iloc[-1]["revenue"]
print("The most earned revenue value is", most)

print("The movie with the most earned revenue is : ", by_revenue.iloc[-1]['original_title'])

###  5) What is the average runtime of movies in the year 2006?

In [None]:
# Runtimes of the movies released in 2006, as a plain Python list.
released_2006 = movies["release_year"] == 2006
run = movies[released_2006]["runtime"].tolist()

In [None]:
# Arithmetic mean of the 2006 runtimes; this rebinds `avg`, which earlier held
# the average title length.
avg= np.mean(run)

In [None]:
# Report the mean runtime computed in the previous cell (not rounded).
print("The average runtime of movies in the year 2006 is :",avg)

### 6) Name any 3 production companies which have invested money in worse revenue movies?

In [None]:
# Rows whose revenue equals `least`, the minimum revenue found for question 4.
is_worst = movies["revenue"] == least
worse_revenue = movies[is_worst]

In [None]:
# One 'production_companies' string per lowest-revenue movie; each string is
# split on "|" in the next cell.
companies = worse_revenue['production_companies'].tolist()

In [None]:
# Tally how many of the lowest-revenue movies each production company appears in.
# The split company list of every movie is also printed as it is processed.
d = {}

for entry in companies:
    names = entry.split("|")
    print(names)
    for name in names:
        d[name] = d.get(name, 0) + 1

In [None]:
# Show three of the companies: the first three keys in the order they were
# inserted into `d` (not ranked by their counts).
print(list(d)[:3])

### The End - Sanjay Marreddi
