# Importing the required libraries and packages 

In [None]:
import numpy as np
import pandas as pd

#!pip install pandas-profiling

import datetime

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import SelectFromModel
from sklearn_pandas import DataFrameMapper
from IPython.display import Image

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

# Reading the data

In [None]:
tmdb = pd.read_csv("../input/tmdb-box-office-prediction/train.csv")

In [None]:
tmdb_test = pd.read_csv("../input/tmdb-box-office-prediction/test.csv")


In [None]:
tmdb.head()

In [None]:
tmdb.columns

In [None]:
tmdb.shape

In [None]:
tmdb_test.head()

In [None]:
tmdb_test.columns

In [None]:
tmdb_test.shape

In [None]:
tmdb.describe(include='all')

### Checking Datatypes & Typecasting obvious ones

In [None]:
tmdb.dtypes

In [None]:
tmdb_test.dtypes

In [None]:
import pandas_profiling

pandas_profiling.ProfileReport(tmdb)

In [None]:
pandas_profiling.ProfileReport(tmdb_test)

Observations

id is Categorical but is interpreted as int64

homepage, imdb_id, original_language, original_title, overview, poster_path, release_date, status, tagline and title are categorical,
but are interpreted as object


 

'belongs_to_collection', 'genres', 'production_companies', 'production_countries', 'spoken_languages', 'Keywords', 'cast' and 'crew' contain dictionaries in each row (holding int, float, string values, etc.).
So we will deal with these columns later

Data type conversion

Using astype('category') to  homepage, imdb_id, original_language, original_title, overview, poster_path,  release_date,  status, tagline, title, attributes to categorical attributes.

In [None]:
for col in ['id', 'homepage', 'imdb_id', 'original_language', 'original_title', 'overview','poster_path','release_date','status','tagline','title']:
    tmdb[col] = tmdb[col].astype('category')

In [None]:
for col in ['id','homepage', 'imdb_id', 'original_language', 'original_title', 'overview','poster_path','release_date','status','tagline','title']:
    tmdb_test[col] = tmdb_test[col].astype('category')

### Display data type of each variable

In [None]:
tmdb.dtypes

In [None]:
tmdb_test.dtypes

### Checking missing values

In [None]:
tmdb.isnull().sum()


In [None]:
tmdb_test.isnull().sum()

In [None]:
# Number of missing values per column, most incomplete columns first.
missing = tmdb.isna().sum().sort_values(ascending=False)
top_missing = missing[:8]
# Pass x/y by keyword: seaborn >= 0.12 no longer accepts the data vectors positionally.
sns.barplot(x=top_missing.values, y=top_missing.index)
plt.xlabel('Number of missing values')
plt.show()

#### Observation: belongs_to_collection attribute has highest number of missing data, we can deal with it later

### checking unique values

In [None]:

### Checking Unique Values for all attribute
tmdb.nunique()

In [None]:
### Checking Unique Values for all attribute
tmdb_test.nunique()

In [None]:
## Checking Unique values = 1 for all columns in given data
tmdb.columns[tmdb.nunique() <= 1]

In [None]:


# Columns whose number of distinct values equals the number of rows (one value per row).
# Compare against len(tmdb) rather than a hard-coded row count so the check survives a data refresh.
tmdb.columns[tmdb.nunique() == len(tmdb)]

In [None]:
## Checking Unique values = 1 for all columns in test data
tmdb_test.columns[tmdb_test.nunique() <= 1]

In [None]:
# Columns of the test data whose number of distinct values equals the number of rows.
# Compare against len(tmdb_test) rather than a hard-coded row count.
tmdb_test.columns[tmdb_test.nunique() == len(tmdb_test)]

#### Observation

Clearly id and imdb_id are unique for each row: they are pure identifiers and carry no information that generalizes to the target. Hence they can be deleted.

Similarly for the test data: id and imdb_id are unique for each row and carry no predictive information. Hence they can be deleted.

### Dropping Unwanted Attributes :

Removing Zero Variance Attributes based on the above observation 


In [None]:
tmdb = tmdb.drop(['id','imdb_id'],axis = 1)

In [None]:
tmdb_test = tmdb_test.drop(['id','imdb_id'],axis=1)

Removing 'poster_path','overview','homepage','tagline','original_title','title','original_language','status' as they do not contribute to the target

In [None]:
tmdb = tmdb.drop(['poster_path','overview','homepage','tagline','original_title','title','original_language','status'],axis = 1)

In [None]:
tmdb_test = tmdb_test.drop(['poster_path','overview','homepage','tagline','original_title','title','original_language', 'status'],axis = 1)

# Handling Numerical Attributes

In [None]:
tmdb.select_dtypes(include=[np.number]).columns

In [None]:
tmdb.select_dtypes(include=[np.number]).head()


In [None]:
tmdb.describe(include=[np.number]).head()

Revenue (Target Attribute)

In [None]:
tmdb['revenue'].value_counts()

Budget

In [None]:
tmdb['budget'].value_counts()

#### Observation : 

You can see a wide range of movies are having a budget of 0 $ which do not make sense, as any film would cost certain minimum budget


### Histogram Plot

Distribution of Revenue

In [None]:
# Create the 1x2 grid once; calling plt.subplots() for a single axes and then
# plt.subplot(1, 2, k) would stack additional axes on top of the first one.
fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(16, 6))
ax_raw.hist(tmdb['revenue'])
ax_raw.set_title('Distribution of revenue')
# log1p keeps zero revenues finite while compressing the long right tail.
ax_log.hist(np.log1p(tmdb['revenue']))
ax_log.set_title('Distribution of log of revenue');


Because revenue variable is right skewed, we calculated the log values

Distribution of Budget

In [None]:
# Create the 1x2 grid once; calling plt.subplots() for a single axes and then
# plt.subplot(1, 2, k) would stack additional axes on top of the first one.
fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(16, 6))
ax_raw.hist(tmdb['budget'])
ax_raw.set_title('Distribution of budget')
# log1p keeps zero budgets finite while compressing the long right tail.
ax_log.hist(np.log1p(tmdb['budget']))
ax_log.set_title('Distribution of log of budget');


Because budget variable is right skewed, we calculated the log value 

#### Scatter Plot

In [None]:
import plotly.graph_objects as go
fig = go.Figure(data=go.Scatter(
                x=tmdb['budget'],
                y=tmdb['revenue'],
                mode='markers',
                marker=dict(
                     color='rgb(255, 178, 102)',
                     size=10,
                     line=dict(
                        color='DarkSlateGrey',
                        width=1
                      )
               )
))
fig.update_layout(
    title='Revenue by Budget',
    xaxis_title='budget ($)',
    yaxis_title='revenue ($)'
)
fig.show()

In [None]:
import plotly.graph_objects as go
fig = go.Figure(data=go.Scatter(
                x=tmdb['runtime'],
                y=tmdb['revenue'],
                mode='markers',
                marker=dict(
                     color='rgb(48, 105, 152)',
                     size=10,
                     line=dict(
                        color='DarkSlateGrey',
                        width=1
                      )
               )
))
fig.update_layout(
    title='Revenue by Runtime',
    xaxis_title='runtime (minutes)',
    yaxis_title='revenue ($)'
)
fig.show()

In [None]:
import plotly.graph_objects as go
fig = go.Figure(data=go.Scatter(
                x=tmdb['popularity'],
                y=tmdb['revenue'],
                mode='markers',
                marker=dict(
                     color='rgb(108, 198, 68)',
                     size=10,
                     line=dict(
                        color='DarkSlateGrey',
                        width=1
                      )
               )
))
fig.update_layout(
    title='Revenue by Popularity',
    xaxis_title='popularity',
    yaxis_title='revenue ($)'
)
fig.show()

#### Correlation between Numeric Attributes

In [None]:
def plot_corr(tmdb,filename):
    """Draw an annotated heatmap of the pairwise correlations of a DataFrame.

    Parameters
    ----------
    tmdb : DataFrame whose ``.corr()`` matrix is plotted (shadows the notebook-level
        ``tmdb`` inside this function).
    filename : accepted for the caller's convenience but currently unused; the
        figure is only shown, never written to disk.
    """
    corr_matrix = tmdb.corr()
    plt.subplots(figsize=(12, 9))
    sns.heatmap(
        corr_matrix,
        annot=True,
        linewidths=.5,
        annot_kws={"fontsize":15},
    )
    # Same tick styling on both axes.
    for set_ticks in (plt.yticks, plt.xticks):
        set_ticks(rotation=0, fontsize=15)
    plt.show()

plot_corr(tmdb[["revenue","budget","popularity","runtime"]],filename="corr.png")

#### Observation:
We can see that budget is highly correlated with revenue, popularity is fairly correlated with revenue, and runtime is poorly correlated with revenue.

Logarithm transformation makes budget distribution more managable.

#### Transformation of Attribute required 

In [None]:
tmdb['log_budget'] = np.log1p(tmdb['budget'])
tmdb_test['log_budget'] = np.log1p(tmdb_test['budget'])

In [None]:
tmdb = tmdb.drop(['budget'],axis = 1)
tmdb_test = tmdb_test.drop(['budget'],axis = 1)

# Handling Special Attributes

Some of the columns have dictionaries type  containing id, name and path in each row, so we need to extract/filter them to our requirements

In [None]:
dict_columns = ['belongs_to_collection', 'genres', 'production_companies',
                'production_countries', 'spoken_languages', 'Keywords', 'cast', 'crew']


In [None]:
import ast

In [None]:
def text_to_dict(df, columns=None):
    """Parse the stringified Python literals of the given columns into real objects.

    Parameters
    ----------
    df : DataFrame to convert; the listed columns are overwritten in place.
    columns : iterable of column names to parse. Defaults to the notebook-level
        ``dict_columns`` list.

    Returns
    -------
    The same DataFrame, for convenience.

    Missing values become ``{}``. Cells that are already parsed (lists or dicts)
    are left untouched, so the cell can be re-run without ``ast.literal_eval``
    failing on non-string input.
    """
    if columns is None:
        columns = dict_columns

    def _parse(value):
        # Already converted by an earlier run: nothing to do.
        if isinstance(value, (list, dict)):
            return value
        if isinstance(value, str):
            return ast.literal_eval(value)
        if pd.isna(value):
            return {}
        # Anything else is unexpected; let literal_eval raise as it did before.
        return ast.literal_eval(value)

    for column in columns:
        df[column] = df[column].apply(_parse)
    return df
        
tmdb = text_to_dict(tmdb)
tmdb_test = text_to_dict(tmdb_test)

# genres

In [None]:
for i, e in enumerate(tmdb['genres'][:5]):
    print(i, e)


In [None]:
print('Number of genres in films')
tmdb['genres'].apply(lambda x: len(x) if x != {} else 0).value_counts()

The genres column contains the names and ids of the genres to which each film belongs. Most films have 2-3 genres, and 5-6 genres are possible. 0 and 7 are outliers, I think. Let's extract genres! I'll create a column with all genres in the film and also separate columns for each genre.

In [None]:
list_of_genres = list(tmdb['genres'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values)

In [None]:
## Finding top Genre
#!pip install WordCloud


In [None]:
from wordcloud import WordCloud
from collections import Counter

In [None]:
plt.figure(figsize = (12, 8))
text = ' '.join([i for j in list_of_genres for i in j])
wordcloud = WordCloud(max_font_size=None, background_color='white', collocations=False,
                      width=1200, height=1000).generate(text)
plt.imshow(wordcloud)
plt.title('Top genres')
plt.axis("off")
plt.show()


In [None]:
## Using Counter

In [None]:
Counter([i for j in list_of_genres for i in j]).most_common()

#### Observation: We can see that Drama, Comedy and Thriller are in the top of genres list

#### Feature Engineering :
I'm going to create a new column with only 'genre names' by extracting only names from the 'genres' column

In [None]:
tmdb['genres_names'] = tmdb['genres'].apply(lambda x: ' '.join(sorted([i['name'] for i in x])) if x != {} else '')

In [None]:
tmdb_test['genres_names'] = tmdb_test['genres'].apply(lambda x: ' '.join(sorted([i['name'] for i in x])) if x != {} else '')

In [None]:
### So now I can drop 'genres' column

tmdb = tmdb.drop(['genres'], axis=1)
tmdb_test = tmdb_test.drop(['genres'], axis=1)

In [None]:
tmdb.head()

In [None]:
tmdb_test.head()

# belongs_to_collection

The entries in belongs_to_collection are untidy and contain information that is not needed.
Let's extract just the collection names from the parsed entries.

In [None]:
for i, e in enumerate(tmdb['belongs_to_collection'][:5]):
    print(i, e)


In [None]:
print('Number of collections in films')
tmdb['belongs_to_collection'].apply(lambda x: len(x) if x != {} else 0).value_counts()

Only 604 collections are available in this column. Lets us extract the names 

In [None]:
list_collection_names = list(tmdb['belongs_to_collection'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values)

In [None]:
Counter([i for j in list_collection_names for i in j]).most_common(15)

#### observation : From the list of collections we can see that James Bond collection movie is trending at top

#### Feature Engineering: 
I'm going to create a new column by extracting only the collection names from the 'belongs_to_collection' column.


In [None]:
tmdb['collections_names'] = tmdb['belongs_to_collection'].apply(lambda x: ' '.join(sorted([i['name'] for i in x])) if x != {} else '')

In [None]:
tmdb_test['collections_names'] = tmdb_test['belongs_to_collection'].apply(lambda x: ' '.join(sorted([i['name'] for i in x])) if x != {} else '')

In [None]:
### So now I can drop 'belongs_to_collection' column

tmdb = tmdb.drop(['belongs_to_collection'], axis=1)
tmdb_test = tmdb_test.drop(['belongs_to_collection'], axis=1)

In [None]:
tmdb.head()

# production_companies

In [None]:
for i, e in enumerate(tmdb['production_companies'][:5]):
    print(i, e)

In [None]:
print('Number of production companies in films')
tmdb['production_companies'].apply(lambda x: len(x) if x != {} else 0).value_counts()

Observation: Most films have 1-2 production companies, sometimes 3-4. But there are films with 10+ companies! Let's have a look at some of them

In [None]:
list_of_companies = list(tmdb['production_companies'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values)

In [None]:
Counter([i for j in list_of_companies for i in j]).most_common(20)

#### Observation : We can see that the top production companies are Warner Bros, Universal Pictures , Paramount Pictures,etc

#### Feature Engineering: 
I'm going to create a new column by extracting only the production company names from the 'production_companies' column.

In [None]:
tmdb['production_names'] = tmdb['production_companies'].apply(lambda x: ' '.join(sorted([i['name'] for i in x])) if x != {} else '')

In [None]:
tmdb_test['production_names'] = tmdb_test['production_companies'].apply(lambda x: ' '.join(sorted([i['name'] for i in x])) if x != {} else '')

In [None]:
### So now I can drop 'production_companies' column

tmdb = tmdb.drop(['production_companies'], axis=1)
tmdb_test = tmdb_test.drop(['production_companies'], axis=1)

In [None]:
tmdb.head()

In [None]:
tmdb_test.head()

# production_countries

In [None]:
for i, e in enumerate(tmdb['production_countries'][:5]):
    print(i, e)

In [None]:
print('Number of production countries in films')
tmdb['production_countries'].apply(lambda x: len(x) if x != {} else 0).value_counts()

In [None]:
list_of_countries = list(tmdb['production_countries'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values)
Counter([i for j in list_of_countries for i in j]).most_common(25)

#### Observation:We can see that the top production country is United States of America, followed by UK, France, etc.

In [None]:
tmdb['production_countries_names'] = tmdb['production_countries'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values

In [None]:
tmdb_test['production_countries_names'] = tmdb_test['production_countries'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values

In [None]:
### So now I can drop 'production_countries' column

tmdb = tmdb.drop(['production_countries'], axis=1)
tmdb_test = tmdb_test.drop(['production_countries'], axis=1)

In [None]:
tmdb.head()

# spoken_languages

In [None]:
for i, e in enumerate(tmdb['spoken_languages'][:5]):
    print(i, e)

In [None]:
print('Number ofspoken languages in films')
tmdb['spoken_languages'].apply(lambda x: len(x) if x != {} else 0).value_counts()

In [None]:
list_of_Spoken_Languages = list(tmdb['spoken_languages'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values)

In [None]:
plt.figure(figsize = (12, 8))
text = ' '.join([i for j in list_of_Spoken_Languages for i in j])
wordcloud = WordCloud(max_font_size=None, background_color='white', collocations=False,
                      width=1200, height=1000).generate(text)
plt.imshow(wordcloud)
plt.title('Top Spoken Languages')
plt.axis("off")
plt.show()
Counter([i for j in list_of_Spoken_Languages for i in j]).most_common(25)

#### From above we can see that English, Français, Español, etc. are the top spoken languages

In [None]:
tmdb['language_names'] = tmdb['spoken_languages'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values

In [None]:
tmdb_test['language_names'] = tmdb_test['spoken_languages'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values

In [None]:
### So now I can drop 'spoken_languages' column

tmdb = tmdb.drop(['spoken_languages'], axis=1)
tmdb_test = tmdb_test.drop(['spoken_languages'], axis=1)

In [None]:
tmdb.head()

# Cast 

In [None]:
for i, e in enumerate(tmdb['cast'][:5]):
    print(i, e)

In [None]:
print('Number of casts in films')
tmdb['cast'].apply(lambda x: len(x) if x != {} else 0).value_counts()

In [None]:
list_of_cast = list(tmdb['cast'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values)

In [None]:
Counter([i for j in list_of_cast for i in j]).most_common(25)

#### Observation: We can see that "Samuel L. Jackson" and " Robert De Niro" are the top casts in the films who have been featured maximum times

In [None]:
tmdb['cast_names'] = tmdb['cast'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values

In [None]:
tmdb_test['cast_names'] = tmdb_test['cast'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values

In [None]:
### So now I can drop 'cast' column

tmdb = tmdb.drop(['cast'], axis=1)
tmdb_test = tmdb_test.drop(['cast'], axis=1)

In [None]:
tmdb.head()

In [None]:
tmdb_test.head()

In [None]:
## Dropping Key Words and Crew as they do not contribute to revenue 

In [None]:
tmdb = tmdb.drop(['Keywords','crew'], axis=1)

In [None]:
tmdb_test =  tmdb_test.drop(['Keywords','crew'],axis=1)

# release_date

In [None]:
tmdb.columns

In [None]:
##Feature engineering

In [None]:
release_dt = pd.to_datetime(tmdb['release_date'])
# If release_date uses two-digit years, the parser maps years such as "68" to 2068.
# A release date in the future is not plausible here, so move such dates back one
# century; otherwise the derived year and weekday would be wrong.
in_future = release_dt > pd.Timestamp.now()
release_dt = release_dt.where(~in_future, release_dt - pd.DateOffset(years=100))

tmdb['release_datetime'] = release_dt
tmdb['release_day'] = tmdb['release_datetime'].dt.day
tmdb['release_month']=tmdb['release_datetime'].dt.month
tmdb['release_year'] = tmdb['release_datetime'].dt.year
tmdb['release_weekday']=tmdb['release_datetime'].dt.weekday

In [None]:
tmdb[['release_datetime', 'release_day', 'release_month', 'release_year', 'release_weekday']]

#### release day of week


In [None]:
# Number of releases per weekday (0 = Monday ... 6 = Sunday), in weekday order.
day = tmdb['release_weekday'].value_counts().sort_index()
# Pass x/y by keyword: seaborn >= 0.12 no longer accepts the data vectors positionally.
sns.barplot(x=day.index, y=day.values)
# rotation must be a number (or 'vertical'/'horizontal'), not a numeric string.
plt.gca().set_xticklabels(["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"],rotation=45)
plt.ylabel('No of releases')

In [None]:
tmdb.head()

Observation : We can see that 'Friday' has maximum number of releases as per our data

We can drop 'release_day' and 'release_month', as they do not affect our target 'revenue'. Note that the cell below also drops 'release_year', so 'release_weekday' is the only date feature that is kept.

In [None]:
tmdb = tmdb.drop(['release_date','release_datetime','release_day','release_month','release_year'], axis=1)

In [None]:
## same thing for test set
release_dt_test = pd.to_datetime(tmdb_test['release_date'])
# If release_date uses two-digit years, the parser maps years such as "68" to 2068.
# Move dates that land in the future back one century so that the derived year
# and weekday are correct.
in_future_test = release_dt_test > pd.Timestamp.now()
release_dt_test = release_dt_test.where(~in_future_test, release_dt_test - pd.DateOffset(years=100))

tmdb_test['release_datetime'] = release_dt_test
tmdb_test['release_day'] = tmdb_test['release_datetime'].dt.day
tmdb_test['release_month']=tmdb_test['release_datetime'].dt.month
tmdb_test['release_year'] = tmdb_test['release_datetime'].dt.year
tmdb_test['release_weekday']=tmdb_test['release_datetime'].dt.weekday

In [None]:
tmdb_test[['release_datetime', 'release_day', 'release_month', 'release_year', 'release_weekday']]

In [None]:
tmdb_test = tmdb_test.drop(['release_date','release_datetime','release_day','release_month','release_year'], axis=1)

#### Let's check the relation between Revenue and release week 

In [None]:
# Revenue of each film grouped by the weekday on which it was released.
sns.catplot(x='release_weekday',y='revenue',data=tmdb)
# rotation must be a number (or 'vertical'/'horizontal'), not a numeric string.
plt.gca().set_xticklabels(["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"],rotation=90)
plt.show()

#### This is odd: even though 'Friday' has the maximum number of releases, the maximum revenue is seen for films released on Wednesday. This suggests that Wednesday releases tend to earn more; note that the plot shows the release day, not the day on which people watch films.

#### Typecasting newly created features

In [None]:
tmdb.head()

In [None]:
tmdb.dtypes

In [None]:
tmdb_test.dtypes

In [None]:
for col in ['genres_names','collections_names','production_names','release_weekday']:
    tmdb[col] = tmdb[col].astype('category')

In [None]:
for col in ['genres_names','collections_names','production_names','release_weekday']:
    tmdb_test[col] = tmdb_test[col].astype('category')

In [None]:
tmdb.dtypes

In [None]:
tmdb_test.dtypes

#### let's use Dummyfication 

#### We will convert them into dummies 

#### Top genre (dummyfication)

In [None]:
# The 15 most frequent genres in the training data become 0/1 indicator columns.
top_genre = [m[0] for m in Counter([i for j in list_of_genres for i in j]).most_common(15)]
for g in top_genre:
    # genres_names is a space-joined string of genre names, so `g in x` is a substring test.
    tmdb['genre_' + g] = tmdb['genres_names'].apply(lambda x: 1 if g in x else 0)
    # Use the same 'genre_' prefix on the test set (it was 'language_' before) so that
    # train and test expose identical feature names.
    tmdb_test['genre_' + g] = tmdb_test['genres_names'].apply(lambda x: 1 if g in x else 0)
tmdb = tmdb.drop(['genres_names'], axis=1)
tmdb_test = tmdb_test.drop(['genres_names'], axis=1)

#### Top collection names (dummyfication)

In [None]:
top_collections = [m[0] for m in Counter([i for j in list_collection_names for i in j]).most_common(15)]
for g in top_collections:
    tmdb['collection_' + g] = tmdb['collections_names'].apply(lambda x: 1 if g in x else 0)
for g in top_collections:
    tmdb_test['collection_' + g] = tmdb_test['collections_names'].apply(lambda x: 1 if g in x else 0)
tmdb = tmdb.drop(['collections_names'], axis=1)
tmdb_test = tmdb_test.drop(['collections_names'], axis=1)

#### Top production countries

In [None]:
top_production_countries = [m[0] for m in Counter([i for j in list_of_countries for i in j]).most_common(25)]
for g in top_production_countries:
    tmdb['production_country_name' + g] = tmdb['production_countries_names'].apply(lambda x: 1 if g in x else 0)


In [None]:
for g in top_production_countries:
    tmdb_test['production_country_name' + g] = tmdb_test['production_countries_names'].apply(lambda x: 1 if g in x else 0)
    

In [None]:
tmdb = tmdb.drop(['production_countries_names'], axis=1)
tmdb_test = tmdb_test.drop(['production_countries_names'], axis=1)

#### Top production names

In [None]:
# The 15 most frequent production companies in the training data become 0/1 indicator columns.
top_productions = [m[0] for m in Counter([i for j in list_of_companies for i in j]).most_common(15)]
for g in top_productions:
    # Iterate over top_productions for the training set too (it looped over
    # top_collections before), so train and test get the same 'productions_*' columns.
    tmdb['productions_' + g] = tmdb['production_names'].apply(lambda x: 1 if g in x else 0)
    tmdb_test['productions_' + g] = tmdb_test['production_names'].apply(lambda x: 1 if g in x else 0)
tmdb = tmdb.drop(['production_names'], axis=1)
tmdb_test = tmdb_test.drop(['production_names'], axis=1)

#### Top language_names (dummyfication)

In [None]:
top_languages = [m[0] for m in Counter([i for j in list_of_Spoken_Languages for i in j]).most_common(30)]
for g in top_languages:
    tmdb['language_' + g] = tmdb['language_names'].apply(lambda x: 1 if g in x else 0)
for g in top_languages:
    tmdb_test['language_' + g] = tmdb_test['language_names'].apply(lambda x: 1 if g in x else 0)
    

In [None]:
tmdb = tmdb.drop(['language_names'], axis=1)
tmdb_test = tmdb_test.drop(['language_names'], axis=1)

#### Top cast_names (dummyfication)

In [None]:
top_cast = [m[0] for m in Counter([i for j in list_of_cast for i in j]).most_common(30)]
for g in top_cast:
    tmdb['cast_' + g] = tmdb['cast_names'].apply(lambda x: 1 if g in x else 0)
for g in top_cast:
    tmdb_test['cast_' + g] = tmdb_test['cast_names'].apply(lambda x: 1 if g in x else 0)
    

In [None]:
tmdb = tmdb.drop(['cast_names'], axis=1)
tmdb_test = tmdb_test.drop(['cast_names'], axis=1)

In [None]:
pd.set_option('display.max_columns', None)


In [None]:
tmdb.head()

In [None]:
tmdb.shape

In [None]:
tmdb_test.head()

# Test Data

In [None]:
tmdb_test.head()

In [None]:
tmdb_test.dtypes

In [None]:
tmdb_test.shape

In [None]:
tmdb_test.isna().sum()

In [None]:
tmdb_test.replace([np.inf, -np.inf], np.nan, inplace=True)

# Split the data into train and test

In [None]:
X = tmdb.drop(['revenue'], axis=1)
y = np.log1p(tmdb['revenue'])
X_test = tmdb_test
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20,random_state=123)

In [None]:
print(X_train.shape)
print(X_val.shape)
print(y_train.shape)
print(y_val.shape)

# Split the attributes into numerical and categorical types

In [None]:
num_attr=X_train.select_dtypes(['int64','float64']).columns
num_attr

In [None]:
cat_attr = X_train.select_dtypes('category').columns
cat_attr

In [None]:
num_attr_test = X_test.select_dtypes(['int64','float64']).columns

In [None]:
num_attr_test

In [None]:
cat_attr_test = X_test.select_dtypes('category').columns
cat_attr_test

# Preparation for Model building

# Imputing missing values with mean for numerical attributes

In [None]:
imputer = SimpleImputer(strategy='mean')

imputer = imputer.fit(X_train[num_attr])

X_train[num_attr] = imputer.transform(X_train[num_attr])
X_val[num_attr] = imputer.transform(X_val[num_attr])



In [None]:
# Apply the mean imputer fitted on X_train[num_attr] to the test set.
# NOTE(review): the columns are selected with num_attr_test, not num_attr, so this
# relies on the test frame having the same numeric columns in the same order as the
# training frame — confirm the two column lists match before trusting the result.
X_test[num_attr_test] = imputer.transform(X_test[num_attr_test])


In [None]:
print(X_train.isnull().sum())
print(X_val.isnull().sum())

# Imputing missing values with mode for categorical attributes

In [None]:
imputer = SimpleImputer(strategy='most_frequent')

imputer = imputer.fit(X_train[cat_attr])

X_train[cat_attr] = imputer.transform(X_train[cat_attr])
X_val[cat_attr] = imputer.transform(X_val[cat_attr])


In [None]:
X_test[cat_attr_test] = imputer.transform(X_test[cat_attr_test])

# Standardizing the numerical attributes and One-hot encoding categorical attributes

In [None]:
# DataFrameMapper, a class for mapping pandas data frame columns to different sklearn transformations
mapper = DataFrameMapper(
  [([continuous_col], StandardScaler()) for continuous_col in num_attr] +
  [([categorical_col], OneHotEncoder(handle_unknown='error')) for categorical_col in cat_attr]
, df_out=True)

In [None]:
print(type(mapper))

In [None]:
mapper.fit(X_train)

X_train_final = mapper.transform(X_train)
X_val_final = mapper.transform(X_val)

In [None]:
# DataFrameMapper, a class for mapping pandas data frame columns to different sklearn transformations
mapper_test = DataFrameMapper(
  [([continuous_col], StandardScaler()) for continuous_col in num_attr_test] +
  [([categorical_col], OneHotEncoder(handle_unknown='error')) for categorical_col in cat_attr_test]
, df_out=True)

In [None]:
print(type(mapper_test))

In [None]:
# Fit a separate mapper on the test set and transform it.
# NOTE(review): this learns the scaling and one-hot categories from X_test itself,
# whereas the models are trained on features produced by `mapper` (fitted on X_train).
# Reusing `mapper.transform(X_test)` would keep both sets on the same scale, but it
# requires X_test to have the same columns as X_train — TODO confirm and switch.
mapper_test.fit(X_test)

X_test_final = mapper_test.transform(X_test)


In [None]:
X_test_final.head()

# Final train and val data

In [None]:
X_train_final.head()

In [None]:
X_train_final.columns

In [None]:
X_val_final.head()

In [None]:
X_val_final.columns

# Model Building

### 1. Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Model initialization
regression_model = LinearRegression()



In [None]:
# Fit the data(train the model)
regression_model.fit(X_train_final, y_train)

In [None]:
##Our model has now been trained. You can analyse each of the model’s coefficients using the following statement :
print(regression_model.coef_)

In [None]:
#A nicer way to view the coefficients is by placing them in a DataFrame. This can be done with the following statement:
pd.DataFrame(regression_model.coef_, X_train_final.columns, columns = ['Coeff'])

#### Observation : From the above coefficients, we can see that 'log_budget' has a high coefficient value, which means the budget has the highest impact on the target 'revenue'

In [None]:
# Predict
predictions = regression_model.predict(X_train_final)

#### Evaluating the model performance


In [None]:
from sklearn import metrics
# mean_squared_error returns the MSE; take the square root so that the value
# reported as "Root mean squared error" really is an RMSE (in log1p(revenue) units).
rmse = np.sqrt(mean_squared_error(y_train, predictions))
r2 = r2_score(y_train, predictions)

#### printing values

In [None]:
print('Intercept:', regression_model.intercept_)
print('Root mean squared error: ', rmse)
print('R2 score: ', r2)

In [None]:
X_test_final = X_test_final.replace([np.inf, -np.inf], 0).fillna(0)

In [None]:
predictions_test = np.expm1(regression_model.predict(X_test_final))

In [None]:
submission = pd.read_csv('../input/tmdb-box-office-prediction/sample_submission.csv')

In [None]:
submission['revenue'] = np.round(predictions_test)
submission.to_csv('submission_linear_regression.csv', index = False)

# 2. Random Forest

#### Functions to evaluate our Random Forest

In [None]:
def rmse(y_pred, y_true):
    """Return the root mean squared error between two sets of values."""
    mse = mean_squared_error(y_pred, y_true)
    return np.sqrt(mse)

def print_rf_score(model):
    """Print R2 and RMSE of a fitted model on the notebook's train and validation splits.

    Reads the notebook-level X_train_final / y_train and X_val_final / y_val.
    """
    splits = (
        ('Train', X_train_final, y_train),
        ('Valid', X_val_final, y_val),
    )
    # R2 lines first, then RMSE lines, each for train then validation.
    for label, features, target in splits:
        print(f'{label} R2:   {model.score(features, target)}')
    for label, features, target in splits:
        print(f'{label} RMSE: {rmse(model.predict(features), target)}')

In [None]:
from sklearn.ensemble import RandomForestRegressor


In [None]:
rf = RandomForestRegressor(n_estimators = 40, random_state = 25)
rf.fit(X_train_final,y_train)


In [None]:
print_rf_score(rf)

## 3. Random Forest with hyper parameters


In [None]:
# Fix random_state (same seed as the baseline forest) so that the scores, OOB estimate
# and feature importances are reproducible across runs.
rf = RandomForestRegressor(n_estimators=40, min_samples_leaf=10, max_features=0.5,
                           n_jobs=-1, oob_score=True, random_state=25)
rf.fit(X_train_final, y_train)


In [None]:
# rf was already fitted in the previous cell; refitting here only repeated the work.
print_rf_score(rf)
print(f'OOB Score:  {rf.oob_score_}')

### Feature importance

In [None]:
feature_importances = pd.DataFrame(rf.feature_importances_, index = X_train_final.columns, columns=['importance'])
feature_importances

In [None]:
feature_importances = pd.DataFrame(rf.feature_importances_ , index = X_train_final.columns, columns=['importance'])
feature_importances = feature_importances.sort_values('importance', ascending=True)
feature_importances.plot(kind = 'barh', figsize = (15,60))
plt.show()

#### Predictions for Random Forest model

In [None]:
predictions_test = np.expm1(rf.predict(X_test_final))

In [None]:
submission = pd.read_csv('../input/tmdb-box-office-prediction/sample_submission.csv')

In [None]:
submission['revenue'] = np.round(predictions_test)
submission.to_csv('submission_simple_rf.csv', index = False)

# END 