### IMBD TOP250 Movie Analysis and Recommendation

#### Load libraries

In [217]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import csv
import time
import random

#### Prepare useful information

In [2]:
web = 'https://www.imdb.com/chart/top/?ref_=nv_mv_250'
web2 = 'https://www.imdb.com/'
headers = [
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/5.0 (Windows NT 6.1; rv2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
        "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"]

header = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0'}

pattern1 = 'Country:(.*) Language:(.*) Rele.*Budget:(.*)\(.*Gro.*:(.*)Pro.*Co:(.*)See more.*Runtime:(.*)min'
pattern2 = 'Country:(.*) Language:(.*) Rele.*Gro.*:(.*)Pro.*Co:(.*)See more.*Runtime:(.*)min'
pattern3 = 'Country:(.*) Language:(.*) Rele.*Budget:(.*)\(.*Pro.*Co:(.*)See more.*Runtime:(.*)min'
pattern4 = 'Country:(.*) Language:(.*) Rele.*Gro.*:(.*)Pro.*Co:(.*)See more.*Runtime:(.*)min'

#### Function
- Parse the main page of IMBD

In [4]:
# Get soup object
def getSoup(url,header):
    req = requests.get(url,headers=header)
    bs = BeautifulSoup(req.content,'html')
    return bs

# get data
def getData(bs):
    data = bs.find('tbody', attrs = {'class': 'lister-list'})
    # get titles and urls
    film_info = data.find_all('td',attrs={'class':'titleColumn'})
    titles =  [k.find('a').text for k in film_info]
    urls =  [web2 + k.find('a').attrs['href'][:17] for k in film_info]
    years = [k.find('span').text[1:5] for k in film_info]
    # get score
    score_info = data.find_all('td',attrs={'class':'imdbRating'})
    scores = [k.text.replace('\n','') for k in score_info]
    # return data frame
    data = pd.DataFrame({'Name':titles,'Score':scores,'Year':years,'Url':urls})      
    return data

- Parse the details of movie

In [215]:
# ['USA', 'English', '25000000', '28786657', 'Castle Rock Entertainment', '142']
# country, language, budget, gross revenue, production, runtime
def getInfo_1(bs,p = pattern1):
    info1 = bs.find('div',{'class':'article','id':'titleDetails'}).find_all('div',attrs={'class':'txt-block'})
    info2 = ' '.join([s.text.replace('\n','').strip() for s in info1])
    info3 = re.findall(p,info2)
    status = 0
    if not info3: 
        info3, status = re.findall(pattern2,info2), 1
    if not info3: 
        info3, status = re.findall(pattern3,info2), 2
    d1 = [d.strip() for d in info3[0]]
    if len(d1) < 6:
        if status == 1:
            d1.insert(2,'0')
        elif status == 2:
            d1.insert(3,'0')
    temp = []
    for i,n in enumerate(d1):
        if i in (2,3):
            n = ''.join(re.findall(r'\d',n))
        elif i == 5:
            n = ''.join(re.findall(r'\d',n.split('|')[0]))
        temp.append(n)
    return temp

# ['R','Drama','2236434','7728', '225', 'Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.',
# ['Frank Darabont'],['Stephen King', 'Frank Darabont'],['Tim Robbins', 'Morgan Freeman', 'Bob Gunton']]
# Rating, Genera, Nums of Score, Nums of Review, Nums of critic, Story, Director, Writer, Stars
def getInfo_2(bs):
    d2 = bs.find('div',{'class':'title_wrapper'}).find_all('div',{'class':'subtext'})
    t1 = [info.strip() for i,info in enumerate(d2[0].text.replace('\n','').split('|')) if i in (0,2)]
    d3 = bs.find('div',{'class':'ratings_wrapper'}).a.text
    t2 = [''.join(re.findall(r'\d',d3))]
    d4 = bs.find('div',{'class':'titleReviewBarItem titleReviewbarItemBorder'})
    d4 = d4.span.text.replace('\n','').split('|')
    t3 = [''.join(re.findall(r'\d',d)) for d in d4]
    d5 = bs.find('div',{'class':'plot_summary'}).find_all('div')
    summary = [x.text.strip().replace('\n','') for x in d5]
    t4 = []
    for i,n in enumerate(summary):
        if i == 1:
            names = re.findall(r'Dire.*:(.*)',n)[0].split(',')
            res = [i.split('(')[0].strip() for i in names]
            n = ', '.join(res)
        elif i == 2:
            names = re.findall(r'Write.*:(.*)',n)[0].split(',')
            res = [i.split('(')[0].strip() for i in names]
            if len(res) > 1:
                temp = re.findall(r'(.*)\|',res[1])
                if temp: res[1] = temp[0].strip()
            n = ', '.join(res)
        elif i == 3:
            stars = re.findall(r'Stars:(.*)\|See',n)
            res = [s.strip() for s in stars[0].split(',')]
            n = ', '.join(res)
        t4.append(n)
    return t1 + t2 + t3 + t4

#### Get the data of main page

In [6]:
bs = getSoup(web,header)
data = getData(bs)
data.head()

Unnamed: 0,Name,Score,Year,Url
0,The Shawshank Redemption,9.2,1994,https://www.imdb.com//title/tt0111161/
1,The Godfather,9.1,1972,https://www.imdb.com//title/tt0068646/
2,The Godfather: Part II,9.0,1974,https://www.imdb.com//title/tt0071562/
3,The Dark Knight,9.0,2008,https://www.imdb.com//title/tt0468569/
4,12 Angry Men,8.9,1957,https://www.imdb.com//title/tt0050083/


In [216]:
new_url = data['Url'].values[0]
print(new_url)
data['Url'][:5].values

https://www.imdb.com//title/tt0111161/


array(['https://www.imdb.com//title/tt0111161/',
       'https://www.imdb.com//title/tt0068646/',
       'https://www.imdb.com//title/tt0071562/',
       'https://www.imdb.com//title/tt0468569/',
       'https://www.imdb.com//title/tt0050083/'], dtype=object)

#### Save the data as 'test.csv'

In [218]:
columns = ['Name','Country','Language','Budget','Gross_revenue','Production','Runtime','Rating',
          'Genre','Nums_score','Nums_review','Nums_critic','Story','Director','Writer','Star']

with open("test.csv","w",newline='') as file: 
    writer = csv.writer(file)
    # columns_name
    writer.writerow(columns)
    for i,u in enumerate(data['Url'][:].values):
        page = getSoup(u,header)
        detail = getInfo_1(page) + getInfo_2(page)
        content = [data['Name'][i]] + detail
        writer.writerow(content)

- Check the test.csv and manually change two odd numbers, then save as data.csv

In [226]:
data2 = pd.read_csv('data.csv',encoding='latin-1')
data2.head()

Unnamed: 0,Name,Country1,Country2,Country3,Country4,Country5,Country6,Language1,Language2,Language3,...,Nums_review,Nums_critic,Story,Director1,Director2,Writer1,Writer2,Star1,Star2,Star3
0,The Shawshank Redemption,USA,,,,,,English,,,...,7738,225,Two imprisoned men bond over a number of years...,Frank Darabont,,Stephen King,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton
1,The Godfather,USA,,,,,,English,Italian,Latin,...,3726,248,The aging patriarch of an organized crime dyna...,Francis Ford Coppola,,Mario Puzo,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan
2,The Godfather: Part II,USA,,,,,,English,Italian,Spanish,...,964,176,The early life and career of Vito Corleone in ...,Francis Ford Coppola,,Francis Ford Coppola,Mario Puzo,Al Pacino,Robert De Niro,Robert Duvall
3,The Dark Knight,USA,UK,,,,,English,Mandarin,,...,6697,421,When the menace known as the Joker wreaks havo...,Christopher Nolan,,Jonathan Nolan,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart
4,12 Angry Men,USA,,,,,,English,,,...,1511,140,A jury holdout attempts to prevent a miscarria...,Sidney Lumet,,Reginald Rose,Reginald Rose,Henry Fonda,Lee J. Cobb,Martin Balsam


#### Form a new dataset by merge data and data2

In [227]:
result = pd.merge(data,data2,how='left',on=['Name'])
result.head()

Unnamed: 0,Name,Score,Year,Url,Country1,Country2,Country3,Country4,Country5,Country6,...,Nums_review,Nums_critic,Story,Director1,Director2,Writer1,Writer2,Star1,Star2,Star3
0,The Shawshank Redemption,9.2,1994,https://www.imdb.com//title/tt0111161/,USA,,,,,,...,7738,225,Two imprisoned men bond over a number of years...,Frank Darabont,,Stephen King,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton
1,The Godfather,9.1,1972,https://www.imdb.com//title/tt0068646/,USA,,,,,,...,3726,248,The aging patriarch of an organized crime dyna...,Francis Ford Coppola,,Mario Puzo,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan
2,The Godfather: Part II,9.0,1974,https://www.imdb.com//title/tt0071562/,USA,,,,,,...,964,176,The early life and career of Vito Corleone in ...,Francis Ford Coppola,,Francis Ford Coppola,Mario Puzo,Al Pacino,Robert De Niro,Robert Duvall
3,The Dark Knight,9.0,2008,https://www.imdb.com//title/tt0468569/,USA,UK,,,,,...,6697,421,When the menace known as the Joker wreaks havo...,Christopher Nolan,,Jonathan Nolan,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart
4,12 Angry Men,8.9,1957,https://www.imdb.com//title/tt0050083/,USA,,,,,,...,1511,140,A jury holdout attempts to prevent a miscarria...,Sidney Lumet,,Reginald Rose,Reginald Rose,Henry Fonda,Lee J. Cobb,Martin Balsam


#### Save new dataset as movie.csv for Tableau visualization

In [228]:
result.to_csv('movie.csv')

In [230]:
result.describe()

Unnamed: 0,Gross_revenue,Runtime,Nums_score,Nums_review,Nums_critic
count,250.0,250.0,250.0,250.0,250.0
mean,190173600.0,129.82,518900.5,1059.748,233.24
std,333340600.0,32.410327,443265.5,1258.530901,154.014389
min,0.0,45.0,25259.0,53.0,5.0
25%,2229591.0,108.0,155218.5,349.0,130.0
50%,46546570.0,126.0,373298.5,682.0,184.0
75%,243644000.0,145.75,773121.2,1242.75,290.25
max,2797801000.0,321.0,2236965.0,10004.0,842.0


In [248]:
for i,n in enumerate(list(result.columns)):
    print(i,n)

0 Name
1 Score
2 Year
3 Url
4 Country1
5 Country2
6 Country3
7 Country4
8 Country5
9 Country6
10 Language1
11 Language2
12 Language3
13 Language4
14 Language5
15 Language6
16 Gross_revenue
17 Production1
18 Production2
19 Production3
20 Runtime
21 Rating
22 Genre1
23 Genre2
24 Genre3
25 Nums_score
26 Nums_review
27 Nums_critic
28 Story
29 Director1
30 Director2
31 Writer1
32 Writer2
33 Star1
34 Star2
35 Star3


#### Fill Nan values as zero

In [241]:
train = result.fillna(0)

#### Choose all text features and merge them

In [256]:
index = list(range(4,16)) + list(range(17,20)) + list(range(21,25)) + list(range(28,36))
index = set(index)
print(index)

{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 21, 22, 23, 24, 28, 29, 30, 31, 32, 33, 34, 35}


In [262]:
temp = []
for i in range(250):
    info = [m for m in list(train.iloc[i,:])]
    infos = [m.strip() for i,m in enumerate(info) if i in index and m]
    text = ' '.join(infos)
    temp.append(text)

In [263]:
len(temp)

250

In [275]:
df1 = train[['Name']]
df1['Text'] = temp
df1['Text'].apply(lambda x:x.replace(',',''))
df1['Text'].apply(lambda x:x.replace('.',''))
df1.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Name,Text
0,The Shawshank Redemption,USA English Castle Rock Entertainment R Drama ...
1,The Godfather,USA English Italian Latin Paramount Pictures A...
2,The Godfather: Part II,USA English Italian Spanish Latin Sicilian Par...
3,The Dark Knight,USA UK English Mandarin Warner Bros. Legendary...
4,12 Angry Men,USA English Orion-Nova Productions Approved Cr...


- One movie could be represent by one sentence

In [279]:
for test in df1['Text'][0:5]:
    print(test,'\n')

USA English Castle Rock Entertainment R Drama Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency. Frank Darabont Stephen King Frank Darabont Tim Robbins Morgan Freeman Bob Gunton 

USA English Italian Latin Paramount Pictures Alfran Productions R Crime Drama The aging patriarch of an organized crime dynasty transfers control of his clandestine empire to his reluctant son. Francis Ford Coppola Mario Puzo Francis Ford Coppola Marlon Brando Al Pacino James Caan 

USA English Italian Spanish Latin Sicilian Paramount Pictures The Coppola Company American Zoetrope R Crime Drama The early life and career of Vito Corleone in 1920s New York City is portrayed, while his son, Michael, expands and tightens his grip on the family crime syndicate. Francis Ford Coppola Francis Ford Coppola Mario Puzo Al Pacino Robert De Niro Robert Duvall 

USA UK English Mandarin Warner Bros. Legendary Entertainment Syncopy PG-13 Action Crime Drama W

### Build Recommendation System

#### Load libraries

In [283]:
from gensim.models import Word2Vec
import multiprocessing

#### Prepare corpus and build Word2vec model

In [296]:
corpus = [t.split(' ') for t in temp]
print(corpus)

[['USA', 'English', 'Castle', 'Rock', 'Entertainment', 'R', 'Drama', 'Two', 'imprisoned', 'men', 'bond', 'over', 'a', 'number', 'of', 'years,', 'finding', 'solace', 'and', 'eventual', 'redemption', 'through', 'acts', 'of', 'common', 'decency.', 'Frank', 'Darabont', 'Stephen', 'King', 'Frank', 'Darabont', 'Tim', 'Robbins', 'Morgan', 'Freeman', 'Bob', 'Gunton'], ['USA', 'English', 'Italian', 'Latin', 'Paramount', 'Pictures', 'Alfran', 'Productions', 'R', 'Crime', 'Drama', 'The', 'aging', 'patriarch', 'of', 'an', 'organized', 'crime', 'dynasty', 'transfers', 'control', 'of', 'his', 'clandestine', 'empire', 'to', 'his', 'reluctant', 'son.', 'Francis', 'Ford', 'Coppola', 'Mario', 'Puzo', 'Francis', 'Ford', 'Coppola', 'Marlon', 'Brando', 'Al', 'Pacino', 'James', 'Caan'], ['USA', 'English', 'Italian', 'Spanish', 'Latin', 'Sicilian', 'Paramount', 'Pictures', 'The', 'Coppola', 'Company', 'American', 'Zoetrope', 'R', 'Crime', 'Drama', 'The', 'early', 'life', 'and', 'career', 'of', 'Vito', 'Corle

In [297]:
model = Word2Vec(corpus, size=50, window=5, min_count=1, workers=multiprocessing.cpu_count(),iter=10)

  "C extension not loaded, training will be slow. "


#### Word2Vec model's method

- word vectors

In [301]:
model.wv['Nolan']

array([-0.06258275,  0.04238755, -0.01123265,  0.18025228, -0.02378378,
        0.01619811, -0.14295757, -0.03961238,  0.00078774,  0.0052074 ,
       -0.04341641,  0.09207518,  0.02324331, -0.01271566,  0.11685536,
       -0.03095049, -0.0825461 ,  0.04561036, -0.22359745, -0.18868926,
        0.07883931,  0.12923737, -0.03889457, -0.17834055, -0.19388248,
        0.16701464,  0.07879965,  0.03129641, -0.14563966,  0.07747281,
        0.08989087, -0.07582206, -0.18439564,  0.06183587, -0.14444388,
        0.12470742,  0.10156769,  0.07324093, -0.10319933,  0.18476109,
        0.01217167,  0.0323372 , -0.05350762,  0.07793166, -0.0384212 ,
       -0.00196554,  0.05191206,  0.07672918,  0.21816877,  0.00832309],
      dtype=float32)

- The similarity between two words

In [302]:
model.similarity('Nolan', 'man')

  """Entry point for launching an IPython kernel.


0.9940369

- The most similar ten words of the certain word

In [304]:
model.most_similar('Nolan')

  """Entry point for launching an IPython kernel.


[('Christopher', 0.9963294267654419),
 ('a', 0.9960401058197021),
 ('R', 0.9958541393280029),
 ('his', 0.9956852793693542),
 ('from', 0.9956228733062744),
 ('to', 0.9955018758773804),
 ('an', 0.9953747391700745),
 ('Action', 0.9952875971794128),
 ('and', 0.9951527118682861),
 ('Crime', 0.9951463937759399)]

- The similarity between two sentence

In [311]:
model.n_similarity(corpus[20],corpus[1])

  """Entry point for launching an IPython kernel.


0.9999112

#### Create similarity matrix

In [312]:
ma = [[0]*250 for _ in range(250)]
for i in range(250):
    for j in range(250):
        ma[i][j] = model.n_similarity(corpus[i],corpus[j])

  after removing the cwd from sys.path.


In [305]:
import numpy as np

In [317]:
np.matrix(ma)
simi_matrix = pd.DataFrame(np.matrix(ma))

In [318]:
simi_matrix.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,240,241,242,243,244,245,246,247,248,249
0,1.0,0.999895,0.999905,0.999899,0.999874,0.999918,0.999893,0.999885,0.999839,0.999893,...,0.999862,0.999897,0.999873,0.999858,0.999892,0.999895,0.999789,0.999848,0.999838,0.99978
1,0.999895,1.0,0.999959,0.999903,0.999935,0.999907,0.999906,0.999928,0.999876,0.999917,...,0.99991,0.99992,0.999903,0.999871,0.999918,0.99993,0.999837,0.999887,0.999898,0.999808
2,0.999905,0.999959,1.0,0.999923,0.999915,0.999914,0.999947,0.99994,0.99988,0.999944,...,0.999922,0.999933,0.999927,0.999904,0.99992,0.99995,0.999811,0.999877,0.999897,0.999792
3,0.999899,0.999903,0.999923,1.0,0.999906,0.99989,0.999897,0.999919,0.999897,0.999914,...,0.999934,0.999934,0.999917,0.999877,0.999892,0.999937,0.999787,0.999875,0.999871,0.999788
4,0.999874,0.999935,0.999915,0.999906,1.0,0.999897,0.999892,0.999935,0.999892,0.999908,...,0.999926,0.999937,0.999911,0.999873,0.999922,0.99992,0.999844,0.999857,0.999882,0.999803


#### Recommend the most similar movie

In [331]:
def recommend(name,nums=8):
    names = list(data['Name'].values)
    index = names.index(name)
    vals = list(simi_matrix[index].values)
    value = sorted(vals,reverse=True)
    res = []
    for i in range(1,nums+1):
        name_index = vals.index(value[i])
        res.append(names[name_index])
    return res

In [333]:
recommend('The Godfather')

['The Godfather: Part II',
 'The Thing',
 'Ford v Ferrari',
 'V for Vendetta',
 'Taxi Driver',
 'In the Name of the Father',
 'The Pianist',
 'Psycho']

In [334]:
recommend('Inception')

['The Dark Knight',
 'The Dark Knight Rises',
 'Mad Max: Fury Road',
 'Catch Me If You Can',
 'The Prestige',
 'Ford v Ferrari',
 'The Wolf of Wall Street',
 'The Elephant Man']

In [335]:
recommend('The Lord of the Rings: The Return of the King')

['The Lord of the Rings: The Two Towers',
 'The Lord of the Rings: The Fellowship of the Ring',
 'Léon: The Professional',
 'The Godfather: Part II',
 'Prisoners',
 'Toy Story 3',
 'The Green Mile',
 'Amadeus']

### Score prediction

#### Prepare TF-IDF features

In [340]:
from sklearn.feature_extraction.text import TfidfVectorizer

TV = TfidfVectorizer(max_features=100)
X = TV.fit_transform(temp)

In [341]:
X

<250x100 sparse matrix of type '<class 'numpy.float64'>'
	with 3915 stored elements in Compressed Sparse Row format>

#### Create new train data for machine learning model

In [346]:
train2 = pd.DataFrame.sparse.from_spmatrix(X)
train2['Name'] = train['Name'].values
train2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,Name
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.265385,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,The Shawshank Redemption
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.203629,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,The Godfather
2,0.0,0.0,0.0,0.0,0.0,0.0,0.244592,0.0,0.197571,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,The Godfather: Part II
3,0.154879,0.139394,0.0,0.0,0.0,0.0,0.0,0.0,0.155623,0.0,...,0.0,0.0,0.15656,0.160102,0.0,0.0,0.0,0.0,0.0,The Dark Knight
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12 Angry Men


In [347]:
train.columns

Index(['Name', 'Score', 'Year', 'Url', 'Country1', 'Country2', 'Country3',
       'Country4', 'Country5', 'Country6', 'Language1', 'Language2',
       'Language3', 'Language4', 'Language5', 'Language6', 'Gross_revenue',
       'Production1', 'Production2', 'Production3', 'Runtime', 'Rating',
       'Genre1', 'Genre2', 'Genre3', 'Nums_score', 'Nums_review',
       'Nums_critic', 'Story', 'Director1', 'Director2', 'Writer1', 'Writer2',
       'Star1', 'Star2', 'Star3'],
      dtype='object')

In [367]:
train3 = train[['Name','Score','Year','Gross_revenue','Runtime','Nums_score','Nums_review','Nums_critic']]
train3['Score'] = train3['Score'].apply(lambda x:float(str(x)))
train3['Year'] = train3['Year'].apply(lambda x:int(str(x)))
train3['Gross_revenue'] = train3['Gross_revenue'].apply(lambda x:int(str(x)))
train3.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,Name,Score,Year,Gross_revenue,Runtime,Nums_score,Nums_review,Nums_critic
0,The Shawshank Redemption,9.2,1994,28786657,142,2236965,7738,225
1,The Godfather,9.1,1972,246120974,175,1542452,3726,248
2,The Godfather: Part II,9.0,1974,48035783,202,1078827,964,176
3,The Dark Knight,9.0,2008,1004934033,152,2207569,6697,421
4,12 Angry Men,8.9,1957,576,96,654121,1511,140


In [369]:
train3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 250 entries, 0 to 249
Data columns (total 8 columns):
Name             250 non-null object
Score            250 non-null float64
Year             250 non-null int64
Gross_revenue    250 non-null int64
Runtime          250 non-null int64
Nums_score       250 non-null int64
Nums_review      250 non-null int64
Nums_critic      250 non-null int64
dtypes: float64(1), int64(6), object(1)
memory usage: 27.6+ KB


- Look the correaltion of numerical variables

In [377]:
train3.corr()

Unnamed: 0,Score,Year,Gross_revenue,Runtime,Nums_score,Nums_review,Nums_critic
Score,1.0,0.003983,0.231971,0.185795,0.654957,0.524692,0.105675
Year,0.003983,1.0,0.383626,0.166454,0.364457,0.313838,0.505418
Gross_revenue,0.231971,0.383626,1.0,0.12689,0.501424,0.625136,0.520099
Runtime,0.185795,0.166454,0.12689,1.0,0.163108,0.190406,0.068576
Nums_score,0.654957,0.364457,0.501424,0.163108,1.0,0.699194,0.467373
Nums_review,0.524692,0.313838,0.625136,0.190406,0.699194,1.0,0.482882
Nums_critic,0.105675,0.505418,0.520099,0.068576,0.467373,0.482882,1.0


In [370]:
train_data = pd.merge(left=train3,right=train2,on=['Name'])
train_data.head()

Unnamed: 0,Name,Score,Year,Gross_revenue,Runtime,Nums_score,Nums_review,Nums_critic,0,1,...,90,91,92,93,94,95,96,97,98,99
0,The Shawshank Redemption,9.2,1994,28786657,142,2236965,7738,225,0.0,0.0,...,0.232978,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,The Godfather,9.1,1972,246120974,175,1542452,3726,248,0.0,0.0,...,0.123231,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,The Godfather: Part II,9.0,1974,48035783,202,1078827,964,176,0.0,0.0,...,0.086723,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,The Dark Knight,9.0,2008,1004934033,152,2207569,6697,421,0.154879,0.139394,...,0.06831,0.0,0.0,0.15656,0.160102,0.0,0.0,0.0,0.0,0.0
4,12 Angry Men,8.9,1957,576,96,654121,1511,140,0.0,0.0,...,0.214965,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Split X, y and randomly split them to trainset, testset

In [351]:
cols = [col for col in list(train_data.columns) if col not in ('Name','Score')]

In [372]:
from sklearn.model_selection import ShuffleSplit
X = train_data[cols].values
y = train_data['Score'].values
rs = ShuffleSplit(n_splits=1, test_size=0.3, random_state=0)
rs.get_n_splits(X)

1

In [376]:
for train_index, test_index in rs.split(X):
    print("TRAIN:", train_index) 
    print("TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

TRAIN: [106 243   4 219  61 124  52  66  26 109 241 222  40  13   3 244  24  30
 223  60  56 137 129  19 187 134  54 200  80  51   2 249 104 184  86  10
 168 178 233  41  14  27  50 226  20 206  46 171 245 123 145 215  62 138
 190 153 135  43 196 144 159 112 182  98 158  93 149 239 113   0  94  95
  69  49  48  85 247 141  23 220 143  78 100 131 201 248   6  68  84 121
 234 189 212 207 191  91 213  11 119 102  35  57 169  65   1 120 199  42
 105 132 236  17  38 133  53 164 214 128  34  28 183 114 163 151  31 205
 127 185 221 232  32 167 142 209 147  29 177  99  82 246 175  79 197 208
 115 148 242  72  77  25 165  81 235 174 238  39 230 193  58 140  88 216
  70  87  36  21 211   9 103 195  67 192 117  47 172]
TEST: [225 122  92 157 154 161 198  83  63 155 218 231 108 186 116  73 203 139
 152  96 156  45 237 111 150  90   8  55 194  37 204  76 110 173 166 136
 130   5  22 126 118  12 107 176  89  97 162  44  64 179  71 160 180  75
  59 101 229  18 188  15 217  74   7 227 181 228  33 202 

#### Build Linear Regression model

In [380]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

- Find the mean squared error

In [386]:
from sklearn.metrics import mean_squared_error
score_predictions = lin_reg.predict(X_train)
lin_mse = mean_squared_error(y_train, score_predictions)
print('Train set: ',lin_mse)
score_predictions = lin_reg.predict(X_test)
lin_mse = mean_squared_error(y_test, score_predictions)
print('Test set: ',lin_mse)

Train set:  0.0076799561619233
Test set:  0.06572300299669807


- Find the weights of parameters and R square

In [409]:
temp = []
for w,c in zip(list(lin_reg.coef_),cols):
    temp.append([w,c])
temp.sort(key=lambda x:abs(x[0]),reverse=True)

In [410]:
temp[:10]

[[-0.8635054618888202, 25],
 [-0.7757719506433561, 2],
 [-0.7497754718535969, 14],
 [0.640416302132077, 79],
 [0.6055665789989417, 98],
 [0.5682800395808939, 50],
 [0.48733333985938065, 20],
 [-0.41506252020553475, 97],
 [0.4011418465266935, 16],
 [0.35903749112156663, 76]]

In [421]:
lin_reg.score(X_train, y_train)

0.8624054280789881

In [422]:
lin_reg.score(X_test, y_test)

-0.4983054707644734

#### Build Random Forest Regression model

In [387]:
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

- Find the mean squared error

In [411]:
score_predictions = forest_reg.predict(X_train)
rf_mse = mean_squared_error(y_train, score_predictions)
print('Train set: ',rf_mse)
score_predictions = forest_reg.predict(X_test)
rf_mse = mean_squared_error(y_test, score_predictions)
print('Test set: ',rf_mse)

Train set:  0.0039638685714288215
Test set:  0.025947600000000157


- Find the important features

In [419]:
forest_reg.feature_importances_

array([2.70435199e-02, 1.90088355e-02, 1.95372113e-02, 4.62367455e-01,
       1.25743143e-01, 2.99253405e-02, 1.52874584e-03, 2.36673124e-03,
       1.29341797e-03, 1.20341055e-03, 1.67061998e-04, 2.59964492e-03,
       5.74071849e-04, 1.52480572e-03, 2.94805295e-02, 9.25217204e-04,
       1.36632197e-03, 1.51302572e-03, 3.04555338e-04, 4.70226933e-04,
       5.37030427e-04, 1.53731478e-03, 1.36489332e-02, 1.10388564e-03,
       1.99129693e-02, 2.70581167e-04, 1.43794730e-03, 1.98996859e-03,
       7.50279220e-03, 1.12148471e-03, 6.31285731e-03, 1.32032530e-02,
       3.86908484e-03, 2.28605068e-03, 3.10886148e-03, 8.25692397e-04,
       1.15726846e-03, 4.21997145e-03, 2.98909798e-04, 2.40755267e-03,
       5.29128084e-03, 1.39728767e-03, 8.85604180e-03, 5.79588687e-04,
       2.85094007e-03, 1.38801962e-03, 4.36526159e-03, 7.48464011e-03,
       4.28762227e-04, 1.08036717e-02, 8.30725759e-04, 8.78248127e-03,
       4.81333681e-04, 1.01734452e-03, 1.84536789e-03, 4.34598047e-04,
      

In [416]:
temp = []
for w,c in zip(list(forest_reg.feature_importances_),cols):
    temp.append([w,c])
temp.sort(key=lambda x:x[0],reverse=True)

In [418]:
temp[:10]

[[0.4623674553509341, 'Nums_score'],
 [0.12574314294345068, 'Nums_review'],
 [0.029925340511104744, 'Nums_critic'],
 [0.029480529516424877, 8],
 [0.027043519882957147, 'Year'],
 [0.019962412030952007, 81],
 [0.019912969271263656, 18],
 [0.01953721131329042, 'Runtime'],
 [0.019008835517725434, 'Gross_revenue'],
 [0.015063214426085415, 86]]

#### Comparison of two model

In [426]:
pred_data = data[['Name']]
pred_data['Score'] = y
pred_data['LR_Predict'] = lin_reg.predict(X)
pred_data['RF_Predict'] = forest_reg.predict(X)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [432]:
pred_data.head(15)

Unnamed: 0,Name,Score,LR_Predict,RF_Predict
0,The Shawshank Redemption,9.2,9.211239,9.084
1,The Godfather,9.1,9.113875,8.967
2,The Godfather: Part II,9.0,8.766778,8.797
3,The Dark Knight,9.0,8.972794,8.882
4,12 Angry Men,8.9,8.639945,8.61
5,Schindler's List,8.9,8.80817,8.509
6,The Lord of the Rings: The Return of the King,8.9,8.918161,8.87
7,Pulp Fiction,8.8,8.838876,8.942
8,"The Good, the Bad and the Ugly",8.8,8.675652,8.317
9,The Lord of the Rings: The Fellowship of the Ring,8.8,8.842794,8.845


In [430]:
mean_squared_error(y, lin_reg.predict(X))

0.025092870212355737

In [431]:
mean_squared_error(y, forest_reg.predict(X))

0.010558988000000224