### IMDb Top 250 Movie Analysis and Recommendation

#### Load libraries

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import csv
import time
import random

#### Prepare useful information

In [None]:
# Chart page of the IMDb Top 250 and the site root that relative movie
# links from the chart are appended to.
web = 'https://www.imdb.com/chart/top/?ref_=nv_mv_250'
web2 = 'https://www.imdb.com/'
# Pool of alternative User-Agent strings.
# NOTE(review): nothing visible in this notebook reads `headers`; only
# `header` below is passed to the requests.
headers = [
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/5.0 (Windows NT 6.1; rv2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
        "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"]

# Request headers sent with every page download.
header = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0'}

# Regular expressions applied to the flattened "Details" text of a movie page.
# They are raw strings so that `\(` is a regex escape for a literal "(" and
# not an invalid string escape sequence.
#   pattern1: country, language, budget, gross, production company, runtime
#   pattern2: same without the budget group
#   pattern3: same without the gross group
pattern1 = r'Country:(.*) Language:(.*) Rele.*Budget:(.*)\(.*Gro.*:(.*)Pro.*Co:(.*)See more.*Runtime:(.*)min'
pattern2 = r'Country:(.*) Language:(.*) Rele.*Gro.*:(.*)Pro.*Co:(.*)See more.*Runtime:(.*)min'
pattern3 = r'Country:(.*) Language:(.*) Rele.*Budget:(.*)\(.*Pro.*Co:(.*)See more.*Runtime:(.*)min'
# NOTE(review): pattern4 is character-for-character identical to pattern2 and
# is not referenced by the visible code; kept so existing references keep working.
pattern4 = r'Country:(.*) Language:(.*) Rele.*Gro.*:(.*)Pro.*Co:(.*)See more.*Runtime:(.*)min'

#### Function
- Parse the main page of IMDb

In [None]:
# Get soup object
def getSoup(url,header,timeout=30):
    """Download `url` and return the page parsed by BeautifulSoup.

    url     -- address of the page to download
    header  -- dict of HTTP request headers handed to requests.get
    timeout -- seconds to wait for the server (optional, default 30);
               without a timeout a stalled connection would block forever

    Raises requests.exceptions.HTTPError for a 4xx/5xx response, so an error
    page is never passed on to the parsing functions, and the usual
    requests exceptions (e.g. Timeout) on network failure.
    """
    req = requests.get(url,headers=header,timeout=timeout)
    # Fail here with a clear HTTP error instead of later on a missing tag
    req.raise_for_status()
    bs = BeautifulSoup(req.content,'html')
    return bs

# get data
def getData(bs):
    """Build a DataFrame with Name, Score, Year and Url of every chart row.

    bs -- parsed chart page; must contain a <tbody class="lister-list">
    """
    table = bs.find('tbody', attrs={'class': 'lister-list'})

    # title cells carry the link (name + relative url) and the "(year)" span
    titles, urls, years = [], [], []
    for cell in table.find_all('td', attrs={'class': 'titleColumn'}):
        link = cell.find('a')
        titles.append(link.text)
        # first 17 characters of the href are kept and appended to the site root
        urls.append(web2 + link.attrs['href'][:17])
        # characters 1..4 of the span text, i.e. the year without brackets
        years.append(cell.find('span').text[1:5])

    # rating cells: text with the line breaks removed
    scores = []
    for cell in table.find_all('td', attrs={'class': 'imdbRating'}):
        scores.append(cell.text.replace('\n', ''))

    return pd.DataFrame({'Name': titles, 'Score': scores, 'Year': years, 'Url': urls})

- Parse the details of each movie

In [None]:
# ['USA', 'English', '25000000', '28786657', 'Castle Rock Entertainment', '142']
# country, language, budget, gross revenue, production, runtime
def getInfo_1(bs,p = pattern1):
    """Extract the "Details" facts from a parsed movie page.

    bs -- parsed movie page containing <div class="article" id="titleDetails">
    p  -- regular expression tried first; pattern2 and then pattern3 are
          used as fallbacks when it does not match

    Returns a list of strings in the order
    [country, language, budget, gross revenue, production, runtime].
    Budget and gross revenue are reduced to their digits; runtime is reduced
    to the digits found before the first '|'. When the pattern2 fallback
    matched (no budget group) '0' is inserted as budget; when the pattern3
    fallback matched (no gross group) '0' is inserted as gross revenue.

    Raises IndexError when none of the three patterns matches. The exception
    type is the one the original code produced; it now carries a message.
    """
    blocks = bs.find('div',{'class':'article','id':'titleDetails'}).find_all('div',attrs={'class':'txt-block'})
    text = ' '.join(s.text.replace('\n','').strip() for s in blocks)

    # Try the patterns in order; `status` records which one matched
    # (0 = p, 1 = pattern2, 2 = pattern3).
    for status, candidate in enumerate((p, pattern2, pattern3)):
        found = re.findall(candidate, text)
        if found:
            break
    else:
        raise IndexError('getInfo_1: none of the detail patterns matched the page text')

    fields = [d.strip() for d in found[0]]
    if len(fields) < 6:
        # Pad the group that the fallback pattern does not capture
        if status == 1:
            fields.insert(2,'0')
        elif status == 2:
            fields.insert(3,'0')

    cleaned = []
    for pos, value in enumerate(fields):
        if pos in (2,3):
            # budget / gross revenue: keep digits only
            value = ''.join(re.findall(r'\d',value))
        elif pos == 5:
            # runtime: digits of the part before the first '|'
            value = ''.join(re.findall(r'\d',value.split('|')[0]))
        cleaned.append(value)
    return cleaned

# ['R','Drama','2236434','7728', '225', 'Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.',
# ['Frank Darabont'],['Stephen King', 'Frank Darabont'],['Tim Robbins', 'Morgan Freeman', 'Bob Gunton']]
# Rating, Genre, Nums of Score, Nums of Review, Nums of critic, Story, Director, Writer, Stars
def getInfo_2(bs):
    """Extract header, rating, review and plot-summary facts from a movie page.

    bs -- parsed movie page

    Returns one flat list: t1 + t2 + t3 + t4 (see the inline comments).
    Note that director, writer and stars are each returned as a single
    comma-separated string, not as the nested lists shown in the example
    comment above.

    Raises AttributeError / IndexError when an expected tag or text fragment
    is missing from the page (nothing is caught here).
    """
    d2 = bs.find('div',{'class':'title_wrapper'}).find_all('div',{'class':'subtext'})
    # t1: items 0 and 2 of the '|'-separated subtext line
    # (per the example above: rating and genre)
    t1 = [info.strip() for i,info in enumerate(d2[0].text.replace('\n','').split('|')) if i in (0,2)]
    d3 = bs.find('div',{'class':'ratings_wrapper'}).a.text
    # t2: all digits of the first link text inside the ratings wrapper
    t2 = [''.join(re.findall(r'\d',d3))]
    d4 = bs.find('div',{'class':'titleReviewBarItem titleReviewbarItemBorder'})
    d4 = d4.span.text.replace('\n','').split('|')
    # t3: digits of every '|'-separated piece of the review bar text
    t3 = [''.join(re.findall(r'\d',d)) for d in d4]
    d5 = bs.find('div',{'class':'plot_summary'}).find_all('div')
    summary = [x.text.strip().replace('\n','') for x in d5]
    # t4: one entry per <div> of the plot summary; entries 1, 2 and 3 are
    # post-processed, every other entry is appended unchanged
    t4 = []
    for i,n in enumerate(summary):
        if i == 1:
            # text after "Dire...:" split on commas; anything from '(' on is dropped
            names = re.findall(r'Dire.*:(.*)',n)[0].split(',')
            res = [i.split('(')[0].strip() for i in names]
            n = ', '.join(res)
        elif i == 2:
            # text after "Write...:" handled like the directors
            names = re.findall(r'Write.*:(.*)',n)[0].split(',')
            res = [i.split('(')[0].strip() for i in names]
            if len(res) > 1:
                # second name only: cut everything from the last '|' on
                temp = re.findall(r'(.*)\|',res[1])
                if temp: res[1] = temp[0].strip()
            n = ', '.join(res)
        elif i == 3:
            # text between "Stars:" and "|See"
            stars = re.findall(r'Stars:(.*)\|See',n)
            res = [s.strip() for s in stars[0].split(',')]
            n = ', '.join(res)
        t4.append(n)
    return t1 + t2 + t3 + t4

#### Get the data of main page

In [None]:
# Download and parse the chart page, then build the base table
# (columns Name, Score, Year, Url as assembled by getData).
bs = getSoup(web,header)
data = getData(bs)
data.head()

In [None]:
new_url = data['Url'].values[0]
print(new_url)
data['Url'][:5].values

#### Save the data as 'test.csv'

In [None]:
# Header row of test.csv: the movie name followed by the six values of
# getInfo_1 and the nine values of getInfo_2, in that order.
columns = ['Name','Country','Language','Budget','Gross_revenue','Production','Runtime','Rating',
          'Genre','Nums_score','Nums_review','Nums_critic','Story','Director','Writer','Star']

with open("test.csv","w",newline='') as file:
    writer = csv.writer(file)
    # columns_name
    writer.writerow(columns)
    # Walk names and urls together instead of looking the name up by label
    for name,u in zip(data['Name'].values, data['Url'].values):
        page = getSoup(u,header)
        detail = getInfo_1(page) + getInfo_2(page)
        writer.writerow([name] + detail)
        # Short random pause so the movie pages are not requested back to back
        time.sleep(random.uniform(0.5, 1.5))

- Check the test.csv and manually change two odd numbers, then save as data.csv

In [None]:
# Load the manually corrected copy of test.csv.
# NOTE(review): the file is decoded as latin-1, while test.csv is written with
# the platform default encoding — confirm data.csv really is latin-1, otherwise
# names with non-ASCII characters will not match `data` in the merge on 'Name'.
data2 = pd.read_csv('data.csv',encoding='latin-1')
data2.head()

#### Form a new dataset by merging data and data2

In [None]:
# Left join on the movie name: every row of `data` is kept; rows without a
# matching name in `data2` get NaN in the columns coming from `data2`.
result = pd.merge(data,data2,how='left',on=['Name'])
result.head()

#### Save new dataset as movie.csv for Tableau visualization

In [None]:
result.to_csv('movie.csv')

In [None]:
result.describe()

In [None]:
for i,n in enumerate(list(result.columns)):
    print(i,n)

#### Fill NaN values with zero

In [None]:
train = result.fillna(0)

#### Choose all text features and merge them

In [None]:
# Positions of the text-feature columns, given as half-open (start, stop) spans.
index = {
    pos
    for lo, hi in ((4, 16), (17, 20), (21, 25), (28, 36))
    for pos in range(lo, hi)
}
print(index)

In [None]:
# Build one text "sentence" per movie by joining the selected columns.
# The row count comes from `train` itself rather than a hard-coded 250, so the
# list always has exactly one entry per row.
temp = []
for row in range(len(train)):
    values = train.iloc[row,:]
    # keep only the columns listed in `index`; falsy cells (e.g. the 0 put in
    # by fillna) are skipped.
    # NOTE(review): .strip() assumes every kept cell is a string — confirm that
    # no numeric column position is part of `index`.
    words = [v.strip() for col,v in enumerate(values) if col in index and v]
    temp.append(' '.join(words))

In [None]:
len(temp)

In [None]:
# .copy() gives an independent frame, so adding a column does not trigger
# pandas' SettingWithCopyWarning on a slice of `train`.
df1 = train[['Name']].copy()
df1['Text'] = temp
# Remove commas and periods. The replacement returns a new Series, so it has
# to be assigned back — the original calls discarded their result.
df1['Text'] = (df1['Text']
               .str.replace(',','',regex=False)
               .str.replace('.','',regex=False))
df1.head()

- Each movie can be represented by one sentence

In [None]:
for test in df1['Text'][0:5]:
    print(test,'\n')

### Build Recommendation System

#### Load libraries

In [None]:
from gensim.models import Word2Vec
import multiprocessing

#### Prepare corpus and build Word2vec model

In [None]:
corpus = [t.split(' ') for t in temp]
print(corpus)

In [None]:
# Train Word2Vec on the per-movie token lists; min_count=1 keeps every token,
# one worker per CPU core.
# NOTE(review): `size` and `iter` are the gensim 3.x keyword names; gensim 4
# is documented to use `vector_size` and `epochs` instead — confirm against
# the installed version before upgrading.
model = Word2Vec(corpus, size=50, window=5, min_count=1, workers=multiprocessing.cpu_count(),iter=10)

#### Word2Vec model's method

- word vectors

In [None]:
model.wv['Nolan']

- The similarity between two words

In [None]:
# Query through model.wv (the KeyedVectors): the same call directly on the
# model was deprecated in gensim 3 and removed in gensim 4.
model.wv.similarity('Nolan', 'man')

- The ten words most similar to a given word

In [None]:
# Query through model.wv (the KeyedVectors): the same call directly on the
# model was deprecated in gensim 3 and removed in gensim 4.
model.wv.most_similar('Nolan')

- The similarity between two sentences

In [None]:
# Query through model.wv (the KeyedVectors): the same call directly on the
# model was deprecated in gensim 3 and removed in gensim 4.
model.wv.n_similarity(corpus[20],corpus[1])

#### Create similarity matrix

In [None]:
# Pairwise similarity between all movie "sentences".
# The size comes from the corpus instead of a hard-coded 250, and the query
# goes through model.wv because Word2Vec.n_similarity was removed in gensim 4.
n_movies = len(corpus)
ma = [[0]*n_movies for _ in range(n_movies)]
for i in range(n_movies):
    for j in range(n_movies):
        ma[i][j] = model.wv.n_similarity(corpus[i],corpus[j])

In [None]:
import numpy as np

In [None]:
# Wrap the nested list of similarities in a DataFrame (integer row/column labels)
simi_matrix = pd.DataFrame(np.asarray(ma))

In [None]:
simi_matrix.head()

#### Recommend the most similar movie

In [None]:
def recommend(name,nums=8):
    """Return the names of the `nums` movies most similar to `name`.

    name -- movie title exactly as stored in data['Name']
    nums -- number of recommendations wanted (default 8); fewer are returned
            when there are not that many other movies

    The result is ordered from most to least similar. Movies with equal
    similarity keep their chart order, and the queried movie itself is
    excluded by position, so tied scores can no longer produce the same
    title twice.

    Raises ValueError when `name` is not in data['Name'].
    """
    names = list(data['Name'].values)
    index = names.index(name)
    vals = list(simi_matrix[index].values)
    # Rank positions rather than values: looking a value up again with
    # list.index() returns the first holder of that score for every tie.
    ranked = sorted(range(len(vals)), key=vals.__getitem__, reverse=True)
    others = [names[pos] for pos in ranked if pos != index]
    return others[:max(nums, 0)]

In [None]:
recommend('The Godfather')

In [None]:
recommend('Inception')

In [None]:
recommend('The Lord of the Rings: The Return of the King')

### Score prediction

#### Prepare TF-IDF features

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over the per-movie text in `temp`, capped at 100 terms
# by max_features.
TV = TfidfVectorizer(max_features=100)
X = TV.fit_transform(temp)

In [None]:
X

#### Create new train data for machine learning model

In [None]:
# Turn the sparse TF-IDF matrix into a DataFrame and add the movie name so it
# can be joined with the numeric features.
# NOTE(review): the TF-IDF columns presumably get plain integer labels here,
# so later feature rankings show positions, not terms — confirm.
train2 = pd.DataFrame.sparse.from_spmatrix(X)
train2['Name'] = train['Name'].values
train2.head()

In [None]:
train.columns

In [None]:
# .copy() gives an independent frame, so the conversions below do not trigger
# pandas' SettingWithCopyWarning on a slice of `train`.
train3 = train[['Name','Score','Year','Gross_revenue','Runtime','Nums_score','Nums_review','Nums_critic']].copy()
train3['Score'] = train3['Score'].astype(float)
train3['Year'] = train3['Year'].astype('int64')
# to_numeric first: int(str(x)) fails on float cells such as 0.0 ('0.0'),
# which is what the column holds whenever the merge produced NaN before fillna.
train3['Gross_revenue'] = pd.to_numeric(train3['Gross_revenue']).astype('int64')
train3.head()

In [None]:
train3.info()

- Look at the correlation of the numerical variables

In [None]:
# Correlate the numeric columns only: recent pandas versions no longer drop
# string columns such as 'Name' silently and raise instead.
train3.select_dtypes(include='number').corr()

In [None]:
train_data = pd.merge(left=train3,right=train2,on=['Name'])
train_data.head()

#### Split X, y and randomly split them to trainset, testset

In [None]:
cols = [col for col in list(train_data.columns) if col not in ('Name','Score')]

In [None]:
from sklearn.model_selection import ShuffleSplit
# Feature matrix (every column except Name and Score) and target vector.
# Note: this rebinds X, which earlier held the sparse TF-IDF matrix.
X = train_data[cols].values
y = train_data['Score'].values
# A single shuffled split with 30% of the rows held out; seed fixed at 0.
rs = ShuffleSplit(n_splits=1, test_size=0.3, random_state=0)
rs.get_n_splits(X)

In [None]:
# rs is configured with n_splits=1, so take its only split directly
train_index, test_index = next(rs.split(X))
print("TRAIN:", train_index)
print("TEST:", test_index)
X_train, y_train = X[train_index], y[train_index]
X_test, y_test = X[test_index], y[test_index]

#### Build Linear Regression model

In [None]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

- Find the mean squared error

In [None]:
from sklearn.metrics import mean_squared_error
score_predictions = lin_reg.predict(X_train)
lin_mse = mean_squared_error(y_train, score_predictions)
print('Train set: ',lin_mse)
score_predictions = lin_reg.predict(X_test)
lin_mse = mean_squared_error(y_test, score_predictions)
print('Test set: ',lin_mse)

- Find the weights of parameters and R square

In [None]:
# [coefficient, feature] pairs, largest absolute coefficient first
temp = sorted(
    ([weight, column] for weight, column in zip(list(lin_reg.coef_), cols)),
    key=lambda pair: abs(pair[0]),
    reverse=True,
)

In [None]:
temp[:10]

In [None]:
lin_reg.score(X_train, y_train)

In [None]:
lin_reg.score(X_test, y_test)

#### Build Random Forest Regression model

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Fixed seed so the errors and feature importances below are reproducible
# between runs (the train/test split is already seeded with 0).
forest_reg = RandomForestRegressor(random_state=0)
forest_reg.fit(X_train, y_train)

- Find the mean squared error

In [None]:
score_predictions = forest_reg.predict(X_train)
rf_mse = mean_squared_error(y_train, score_predictions)
print('Train set: ',rf_mse)
score_predictions = forest_reg.predict(X_test)
rf_mse = mean_squared_error(y_test, score_predictions)
print('Test set: ',rf_mse)

- Find the important features

In [None]:
forest_reg.feature_importances_

In [None]:
# [importance, feature] pairs, most important first
temp = sorted(
    ([weight, column] for weight, column in zip(list(forest_reg.feature_importances_), cols)),
    key=lambda pair: pair[0],
    reverse=True,
)

In [None]:
temp[:10]

#### Comparison of the two models

In [None]:
# .copy() gives an independent frame, so adding columns does not trigger
# pandas' SettingWithCopyWarning on a slice of `data`.
# NOTE(review): y and X come from train_data; this assumes its rows are in
# the same order and number as `data` — confirm after the merges.
pred_data = data[['Name']].copy()
pred_data['Score'] = y
pred_data['LR_Predict'] = lin_reg.predict(X)
pred_data['RF_Predict'] = forest_reg.predict(X)

In [None]:
pred_data.head(15)

In [None]:
mean_squared_error(y, lin_reg.predict(X))

In [None]:
mean_squared_error(y, forest_reg.predict(X))