In [23]:
from requests import get
import pandas as pd
from bs4 import BeautifulSoup as bs
from fuzzywuzzy import process,fuzz
from sklearn import svm
from sklearn.preprocessing import MinMaxScaler
from sklearn.externals import joblib
from sklearn.metrics import roc_auc_score,mean_absolute_error,r2_score

data= get("https://archive.ics.uci.edu/ml/machine-learning-databases/movies-mld/data/main.html")

# Kaggle DataSet ="https://www.kaggle.com/PromptCloudHQ/imdb-data/data"
kaggle= pd.read_csv("IMDB-Movie-Data.csv")

In [4]:
soup= bs(data.content,"lxml")
table = soup.findAll('table')
movieslist=[]
for i in table:
    x=i.findAll('tr')
    for y in x:
        try:
            movieslist.append( str( y.findAll('td')[1] ).split(":")[1].split("<")[0] )
        except:
            pass
#print(movieslist)

In [6]:
#Kaggle Data Feature Preparation
mean_Revenue=kaggle["Revenue (Millions)"][kaggle["Revenue (Millions)"].notnull()].mean()
kaggle["Revenue (Millions)"].fillna(mean_Revenue,inplace=True)

mean_Metascore = kaggle["Metascore"][kaggle["Metascore"].notnull()].mean()
kaggle["Metascore"].fillna(mean_Metascore,inplace=True)

features=["Votes","Revenue (Millions)","Metascore","Year","Runtime (Minutes)"]
XX=kaggle[features]

y="Rating"
Y=kaggle["Rating"]
movies = kaggle["Title"]

scaler = MinMaxScaler()
scaler.fit(kaggle[["Votes","Revenue (Millions)","Metascore"]])
X=pd.DataFrame(scaler.transform(kaggle[["Votes","Revenue (Millions)","Metascore"]]))
X["Year"],X["Runtime (Minutes)"]=kaggle["Year"],kaggle["Runtime (Minutes)"]
#print(X.isnull().values.any())
X.columns =(features)
print(X.head())

      Votes  Revenue (Millions)  Metascore  Year  Runtime (Minutes)
0  0.422474            0.355669   0.730337  2014                121
1  0.271093            0.135016   0.606742  2012                124
2  0.087923            0.147465   0.573034  2016                117
3  0.033755            0.288609   0.539326  2016                108
4  0.219697            0.347010   0.325843  2016                123


In [7]:
#svm Model
from sklearn import svm
clf=svm.SVR()
clf.fit(X,Y)
predicted=clf.predict(X)
joblib.dump(clf,"svm.pkl")
clf = joblib.load('svm.pkl') 
predicted=clf.predict(X)
print(X.shape)
print((abs(predicted)-abs(Y) ).mean())

(1000, 5)
0.07460245585167598


In [8]:
# LassoLars
from sklearn import linear_model
reg = linear_model.LassoLars(alpha=0,normalize=False)
(reg.fit(X,Y))
joblib.dump(reg,"lasso.pkl")
clf = joblib.load('lasso.pkl') 
predicted=clf.predict(X)
print(clf.coef_)
print((abs(predicted)-abs(Y)).mean())

[  3.35944847e+00  -1.33687549e+00   2.36652642e+00  -2.62357252e-03
   9.03804021e-03]
-2.177813485104707e-15


In [9]:
def movie_rating_SVM(movieName="Step UP"):
    i=process.extractOne(movieName,movies ,scorer=fuzz.token_sort_ratio)
    Xtest=kaggle.iloc[i[2]][features]
    clf = joblib.load('svm.pkl')
    Ypredicted=clf.predict(Xtest.values.reshape(1,5))
    Ytrue=kaggle.iloc[i[2]][y]
    print("SVM",{"mean_absolute_error":mean_absolute_error([Ytrue],Ypredicted),
          "r2_scor": r2_score([Ytrue],Ypredicted)})
    return("%.2f/%.2f "  %(Ypredicted[0] ,Ytrue))


In [10]:
def movie_rating_Lasso(movieName="Step UP"):
    i=process.extractOne(movieName,movies ,scorer=fuzz.token_sort_ratio)
    Xtest=X.iloc[i[2]][features]
    clf = joblib.load('lasso.pkl')
    Ypredicted=clf.predict(Xtest.values.reshape(1,5))
    Ytrue= kaggle.iloc[i[2]][y]
    print("Lasso",{"mean_absolute_error":mean_absolute_error([Ytrue],Ypredicted),
          "r2_scor": r2_score([Ytrue],Ypredicted)})
    return("%.2f/%.2f "  %(Ypredicted[0] ,Ytrue))


In [11]:
# User ->To Enter the Movie Name as Argument
# Example -> movie_rating_SVM("Step UP")
print("SVM:",movie_rating_SVM())
print("Lasso:",movie_rating_Lasso())

SVM {'mean_absolute_error': 0.57526194020021215, 'r2_scor': 0.0}
SVM: 7.08/6.50 
Lasso {'mean_absolute_error': 0.24720526981114688, 'r2_scor': 0.0}
Lasso: 6.25/6.50 


Interpretation => On removing last two features while training SVM model, it will give better accuracy.

In [24]:
#from sklearn.model_selection import GridSearchCV
#clf=svm.SVR()
#params =dict(kernal=['linear', 'rbf', 'sigmoid'],)
#grid = GridSearchCV(clf,params,cv=10,scoring="accuracy",n_jobs=-1)
#grid.fit(X,Y)

