In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
from scipy.sparse import hstack, csr_matrix
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from joblib import dump, load

: 

In [None]:
# Read the movie dataset and discard every row that has a missing value in any column.
df = pd.read_csv('movies.csv').dropna()

: 

In [None]:
# Display the frame left after rows with missing values were dropped.
df

: 

In [None]:
# English stop-word list from the NLTK corpus.
# NOTE(review): no later cell in this notebook references `swords`; confirm
# whether stop-word filtering was meant to be applied in clean_text.
swords = stopwords.words('english')

: 

In [None]:
def clean_text(sent):
    """Tokenize ``sent``, keep purely alphabetic tokens, lemmatize and re-join.

    Args:
        sent: Raw text to normalise.

    Returns:
        One space-separated string of the lemmatized tokens. Any token that
        contains a non-alphabetic character is dropped.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in word_tokenize(sent):
        # Skip punctuation, numbers and mixed tokens.
        if not token.isalpha():
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(lemmas)


: 

In [None]:
def _clean_tokens(doc):
    """Return the word tokens of ``doc`` after cleaning with ``clean_text``.

    A callable ``analyzer`` must return an iterable of features. ``clean_text``
    returns one string, and iterating a string yields single characters, so
    passing it directly makes the vectorizer learn a character vocabulary.
    Splitting the cleaned string gives one feature per word instead.
    """
    return clean_text(doc).split()


# Word-level TF-IDF over the cleaned, lemmatized tokens.
tfidf = TfidfVectorizer(analyzer=_clean_tokens)

: 

In [None]:
# Text-valued columns that are combined into the TF-IDF document.
text_columns = [
    'name',
    'rating',
    'genre',
    'director',
    'writer',
    'star',
    'country',
    'company',
]

: 

In [None]:
# Build one document per row by space-joining the text columns, then fit the
# vectorizer on those documents and transform them in the same call.
# NOTE(review): this fit happens before the train/test split, so rows that end
# up in the test set also contribute to the fitted vocabulary and IDF weights.
X_text = tfidf.fit_transform(
    df[text_columns].apply(lambda row: ' '.join(row), axis=1)
)


: 

In [None]:
# Numeric predictor columns; this column order is reused when a single input
# row is assembled for prediction.
X_numerical = df[[
    'score',
    'votes',
    'budget',
    'runtime',
    'year',
]]

: 

In [None]:
# Place the numeric columns to the right of the TF-IDF columns in one sparse matrix.
X_combined = hstack((X_text, csr_matrix(X_numerical)))

: 

In [None]:
# Regression target: log1p of the gross column.
# The transform is computed into `y` without writing back to `df`, so
# re-running this cell cannot apply the log a second time to already
# transformed values.
y = np.log1p(df['gross'])

: 

In [None]:
# Hold out 25% of the rows for evaluation, using a fixed random_state for the split.
X_train, X_test, y_train, y_test = train_test_split(
    X_combined,
    y,
    test_size=0.25,
    random_state=42,
)

: 

In [None]:
# Random forest with 100 trees and a fixed random_state, fitted on the training split.
rf_regressor = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)
rf_regressor.fit(X=X_train, y=y_train)

: 

In [None]:
# Predict the target for the held-out rows with the fitted forest.
y_pred = rf_regressor.predict(X_test)

: 

In [None]:
# Score the held-out predictions against y_test.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Print each metric on its own line as "<label> <value>".
for label, value in (
    ("Mean Squared Error:", mse),
    ("Root Mean Squared Error:", rmse),
    ("R-squared:", r2),
):
    print(label, value)

: 

In [None]:
# Persist the fitted regressor to disk.
# NOTE(review): only the regressor is written; no cell in this notebook saves
# the fitted `tfidf` vectorizer, which is needed to build inputs for the model.
dump(value=rf_regressor, filename='movieModel.pkl')

: 

In [None]:
# One hand-written example record: the text fields first, then the numeric fields.
inputs = dict(
    name='The Avengers',
    rating='PG',
    genre='Action',
    director='John Weadon',
    writer='Kedar Chikane',
    star='Robert Downey Jr.',
    country='United States Of America',
    company='Marvel Studios',
    score=8.7,
    votes=900000,
    budget=300000000,
    runtime=150,
    year=2012,
)

: 

In [None]:
# Build the document the same way the training documents were built: space-join
# the raw text fields and let the fitted vectorizer's analyzer do the cleaning.
# Cleaning each field here first would run clean_text twice (once here, once
# inside the analyzer) and tokenize field by field instead of on the joined
# string, so the features could differ from those seen during fitting.
input_text = ' '.join(inputs[col] for col in text_columns)
input_text_transformed = tfidf.transform([input_text])

# Single-row sparse matrix of numeric values, in the column order of X_numerical.
input_numerical = csr_matrix([[inputs[col] for col in X_numerical.columns]])

# Concatenate text and numerical features for prediction
input_features = hstack([input_text_transformed, input_numerical])

: 

In [None]:
# Predict for the example row, then apply expm1 to the model output before printing.
prediction = rf_regressor.predict(X=input_features)
predicted_gross = np.expm1(prediction)
print("Predicted Gross Value:", predicted_gross)

: 

In [None]:
# Reload the regressor from the 'movieModel.pkl' file.
model = load('movieModel.pkl')

: 

In [None]:
# Repeat the example prediction with the reloaded model, again applying expm1
# to the model output before printing.
predict = model.predict(X=input_features)
predicted_gross1 = np.expm1(predict)
print("Predicted Gross Value:", predicted_gross1)

: 

: 