In [68]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Input, Concatenate
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [69]:
film = pd.read_csv('movies.csv')
film.head()

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,runtime
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,19000000.0,46998772.0,Warner Bros.,146.0
1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980 (United States)",5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,4500000.0,58853106.0,Columbia Pictures,104.0
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,"June 20, 1980 (United States)",8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,18000000.0,538375067.0,Lucasfilm,124.0
3,Airplane!,PG,Comedy,1980,"July 2, 1980 (United States)",7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,3500000.0,83453539.0,Paramount Pictures,88.0
4,Caddyshack,R,Comedy,1980,"July 25, 1980 (United States)",7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,6000000.0,39846344.0,Orion Pictures,98.0


In [70]:
film.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7668 entries, 0 to 7667
Data columns (total 15 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   name      7668 non-null   object 
 1   rating    7591 non-null   object 
 2   genre     7668 non-null   object 
 3   year      7668 non-null   int64  
 4   released  7666 non-null   object 
 5   score     7665 non-null   float64
 6   votes     7665 non-null   float64
 7   director  7668 non-null   object 
 8   writer    7665 non-null   object 
 9   star      7667 non-null   object 
 10  country   7665 non-null   object 
 11  budget    5497 non-null   float64
 12  gross     7479 non-null   float64
 13  company   7651 non-null   object 
 14  runtime   7664 non-null   float64
dtypes: float64(5), int64(1), object(9)
memory usage: 898.7+ KB


In [98]:
film = film.dropna()

In [99]:
sentences = film['name'].astype(str).tolist()

target = film['gross']

In [100]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
total_words = len(tokenizer.word_index) + 1

In [101]:
tokenized_sentences = tokenizer.texts_to_sequences(sentences)
max_sequence_len = max([len(x) for x in tokenized_sentences])
padded_sequences = pad_sequences(tokenized_sentences, maxlen=max_sequence_len, padding='post')


In [102]:
print(f"Total kata: {total_words}")
print(f"Panjang urutan maksimum: {max_sequence_len}")
print("Contoh urutan token:", tokenized_sentences[:5])

Total kata: 4935
Panjang urutan maksimum: 14
Contoh urutan token: [[1, 1729], [1, 70, 1008], [28, 95, 212, 183, 1, 515, 682, 64], [1009], [1010]]


In [103]:
numerical_features = film[['budget', 'runtime']]  
scaler = StandardScaler()
scaled_numerical_features = scaler.fit_transform(numerical_features)


In [120]:
print("numerik yang diskalakan:", scaled_numerical_features[:5])

numerik yang diskalakan: [[-0.4091739   2.08938966]
 [-0.757987   -0.22925545]
 [-0.43322997  0.87486127]
 [-0.78204308 -1.11254882]
 [-0.72190289 -0.56049046]]


In [105]:
X_train_text, X_test_text, X_train_num, X_test_num, y_train, y_test = train_test_split(
    padded_sequences, scaled_numerical_features, target, test_size=0.2, random_state=42)


In [108]:
text_input = Input(shape=(max_sequence_len,), name='text_input')
embedding = Embedding(total_words, 64, input_length=max_sequence_len)(text_input)
rnn_output = SimpleRNN(64)(embedding)

In [109]:

numerical_input = Input(shape=(scaled_numerical_features.shape[1],), name='numerical_input')

In [110]:
combined = Concatenate()([rnn_output, numerical_input])

In [111]:
dense1 = Dense(64, activation='relu')(combined)
output = Dense(1, activation='linear')(dense1)

In [112]:
model = Model(inputs=[text_input, numerical_input], outputs=output)
model.compile(optimizer=Adam(learning_rate=0.01), loss='mean_squared_error', metrics=['mean_absolute_error'])
model.summary()

In [113]:
history = model.fit(
    {'text_input': X_train_text, 'numerical_input': X_train_num},
    y_train,
    epochs=30,
    validation_split=0.2,
    verbose=1
)

Epoch 1/30
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 15ms/step - loss: 46772670594809856.0000 - mean_absolute_error: 102416400.0000 - val_loss: 35608538764017664.0000 - val_mean_absolute_error: 97456384.0000
Epoch 2/30
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - loss: 42499783725678592.0000 - mean_absolute_error: 100216536.0000 - val_loss: 35605678315798528.0000 - val_mean_absolute_error: 97442024.0000
Epoch 3/30
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - loss: 41799600977215488.0000 - mean_absolute_error: 103311208.0000 - val_loss: 35600743398375424.0000 - val_mean_absolute_error: 97417464.0000
Epoch 4/30
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - loss: 43920271144386560.0000 - mean_absolute_error: 103102992.0000 - val_loss: 35593789846323200.0000 - val_mean_absolute_error: 97382864.0000
Epoch 5/30
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s

In [114]:
y_pred = model.predict({'text_input': X_test_text, 'numerical_input': X_test_num})

[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step


In [115]:
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')


Mean Absolute Error: 105387734.00910138
Mean Squared Error: 5.4001446923328296e+16


In [119]:
predictions = pd.DataFrame({'Prediksi': y_pred.flatten(), 'Sebenarnya': y_test})
print(predictions.head(10))

         Prediksi   Sebenarnya
1460  3678880.500   96759512.0
3309  3715499.000   93107289.0
878   3637246.000    7433663.0
1446  3621965.000   40508994.0
3664  3807868.500  546388108.0
303   1848352.000    8000000.0
7269  1854843.375   49562710.0
1255  3644433.000   12793213.0
3970  3665413.500   30919415.0
593   3638153.000   19265302.0
