In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from keras.models import Sequential
from keras.layers import Dense, Dropout
import time

# Timing function
def timeit(method):
    def timed(*args, **kwargs):
        start_time = time.time()
        result = method(*args, **kwargs)
        end_time = time.time()
        print(f"{method.__name__} took {end_time - start_time:.2f} seconds")
        return result
    return timed

# Define neural network model
def create_model(input_dim, optimizer='adam', dropout_rate=0.0, neurons=32):
    model = Sequential()
    model.add(Dense(neurons, input_dim=input_dim, activation='relu'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(neurons, activation='relu'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation='linear'))
    model.compile(loss='mean_squared_error', optimizer=optimizer)
    return model

@timeit
def main():
    print("Loading data...")
    df = pd.read_csv('movies.csv')

    label = "Popularity"

    # Identify non-numeric columns
    non_numeric_columns = [col for col in df.columns if df[col].dtype == 'object']

    # Drop non-numeric columns
    df.drop(columns=non_numeric_columns, inplace=True)

    # Handle missing values
    df = df.fillna(df.mean(numeric_only=True))

    # Splitting data
    X = df.drop(columns=[label])
    y = df[label]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Standardize the data
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    input_dim = X_train.shape[1]

    # Neural Network Model
    nn_model = create_model(input_dim)
    nn_model.fit(X_train, y_train, epochs=50, batch_size=10, verbose=1)
    nn_y_pred = nn_model.predict(X_test)
    nn_mse = mean_squared_error(y_test, nn_y_pred)
    nn_r2 = r2_score(y_test, nn_y_pred)

    # Linear Regression Model
    lr_model = LinearRegression()
    lr_model.fit(X_train, y_train)
    lr_y_pred = lr_model.predict(X_test)
    lr_mse = mean_squared_error(y_test, lr_y_pred)
    lr_r2 = r2_score(y_test, lr_y_pred)
    lr_cv_mse = -cross_val_score(lr_model, X, y, cv=3, scoring='neg_mean_squared_error').mean()

    # Ridge Regression Model
    ridge_model = Ridge()
    ridge_model.fit(X_train, y_train)
    ridge_y_pred = ridge_model.predict(X_test)
    ridge_mse = mean_squared_error(y_test, ridge_y_pred)
    ridge_r2 = r2_score(y_test, ridge_y_pred)
    ridge_cv_mse = -cross_val_score(ridge_model, X, y, cv=3, scoring='neg_mean_squared_error').mean()

    # Random Forest Regressor Model
    rf_model = RandomForestRegressor()
    rf_model.fit(X_train, y_train)
    rf_y_pred = rf_model.predict(X_test)
    rf_mse = mean_squared_error(y_test, rf_y_pred)
    rf_r2 = r2_score(y_test, rf_y_pred)
    rf_cv_mse = -cross_val_score(rf_model, X, y, cv=3, scoring='neg_mean_squared_error').mean()

    # Gradient Boosting Regressor Model
    gb_model = GradientBoostingRegressor()
    gb_model.fit(X_train, y_train)
    gb_y_pred = gb_model.predict(X_test)
    gb_mse = mean_squared_error(y_test, gb_y_pred)
    gb_r2 = r2_score(y_test, gb_y_pred)
    gb_cv_mse = -cross_val_score(gb_model, X, y, cv=3, scoring='neg_mean_squared_error').mean()

    # Display results
    print("\nNeural Network Model Performance:")
    print(f"Mean Squared Error: {nn_mse}")
    print(f"R^2 Score: {nn_r2}")

    print("\nLinear Regression Model Performance:")
    print(f"Mean Squared Error: {lr_mse}")
    print(f"R^2 Score: {lr_r2}")
    print(f"Cross-Validated MSE: {lr_cv_mse}")

    print("\nRidge Regression Model Performance:")
    print(f"Mean Squared Error: {ridge_mse}")
    print(f"R^2 Score: {ridge_r2}")
    print(f"Cross-Validated MSE: {ridge_cv_mse}")

    print("\nRandom Forest Regressor Model Performance:")
    print(f"Mean Squared Error: {rf_mse}")
    print(f"R^2 Score: {rf_r2}")
    print(f"Cross-Validated MSE: {rf_cv_mse}")

    print("\nGradient Boosting Regressor Model Performance:")
    print(f"Mean Squared Error: {gb_mse}")
    print(f"R^2 Score: {gb_r2}")
    print(f"Cross-Validated MSE: {gb_cv_mse}")

# Run the main function
main()


Loading data...
Epoch 1/50


2024-05-28 21:54:35.980053: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-05-28 21:54:36.087028: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp_10.


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


2024-05-28 21:56:13.669421: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.



Neural Network Model Performance:
Mean Squared Error: 28790.501735217047
R^2 Score: 0.14728668460600347

Linear Regression Model Performance:
Mean Squared Error: 29040.81043138801
R^2 Score: 0.13987307437625163
Cross-Validated MSE: 40508.19243933149

Ridge Regression Model Performance:
Mean Squared Error: 29047.195070714948
R^2 Score: 0.1396839749636003
Cross-Validated MSE: 40209.454990570775

Random Forest Regressor Model Performance:
Mean Squared Error: 29906.386665586586
R^2 Score: 0.11423655066513916
Cross-Validated MSE: 44971.98553370274

Gradient Boosting Regressor Model Performance:
Mean Squared Error: 28674.166439351913
R^2 Score: 0.15073228818550466
Cross-Validated MSE: 46511.196538084514
main took 104.45 seconds
