In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer  
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [24]:
# Load the cleaned movie dataset.
# NOTE(review): this is an absolute path on one machine; the notebook will not
# run elsewhere without editing it.
cleaned_path = '/Users/xueyilu/Desktop/Project-4/Cleaned_data/cleaned_data.csv'
df = pd.read_csv(cleaned_path)

# Mean IMDb score per director, one row per distinct director_name.
# No minimum number of rows per director is applied, so a director with a
# single row is ranked on that one score.
director_scores = df.groupby('director_name', as_index=False)['imdb_score'].mean()

# Rank directors from highest to lowest mean score and show the ten best.
sorted_directors = director_scores.sort_values('imdb_score', ascending=False)
print(sorted_directors.head(10))

         director_name  imdb_score
1084    John Blanchard         9.5
1620  Mitchell Altieri         8.7
2011  Sadyk Sher-Niyaz         8.7
299          Cary Bell         8.7
1606      Mike Mayhall         8.6
315    Charles Chaplin         8.6
1417      Majid Majidi         8.5
1979        Ron Fricke         8.5
428    Damien Chazelle         8.5
1835        Raja Menon         8.5


In [30]:
# Fit a linear regression that predicts `imdb_score` from four numeric
# popularity/engagement columns, then report RMSE on a 20% hold-out split.

# The feature list is defined once and reused for both the column selection and
# the ColumnTransformer so the two can never drift apart.
numeric_features = [
    'director_facebook_likes',
    'num_critic_for_reviews',
    'num_voted_users',
    'movie_facebook_likes',
]
X = df[numeric_features]
y = df['imdb_score']

# Numeric preprocessing: fill missing values with the column mean, then
# standardize each column.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Apply the numeric preprocessing to the selected feature columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
    ])

# Full pipeline: preprocessing followed by ordinary least squares. Keeping both
# steps in one Pipeline means the imputer/scaler are fitted on the training
# split only when `model.fit` is called below.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

model.fit(X_train, y_train)

# Predict on the hold-out rows and evaluate.
y_pred = model.predict(X_test)

# RMSE is computed as sqrt(MSE) explicitly: the `squared=False` keyword of
# `mean_squared_error` was deprecated in scikit-learn 1.4 and removed in 1.6,
# so this form works on both old and new releases.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print(f"RMSE: {rmse}")

# Side-by-side table of actual vs. predicted scores. The original row labels
# carried over from `y_test` are dropped so rows are numbered 0..n-1.
comparison_df = pd.DataFrame({'Actual IMDb Score': y_test, 'Predicted IMDb Score': y_pred})
comparison_df = comparison_df.reset_index(drop=True)
print(comparison_df.head(30))


RMSE: 1.0191965228400772
    Actual IMDb Score  Predicted IMDb Score
0                 7.1              6.441213
1                 7.0              6.697981
2                 5.1              6.148519
3                 7.4              6.175872
4                 7.1              6.329049
5                 8.5              7.957976
6                 6.1              6.143567
7                 4.4              6.083896
8                 5.8              6.479220
9                 5.5              6.138016
10                7.8              6.127276
11                5.5              6.678912
12                6.2              6.275918
13                7.0              6.502596
14                5.8              6.308380
15                6.0              6.213442
16                7.1              6.402549
17                7.8              6.469566
18                6.5              6.208959
19                7.4              6.169278
20                5.4              6.347446
21     