In [1]:
import pandas as pd

# Location of the raw movie-metadata CSV export.
# NOTE(review): this is an absolute, machine-specific path; adjust it before
# running the notebook on another machine.
path = r'/Users/yourgrandpaprogenynamesmcky/Developer/BIG_DATA_AI/HW2/Data/HW3_movie_metadata-661316-17373502460835.csv'

# Parse the CSV into the working DataFrame used by the next cell.
df = pd.read_csv(filepath_or_buffer=path)

In [4]:
# Replace every missing value with 0, rebinding `df` rather than mutating it
# in place so that re-running the cell stays predictable.
# NOTE(review): this blanket fill also writes the integer 0 into text columns
# (e.g. director_name). The later analysis cell re-reads the CSV into `data`
# and imputes per dtype, so it does not consume this frame.
df = df.fillna(0)

In [10]:
import io

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from matplotlib.backends.backend_pdf import PdfPages

# End-to-end analysis: load the movie metadata CSV, write exploratory summaries
# and plots to a PDF, impute/encode the columns, pick the features most
# correlated with the target, fit a linear regression and report its quality.

# Load data
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
path = r'/Users/yourgrandpaprogenynamesmcky/Developer/BIG_DATA_AI/HW2/Data/HW3_movie_metadata-661316-17373502460835.csv'
data = pd.read_csv(path)

# Prepare PDF for saving outputs
output_pdf = 'HW3_results.pdf'
with PdfPages(output_pdf) as pdf:
    # Explore data
    data_head = data.head()
    data_tail = data.tail()
    # DataFrame.info() writes to a stream and returns None, so capture its text
    # in a buffer; otherwise the PDF would literally contain "None".
    info_buffer = io.StringIO()
    data.info(buf=info_buffer)
    data_info = info_buffer.getvalue()
    print(data_info)  # keep the summary visible in the notebook output as before
    data_description = data.describe()
    missing_values = data.isna().sum()

    # Add text summaries to the PDF, one page per section so each fits on the page.
    summary_sections = {
        "Dataset Head": data_head,
        "Dataset Tail": data_tail,
        "Dataset Info": data_info,
        "Dataset Description": data_description,
        "Missing Values": missing_values,
    }
    for section_title, section_body in summary_sections.items():
        fig, ax = plt.subplots(figsize=(12, 8))
        ax.axis('off')
        # va='top' anchors the text block's top edge at the top of the axes; the
        # default baseline alignment would push the block above the axes.
        # A monospace font keeps the tabular reprs column-aligned.
        ax.text(0, 1, f"{section_title}:\n{section_body}", fontsize=8,
                va='top', family='monospace', transform=ax.transAxes)
        pdf.savefig(fig)
        plt.close(fig)

    # Plot value distributions
    numerical_features = data.select_dtypes(include=[np.number]).columns
    for feature in numerical_features:
        fig, ax = plt.subplots()
        sns.histplot(data[feature], kde=True, ax=ax)
        ax.set_title(f"Distribution of {feature}")
        pdf.savefig(fig)  # Save each plot to the PDF
        plt.close(fig)    # Release the figure so the loop does not accumulate them

    # Preprocess data
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the selected column, which is chained assignment and may not update `data`.
    # NOTE(review): the target column is numeric, so its missing values are
    # median-imputed here too; consider dropping those rows instead.
    for col in numerical_features:
        data[col] = data[col].fillna(data[col].median())

    categorical_features = data.select_dtypes(include=['object']).columns
    for col in categorical_features:
        # Missing labels become their own "Unknown" category before encoding.
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col].fillna("Unknown"))

    # Correlation matrix (every column is numeric after label encoding)
    correlation_matrix = data.corr()
    fig, ax = plt.subplots(figsize=(12, 8))
    sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", ax=ax)
    ax.set_title("Correlation Matrix")
    pdf.savefig(fig)
    plt.close(fig)

    # Select features for training: keep columns whose absolute correlation
    # with the target exceeds 0.5, excluding the target itself.
    target = 'num_critic_for_reviews'
    target_correlations = correlation_matrix[target].drop(target)
    correlated_features = target_correlations[target_correlations.abs() > 0.5].index.tolist()
    print("Selected features:", correlated_features)
    if not correlated_features:
        # Fail with a clear message rather than an opaque error from scikit-learn.
        raise ValueError(f"No feature has |correlation| > 0.5 with '{target}'")

    # Prepare train and test sets
    X = data[correlated_features]
    y = data[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Scale numerical features (fit on the training split only)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Model training
    lr_model = LinearRegression()
    lr_model.fit(X_train_scaled, y_train)

    y_pred = lr_model.predict(X_test_scaled)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Add results to the PDF
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.axis('off')
    ax.text(0, 1, f"Mean Squared Error: {mse}\nR2 Score: {r2}\n", fontsize=10,
            va='top', transform=ax.transAxes)
    pdf.savefig(fig)
    plt.close(fig)

    # Coefficients (on standardized features), largest first
    coefficients = pd.DataFrame({"Feature": correlated_features, "Coefficient": lr_model.coef_})
    coefficients = coefficients.sort_values(by="Coefficient", ascending=False)
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.axis('off')
    ax.table(cellText=coefficients.values, colLabels=coefficients.columns, loc='center')
    ax.set_title("Regression Coefficients")
    pdf.savefig(fig)
    plt.close(fig)

    # Visualize predictions vs actual against the y = x reference line
    data_plot = pd.DataFrame({"Actual": y_test, "Predicted": y_pred})
    actual_range = [data_plot["Actual"].min(), data_plot["Actual"].max()]

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.scatterplot(data=data_plot, x="Actual", y="Predicted", label="Predictions", color="blue", ax=ax)
    ax.plot(actual_range, actual_range,
            color="red", label="Perfect Fit (y=x)", linestyle="--")
    ax.set_title("Actual vs Predicted with Regression Line")
    ax.set_xlabel("Actual Number of Critical Reviews")
    ax.set_ylabel("Predicted Number of Critical Reviews")
    ax.legend()
    ax.grid(True)

    # Save the last graph to the PDF
    pdf.savefig(fig)
    plt.close(fig)

print(f"Results saved to {output_pdf}")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5043 entries, 0 to 5042
Data columns (total 27 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   director_name              4939 non-null   object 
 1   num_critic_for_reviews     4993 non-null   float64
 2   duration                   5028 non-null   float64
 3   director_facebook_likes    4939 non-null   float64
 4   actor_3_facebook_likes     5020 non-null   float64
 5   actor_2_name               5030 non-null   object 
 6   actor_1_facebook_likes     5036 non-null   float64
 7   gross                      4159 non-null   float64
 8   genres                     5043 non-null   object 
 9   actor_1_name               5036 non-null   object 
 10  movie_title                5043 non-null   object 
 11  num_voted_users            5043 non-null   int64  
 12  cast_total_facebook_likes  5043 non-null   int64  
 13  actor_3_name               5020 non-null   objec

  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):


Selected features: ['num_voted_users', 'num_user_for_reviews', 'movie_facebook_likes']
Results saved to HW3_results.pdf
