## Welcome to our Jupyter Notebook

# Jupyter Cells

**Markdown** and **Code** 

In [None]:
# Imports 
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [6, 4]

from IPython.display import Image

from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score

import torch
import torch.nn as nn
import torch.optim as optim

### Our Data
Courtesy of **Kaggle**! 

    A machine learning and data science community. 

I hope you like movies.

### Loading in data

Popular file type: comma-separated value (**.csv**) files.

Use the pandas read_csv function to load the .csv file as a dataframe.

    df = pd.read_csv('foo.csv')


In [None]:
# Load the file "imdb_movies.csv" into a DataFrame named df.
# The path is relative, so the CSV is looked up from the notebook's working directory.

df = pd.read_csv('imdb_movies.csv')

In [None]:
# Look at the top ten rows and see how many of the movies you recognize.
# head() shows only 5 rows by default, so ask for 10 explicitly.

df.head(10)

### Columns and Index

In [None]:
# Report the DataFrame's dimensions, then show its column labels and its row index.
n_rows, n_cols = df.shape
print("Number of columns: ", n_cols)
print("Number of rows: ", n_rows)

print("\n", df.columns, "\n\n")
print(df.index,"\n")

#### Exploring Columns

    df['column_name']

or specific values in columns:

    df.loc['index', 'column_name']

In [None]:
# Print out the name of the row whose index label is 30.
# NOTE(review): .loc selects by label; with a default 0-based index this is the
# 31st row rather than the 30th — confirm which one was intended.
df.loc[30, 'names']

#### Slicing 
What if we want to inspect **multiple rows** or **multiple columns**?


In [None]:
# Print out the names of the first 10 and the last 10 rows.
# head/tail select by position, so this works whatever the index labels are.

print(df["names"].head(10))
print(df["names"].tail(10))

#### Adding Columns

In [None]:
# Express budget and revenue in millions of dollars so the numbers are easier to read.
for raw_col, scaled_col in (("budget_x", "budget_m"), ("revenue", "revenue_m")):
    df[scaled_col] = df[raw_col]/1000000

In [None]:
df["budget_m"]

#### Mathematical Operations

    sum(): Compute the sum of values.
    mean(): Compute the mean of values.
    median(): Compute the median of values.
    std(): Compute the standard deviation.
    var(): Compute the variance.
    min(): Find the minimum value.
    max(): Find the maximum value.
    count(): Count the number of non-NA/null observations.


In [None]:
# Total budget and total revenue over all films (both in million $)
total_budget = df["budget_m"].sum()
print(total_budget)

total_rev = df["revenue_m"].sum()
print(total_rev)

# The value printed is revenue divided by budget, so label it that way
print("\n", "Ratio of revenue to budget: " ,total_rev/total_budget)

In [None]:
df["revenue_m"].min()

In [None]:
df["revenue_m"].max()

In [None]:
df.nlargest(4, "revenue_m")[["names", "revenue_m"]]

In [None]:
Image(filename="assets/winner_400.png")

In [None]:
# Distribution of film budgets; assigning the return value keeps plt.hist's
# (counts, edges, patches) tuple out of the cell output.
bins = plt.hist(df["budget_m"], bins=100)
plt.title("Histogram of Film Budgets")
plt.xlabel("Budget (million $)")
plt.ylabel("# of films")
plt.show()

In [None]:
# Distribution of film revenues; assigning the return value keeps plt.hist's
# (counts, edges, patches) tuple out of the cell output.
bins = plt.hist(df["revenue_m"], bins=100)
plt.title("Histogram of Film Revenues")
plt.xlabel("Revenue (million $)")
plt.ylabel("# of films")
plt.show()

#### Feature engineering

    profit = revenue - budget



In [None]:
# Profit (million $) is what is left of the revenue once the budget is subtracted.

df["profit_m"] = df["revenue_m"].sub(df["budget_m"])


In [None]:
df["profit_m"]

In [None]:
df.nsmallest(4, "profit_m")[["names", "profit_m"]]

#### Masking
Creating masks is a convenient way to filter through data. 

By creating a condition and running over the data, we can create a mask of True/False statements.

Operators in python are:

    greater-than: >
    less-than: <
    equal: ==
    not equal: !=
    greater-than equal: >=
    less-than equal: <=

Let's see how they work in action!

In [None]:
# numpy is already imported as np in the imports cell at the top of the notebook.

# Create an example array
ex = np.array([0,1,2,3,4,5,6])
print("This is the example array: ", ex)

# Set a condition, greater-than or equal to three
mask = ex >= 3
print("This is the mask: ", mask)

# Use the mask to filter through the results
print("These are the values in the array which pass the condition: ", ex[mask])

Pandas is built to handle masks as well!

In [None]:
# Search for films that profited over 350 million dollars.
# NOTE(review): this comment previously said 250 while the code uses 350 — confirm the intended threshold.

# define the mask (True where profit_m exceeds the threshold)
mask = df["profit_m"] > 350

# We can use the count method to see how many films managed to earn that much
df.loc[mask,"profit_m"].count()

In [None]:
# Count the films that lost money: every True in the mask is one film with profit below zero.
mask_neg_profit = df["profit_m"] < 0
mask_neg_profit.sum()

### Avoiding multiple masks: Query

Built in method for handling multiple conditions

    df.query()

In [None]:
# The number of films that cost over 100 million dollars to produce and still made a loss.

df.query('budget_m > 100 and profit_m < 0')["profit_m"].count()

### Datetime
Time is an important factor in determining trends.

Pandas is ready for that!

#### pd.to_datetime()

The pd.to_datetime() function turns strings in a recognizable date format into Timestamp objects. 

     pd.to_datetime("date_column")

This will output a series containing datetime objects.

In [None]:
df["date_x"][0]

In [None]:
# Use the pd.to_datetime function to define a new column in the DataFrame called "date_time" containing Timestamp objects.
# NOTE(review): no explicit format is passed, so pandas infers it from the strings — confirm day/month order is parsed as intended.

df["date_time"] = pd.to_datetime(df["date_x"])
# Check the element type of the new column
type(df["date_time"][0])

In [None]:
df.loc[0,"date_time"].month

In [None]:
# Pull each calendar component out of every Timestamp, one list per component
timestamps = df["date_time"]
year = [ts.year for ts in timestamps]
month = [ts.month for ts in timestamps]
day = [ts.day for ts in timestamps]

# Attach the components to df as new columns
df["year"] = year
df["month"] = month
df["day"] = day

In [None]:
# Distribution of release days within the month.
# Bin edges sit on the half-integers 0.5 .. 31.5 so that each day 1..31 gets its own bar.
bins = plt.hist(df["day"], bins=np.arange(1, 33) - 0.5)
plt.title("Histogram of Film Release Days")
plt.xlabel("Day of the month")
plt.ylabel("# of films")
plt.show()

In [None]:
# Mean budget for 2022 and 2023: build one year mask at a time and average the matching budgets

for target_year, label in ((2022, "mean budget 2022: "), (2023, "mean budget 2023")):
    mask = df["year"] == target_year
    print(label, df.loc[mask, "budget_m"].mean())


### Mean budgets for every year?


In [None]:
len(df["year"].unique())

### Too many masks

There must be a better way to run these queries and calculations. 

#### Groupby

        df.groupby("year")

The **unique values** of the column are the new index.

For categorical data.


        df.groupby("year").agg('mean')

        df.groupby("year")["budget_m"].agg('mean')

        df.groupby("year")["budget_m"].agg(['sum','count','mean'])


In [None]:
# Group the films by release year, then average the budget inside each group

by_year = df.groupby("year")
yearly_budgets = by_year["budget_m"]
budget_by_year = yearly_budgets.mean()
budget_by_year

In [None]:
# Same grouping as before, but compute several statistics at once.
# budget_by_year is rebound here to a DataFrame with one column per statistic
# ('std', 'mean', 'count', 'sum'); later cells index it by those column names.
by_year = df.groupby("year")
budget_by_year = by_year["budget_m"].agg(['std','mean','count','sum'])
budget_by_year

In [None]:
# Line plot of the mean budget per year; the x values come from the Series index (the "year" groups).
plt.plot(budget_by_year["mean"])
plt.title("Film Industry Budgets by Year")
plt.xlabel("Year")
plt.ylabel("Budget (million $)")
plt.show()

## Additional Feature Engineering: 

### apply
    
    df[column_name].apply(function_name)

In [None]:
# Function to assign grades based on score
def assign_grade(score):
    """Map a numeric score to a letter grade.

    The score is compared against descending cut-offs (97, 93, ..., 60, 0) and
    the label of the first cut-off it reaches is returned.

    Parameters
    ----------
    score : float
        Numeric score.

    Returns
    -------
    str or None
        One of '+A', 'A', '-A', ..., '-D', 'F'. Scores below every cut-off
        (e.g. negative values) get 'F'; a missing score (NaN/None) gives None.
    """
    # Every comparison with NaN is False, so a missing score would match no
    # cut-off at all; report it as missing instead of inventing a grade.
    if pd.isna(score):
        return None

    thresh = [97, 93, 90, 87, 83, 80, 77, 73, 70, 67, 63, 60, 0]
    grades = ['+A','A','-A','+B','B','-B','+C','C','-C','+D', 'D', '-D', 'F']

    for cutoff, grade in zip(thresh, grades):
        if score >= cutoff:
            return grade

    # Below the lowest cut-off: fall back to the lowest grade rather than
    # running past the end of the lists.
    return grades[-1]

In [None]:
# Use the apply function on the score column to create a new grade column:
# assign_grade is called once per score and the results form the "grades" column.
df["grades"] = df["score"].apply(assign_grade)
df["grades"]

In [None]:
# Function to assign numeric grade codes based on score
def assign_grade_code(score):
    """Map a numeric score to an integer grade code.

    Uses the same descending cut-offs as the letter grades (97, 93, ..., 60, 0);
    the code is the 1-based position of the first cut-off the score reaches,
    so 1 is the best grade and 13 the worst.

    Parameters
    ----------
    score : float
        Numeric score.

    Returns
    -------
    int or float
        A code from 1 to 13. Scores below every cut-off (e.g. negative values)
        get 13; a missing score (NaN/None) gives NaN.
    """
    # Every comparison with NaN is False, so a missing score would match no
    # cut-off; keep it missing instead of indexing past the end.
    if pd.isna(score):
        return float("nan")

    thresh = [97, 93, 90, 87, 83, 80, 77, 73, 70, 67, 63, 60, 0]

    for code, cutoff in enumerate(thresh, start=1):
        if score >= cutoff:
            return code

    # Below the lowest cut-off: fall back to the worst code.
    return len(thresh)

In [None]:
# Numeric counterpart of the "grades" column: assign_grade_code is applied to every score.
df["grade_code"] = df["score"].apply(assign_grade_code)
df["grade_code"]

In [None]:
# Make your own groupby object - and sorry that the genre column is difficult to work with
grades = ['+A','A','-A','+B','B','-B','+C','C','-C','+D', 'D', '-D', 'F']
by_grade = df.groupby("grades")
rev_by_grade = by_grade["revenue_m"].agg("mean")

# groupby sorts the grade labels alphabetically; reindex puts them back in
# best-to-worst order. A grade that no film received becomes NaN (a gap in
# the line) instead of raising a KeyError.
organized_data = rev_by_grade.reindex(grades).tolist()

plt.plot(grades, organized_data)
plt.title("Mean Revenue")
plt.xlabel("Grade")
plt.ylabel("Revenue (million $)")
plt.show()

In [None]:
# You can use a lambda function within apply - let's use that to show a more elegant way to replace the for-loop we used on the dates.

# Lambdas are small anonymous functions. They work as: lambda input : output. apply passes each date in as the input.
df["year"] = df["date_time"].apply(lambda x : x.year)
df["month"] = df["date_time"].apply(lambda x : x.month)

# This was rushed, so don't worry if you want to leave it for now.

In [None]:
df["genre"][0]

In [None]:
# Here is some code to make a new genre column which is a bit easier to work with
def genre_to_list(genre):
    """Split a comma-separated genre string into a list of genre names.

    Parameters
    ----------
    genre : str or any
        Genre names separated by commas, e.g. 'Drama,\xa0Science Fiction'.
        Anything that is not a string (such as NaN for a missing value) is
        treated as "no genre".

    Returns
    -------
    list of str
        The genre names with surrounding whitespace removed. Multi-word genres
        stay intact. ['-'] is returned when there is no genre.
    """
    if not isinstance(genre, str):
        return ["-"]

    # Split on the comma only; str.strip() also removes the non-breaking space
    # (\xa0) after each comma. Splitting on whitespace would cut
    # 'Science Fiction' into two separate "genres".
    names = [part.strip() for part in genre.split(',')]
    names = [name for name in names if name]
    return names if names else ["-"]

# Apply genre_to_list to every entry of the genre column and store the result in a new column.
df["genre_list"] = df["genre"].apply(genre_to_list)
df["genre_list"]

In [None]:
# Plotting script for line plot

def plot_y_by_x(x, y, std=None, x_label="Year", y_label="Mean Budget", unit='(million $)'):
    """Draw y against x as a marked line, optionally with a +/- std band.

    Parameters
    ----------
    x, y : array-like
        Values for the horizontal and vertical axes.
    std : array-like, optional
        When given, the region between y - std and y + std is shaded.
    x_label, y_label : str
        Axis names; also combined into the title '<y_label> by <x_label>'.
    unit : str
        Text appended to the y-axis label.
    """
    # Seaborn styling and a fresh figure for every call
    sns.set(style="whitegrid")
    plt.figure(figsize=(8, 5))

    # The line itself
    sns.lineplot(x=x, y=y, marker='o', label=y_label, color='b')

    # Optional shaded band of one standard deviation either side of the line
    draw_band = std is not None
    if draw_band:
        lower_edge = y - std
        upper_edge = y + std
        plt.fill_between(x, lower_edge, upper_edge, color='b', alpha=0.2, label='Std Dev')

    # Labels, title and legend
    heading = f'{y_label} by {x_label}'
    y_axis_text = f"{y_label} {unit}"
    plt.title(heading)
    plt.xlabel(x_label)
    plt.ylabel(y_axis_text)
    plt.legend()

    plt.show()

In [None]:
# Let's first plot the average budget of a movie by year. We can also include the standard deviation as a shaded band.
y = budget_by_year["mean"]
std = budget_by_year["std"]
x = budget_by_year.index
plot_y_by_x(x, y, std=std)

In [None]:
# Total budget per year. NOTE: x is not defined in this cell; it is expected to
# already hold budget_by_year.index from an earlier cell.
y = budget_by_year["sum"]
plot_y_by_x(x, y, y_label="Sum Budget", unit="(million $)")

In [None]:
# Can you use the plotting script to plot the number of films released per year?
# The "count" column holds the number of non-missing budgets per year.
# NOTE: x is not defined in this cell; it is expected to already exist from an earlier cell.
y = budget_by_year["count"]
# unit is blanked out because a count has no "(million $)" unit
plot_y_by_x(x, y, std=None, x_label="year", y_label="Films Released", unit = " ")

### ML packages

Scikit-learn has some great ML algorithms premade!

#### One hot encoding
The genre category seems like such a fun feature to inspect.  

Many algorithms require a preprocessing step to encode categorical data into an algorithm-readable format. 

A popular method for doing this is one-hot encoding.

    Convert categorical data into a binary array where each index in the array corresponds to one of the unique values of the feature in question. 

    colors = [
            'blue',
            'red',
            'yellow',
            'yellow',
            'red'
            ]
            
    colors_one_hot = [
            [1,0,0],
            [0,1,0],
            [0,0,1],
            [0,0,1],
            [0,1,0]
            ]

In our case, where we have multiple genres for a single movie, we can add their one-hot vectors together to tell the model to consider attributes from both. 

This method is imperfect because it does not factor in the potential coupling terms which likely emerge when combining genres together. 

In [None]:
# One-hot encode the release month: one 0/1 column per distinct month.
# get_dummies works on the column directly. A missing value becomes an all-zero
# row instead of being dropped, so row i still corresponds to row i of df.
month_one_hot = pd.get_dummies(df['month'], dtype=int).reset_index(drop=True)
month_one_hot

In [None]:
# One-hot encode the release year (one 0/1 column per distinct year).
# Missing values become all-zero rows, keeping the rows aligned with df.
year_one_hot = pd.get_dummies(df['year'], dtype=int).reset_index(drop=True)

In [None]:
# One-hot encode the country (one 0/1 column per distinct country).
# Missing values become all-zero rows, keeping the rows aligned with df.
country_one_hot = pd.get_dummies(df['country'], dtype=int).reset_index(drop=True)

In [None]:
# One-hot encode the letter grade (one 0/1 column per distinct grade).
# Missing values become all-zero rows, keeping the rows aligned with df.
grades_one_hot = pd.get_dummies(df['grades'], dtype=int).reset_index(drop=True)

In [None]:
# Multi-hot encode the genres: explode gives one row per (film, genre) pair while
# repeating the film's index label; the indicator rows are then summed back
# per film, so a film with several genres has a 1 in each of their columns.
genre_one_hot = pd.get_dummies(df['genre_list'].explode()).groupby(level=0).sum()

In [None]:
def lang_to_list(lang):
    """Split a comma-separated language string into a list of language names.

    Parameters
    ----------
    lang : str or any
        Language names separated by commas, e.g. ' Spanish, Castilian'.
        Anything that is not a string (such as NaN for a missing value) is
        treated as "no language".

    Returns
    -------
    list of str
        The names with surrounding whitespace removed. Multi-word names stay
        intact. ['-'] is returned when there is no language, so indexing the
        result with [0] is always safe.
    """
    if not isinstance(lang, str):
        return ["-"]

    # Split on the comma only. Splitting on whitespace would leave a trailing
    # comma on the first name ('Spanish,') and cut multi-word names apart.
    names = [part.strip() for part in lang.split(',')]
    names = [name for name in names if name]
    return names if names else ["-"]

# Keep only the first language listed for each film ...
df["lang_list"] = df["orig_lang"].apply(lang_to_list).apply(lambda x : x[0])

# ... and one-hot encode it (one 0/1 column per distinct language).
# get_dummies works on the column directly and keeps one output row per film.
lang_one_hot = pd.get_dummies(df['lang_list'], dtype=int).reset_index(drop=True)

In [None]:
# Standardise the four numeric features column by column with StandardScaler.
X_standard = StandardScaler().fit_transform(df[['revenue_m','budget_m','profit_m','score']])

# Put the scaled numeric features and all one-hot blocks side by side (axis=1).
# ignore_index=True replaces the column labels with 0..n-1, so the original
# column names are not kept in df_analysis.
df_analysis = pd.concat([
                        pd.DataFrame(X_standard),
                        month_one_hot, 
                        year_one_hot,
                        genre_one_hot,
                        grades_one_hot,
                        lang_one_hot,
                        country_one_hot], 
                        
                        axis=1,
                        ignore_index=True
                       )

In [None]:
df_analysis

### Outliers

Outliers can sometimes disrupt a model's ability to pick up on trends.

In [None]:
# Numeric columns that are NOT continuous measurements and must not take part
# in outlier detection: calendar parts, the raw budget (kept in scaled form as
# budget_m), the score, and grade_code, which is just the score in binned form.
non_continuous = {'year', 'month', 'day', 'grade_code', 'budget_x', 'score'}

numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()
# Filtering by membership (rather than list.remove) does not fail if one of
# the derived columns has not been created yet.
continuous_features = [col for col in numeric_columns if col not in non_continuous]

X_scaled = StandardScaler().fit_transform(df[continuous_features])

# After standardisation each value is a z-score; take its magnitude
z_scores = np.abs(X_scaled)

# Define a threshold for outliers (number of standard deviations from the mean)
threshold = 3

# Create a mask to identify non-outliers: True for rows where every feature is within the threshold
outlier_mask = (z_scores < threshold).all(axis=1)

In [None]:
# Convert the analysis frame to a plain array and keep only the rows flagged as non-outliers.
analysis_matrix = np.array(df_analysis)
X_cleaned = analysis_matrix[outlier_mask]

In [None]:
# Perplexity passed to t-SNE below (also shown in the plot title)
perp = 50

# t-SNE: embed the cleaned feature matrix in 2 dimensions (random_state fixed for repeatable output)
tsne = TSNE(n_components=2, perplexity=perp, random_state=42)
tsne_results = tsne.fit_transform(X_cleaned)

# Visualization: one point per row of X_cleaned
plt.figure(figsize=(10, 6))
scatter = plt.scatter(tsne_results[:, 0], tsne_results[:, 1], alpha=0.7, s=4)
plt.title(f"t-SNE Visualization, Perplexity: {perp}")
plt.xlabel("t-SNE Component 1")
plt.ylabel("t-SNE Component 2")
#plt.colorbar(scatter, label='Target Labels') 
plt.show()

In [None]:
# Convert text categories to numeric labels
label_encoder = LabelEncoder()
label = label_encoder.fit_transform(df[outlier_mask]['country'])

# Visualization
plt.figure(figsize=(10, 6))
scatter = plt.scatter(tsne_results[:, 0], tsne_results[:, 1], c=label, alpha=0.7, s=4)
plt.title(f"t-SNE Visualization, Country")
plt.xlabel("t-SNE Component 1")
plt.ylabel("t-SNE Component 2")


# Create colorbar
cbar = plt.colorbar(scatter)
cbar.set_ticks(range(len(label_encoder.classes_)))  # Set the ticks to the number of classes
cbar.set_ticklabels(label_encoder.classes_, size=6)  # Set the labels to the original category names

plt.show()

In [None]:
label

In [None]:
# grade_code is already numeric (1 = '+A' ... 13 = 'F'), so it can colour the
# points directly. Re-encoding it with LabelEncoder would renumber the codes
# 0..k-1 over only the grades that are present, and the colourbar labels below
# would then be attached to the wrong colours whenever a grade is missing.
grade_names = ['+A','A','-A','+B','B','-B','+C','C','-C','+D', 'D', '-D', 'F']
label = df.loc[outlier_mask, 'grade_code'].to_numpy()

# Visualization: pin the colour scale to the full 1..13 code range so the
# colourbar ticks always match the grade names.
plt.figure(figsize=(10, 6))
scatter = plt.scatter(tsne_results[:, 0], tsne_results[:, 1], c=label, alpha=0.7, s=4,
                      vmin=1, vmax=len(grade_names))
plt.title("t-SNE Visualization, Grades")
plt.xlabel("t-SNE Component 1")
plt.ylabel("t-SNE Component 2")


# Create colorbar
cbar = plt.colorbar(scatter)
cbar.set_ticks(range(1, len(grade_names) + 1))  # One tick per grade code
cbar.set_ticklabels(grade_names, size=7)  # Label each code with its letter grade

plt.show()

### SVM

In [None]:
# End-to-end SVR baseline: build features, split, scale, trim the target, fit, evaluate.

# Features: budget plus the one-hot encoded categorical columns. Target: revenue.
X = df[['budget_m']]
y = df['revenue_m']

X = pd.concat([X, 
               month_one_hot, 
                year_one_hot,
                genre_one_hot,
                lang_one_hot,
                country_one_hot], 
                
                axis=1,
                ignore_index=True
                       )

# First, split into training (80%) and a held-out remainder (20%).
# random_state is fixed so the split, and therefore the scores, are reproducible.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2,
                                                    shuffle=True, random_state=42)

# Then, split the held-out remainder evenly into test and validation sets
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.5, 
                                                shuffle=True, random_state=42)

# Scale the features to [0, 1]; the scaler is fitted on the training set only
scaler_x = MinMaxScaler()
X_train = scaler_x.fit_transform(X_train)
X_test = scaler_x.transform(X_test)
X_val = scaler_x.transform(X_val)

# Keep only samples whose revenue lies strictly between the 10th and 90th
# percentile of the TRAINING revenues. The same bounds are applied to every split.
# np.asarray turns each mask into a plain positional boolean array for indexing
# the scaled (numpy) feature matrices.
y_train_lower, y_train_upper = np.quantile(y_train, [0.1, 0.9])
mask_train = np.asarray((y_train > y_train_lower) & (y_train < y_train_upper))
X_train = X_train[mask_train, :]
y_train = y_train[mask_train]

mask_test = np.asarray((y_test > y_train_lower) & (y_test < y_train_upper))
X_test = X_test[mask_test, :]
y_test = y_test[mask_test]

mask_val = np.asarray((y_val > y_train_lower) & (y_val < y_train_upper))
X_val = X_val[mask_val, :]
y_val = y_val[mask_val]

# Scale the target to [0, 1] as well (fitted on training targets only).
# The scaler needs a 2-D column, so expand, transform and squeeze back to 1-D.
scaler_y = MinMaxScaler()
y_train = scaler_y.fit_transform(np.expand_dims(y_train, axis=1))
y_train = np.squeeze(y_train)

y_test = scaler_y.transform(np.expand_dims(y_test, axis=1))
y_test = np.squeeze(y_test)

y_val = scaler_y.transform(np.expand_dims(y_val, axis=1))
y_val = np.squeeze(y_val)

# Fit a support vector regressor with an RBF kernel
model = SVR(kernel='rbf')
model.fit(X_train, y_train)

# Evaluate on the validation set
y_val_pred = model.predict(X_val)
val_mse = mean_squared_error(y_val, y_val_pred)
val_r2 = r2_score(y_val, y_val_pred)

print(f'Validation Mean Squared Error: {val_mse:.4f}')
print(f'Validation R^2 Score: {val_r2:.4f}')

# Evaluate on the test set
y_test_pred = model.predict(X_test)

test_mse = mean_squared_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

print(f'Test Mean Squared Error: {test_mse:.4f}')
print(f'Test R^2 Score: {test_r2:.4f}')

In [None]:
# Rebuild the feature matrix step by step: budget plus the one-hot blocks; the target is revenue.
budget = df[['budget_m']]
y = df['revenue_m']

# Concatenate column-wise; ignore_index=True relabels the columns 0..n-1.
X = pd.concat([
                budget, 
                month_one_hot, 
                year_one_hot,
                genre_one_hot,
                lang_one_hot,
                country_one_hot], 
                
                axis=1,
                ignore_index=True
                       )

In [None]:
# First, split into training (80%) and a held-out remainder (20%).
# random_state is fixed so every run produces the same split and the scores
# further down can be reproduced.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2,
                                                    shuffle=True, random_state=42)

# Then, split the held-out remainder evenly into test and validation sets
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.5, 
                                                shuffle=True, random_state=42)

In [None]:
# Scale every feature with MinMaxScaler. The scaler is fitted on the training
# set only and then re-used, unchanged, on the test and validation sets.
scaler_x = MinMaxScaler()
X_train = scaler_x.fit_transform(X_train)
X_test = scaler_x.transform(X_test)
X_val = scaler_x.transform(X_val)

In [None]:
# Quantile window on the TRAINING revenues; [0.0, 1.0] means "keep the full range".
# The comparisons are inclusive so that the samples sitting exactly on the
# bounds (e.g. every film tied at the minimum revenue) are kept. With strict
# inequalities a 0%-100% window would still silently drop them.
y_train_lower, y_train_upper = np.quantile(y_train, [0.0, 1.0])

# np.asarray turns each mask into a plain positional boolean array for indexing
# the scaled (numpy) feature matrices.
mask_train = np.asarray((y_train >= y_train_lower) & (y_train <= y_train_upper))
X_train = X_train[mask_train, :]
y_train = y_train[mask_train]

# Test/validation samples outside the training range are still removed
mask_test = np.asarray((y_test >= y_train_lower) & (y_test <= y_train_upper))
X_test = X_test[mask_test, :]
y_test = y_test[mask_test]

mask_val = np.asarray((y_val >= y_train_lower) & (y_val <= y_train_upper))
X_val = X_val[mask_val, :]
y_val = y_val[mask_val]

# Scale the target to [0, 1] (fitted on training targets only).
# The scaler needs a 2-D column, so expand, transform and squeeze back to 1-D.
scaler_y = MinMaxScaler()
y_train = scaler_y.fit_transform(np.expand_dims(y_train, axis=1))
y_train = np.squeeze(y_train)

y_test = scaler_y.transform(np.expand_dims(y_test, axis=1))
y_test = np.squeeze(y_test)

y_val = scaler_y.transform(np.expand_dims(y_val, axis=1))
y_val = np.squeeze(y_val)

In [None]:
# Fit a support vector regressor with an RBF kernel on the scaled training data.
model = SVR(kernel='rbf')
model.fit(X_train, y_train)

In [None]:
# Evaluate on the validation set. Targets were min-max scaled, so the MSE is in scaled units, not million $.
y_val_pred = model.predict(X_val)
val_mse = mean_squared_error(y_val, y_val_pred)
val_r2 = r2_score(y_val, y_val_pred)

print(f'Validation Mean Squared Error: {val_mse:.4f}')
print(f'Validation R^2 Score: {val_r2:.4f}')

# Evaluate on the test set
y_test_pred = model.predict(X_test)

test_mse = mean_squared_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

print(f'Test Mean Squared Error: {test_mse:.4f}')
print(f'Test R^2 Score: {test_r2:.4f}')

In [None]:
# np.corrcoef returns a 2x2 correlation matrix; element [0, 1] is the correlation between the two inputs.
print("Pearson Correlation Coefficient, Validation: ", np.corrcoef(y_val, y_val_pred)[0,1])

In [None]:
# Fit a straight line (degree-1 polynomial) through the (true, predicted) test pairs
m, b = np.polyfit(y_test, y_test_pred, 1)
# 30 evenly spaced x values spanning the range of the true values, used to draw the fitted line
x = np.linspace(min(y_test), max(y_test), 30)

plt.scatter(y_test, y_test_pred, s=6)
plt.xlabel('True Values')
plt.ylabel('Predictions')
plt.title('True Values vs Predictions')
plt.plot(x, m*x+b, color='red')  # Least-squares best-fit line (not the identity line y = x)
plt.show()

In [None]:
# Undo the target scaling so that errors are expressed in the original revenue_m units.
# inverse_transform expects a 2-D column, hence the expand_dims.
y_val_inv = scaler_y.inverse_transform(np.expand_dims(y_val, axis=1))
y_val_inv_pred = scaler_y.inverse_transform(np.expand_dims(y_val_pred, axis=1))

# Root-mean-squared error on the validation set
mse_inv = mean_squared_error(y_val_inv, y_val_inv_pred)
print(np.sqrt(mse_inv))

# Mean revenue over the whole DataFrame, shown for comparison with the RMSE above
df["revenue_m"].mean()

In [None]:
# The inverse-transformed arrays have shape (n, 1); flatten them once so every
# step below works on plain 1-D arrays.
y_true_flat = np.squeeze(y_val_inv)
y_pred_flat = np.squeeze(y_val_inv_pred)

# Fit a straight line (degree-1 polynomial) through the (true, predicted) pairs
m, b = np.polyfit(y_true_flat, y_pred_flat, 1)
# 30 evenly spaced x values spanning the range of the true values
x = np.linspace(y_true_flat.min(), y_true_flat.max(), 30)

plt.scatter(y_true_flat, y_pred_flat, s=6)
plt.xlabel('True Values (million $)')
plt.ylabel('Predictions (million $)')
plt.title('True Values vs Predictions')
plt.plot(x, m*x+b, color='red')  # Least-squares best-fit line (not the identity line y = x)
plt.show()

### Neural Network

Neural networks can also work really well for regression estimation tasks.

We can use the Pytorch library to build our model! 

In [None]:
# Convert every split to float32 PyTorch tensors; targets become (n, 1) columns.
def _to_float_tensor(values, as_column=False):
    """Return values as a float32 tensor, reshaped to a single column when as_column is True."""
    tensor = torch.tensor(values, dtype=torch.float32)
    return tensor.view(-1, 1) if as_column else tensor

X_train_tensor = _to_float_tensor(X_train)
y_train_tensor = _to_float_tensor(y_train, as_column=True)

X_val_tensor = _to_float_tensor(X_val)
y_val_tensor = _to_float_tensor(y_val, as_column=True)

X_test_tensor = _to_float_tensor(X_test)
y_test_tensor = _to_float_tensor(y_test, as_column=True)

In [None]:
# Define the DNN model
class SimpleDNN(nn.Module):
    """Fully connected regression network: in_features -> 128 -> 64 -> 64 -> 32 -> 1.

    Every hidden layer is followed by a ReLU; the output layer is linear.

    Parameters
    ----------
    in_features : int, optional
        Number of input features. When omitted, it is taken from the second
        dimension of the notebook-level ``X_train_tensor``, so ``SimpleDNN()``
        keeps working as before.
    """

    def __init__(self, in_features=None):
        super().__init__()
        if in_features is None:
            # Backward-compatible default: infer the width from the training tensor
            in_features = X_train_tensor.shape[1]
        self.fc1 = nn.Linear(in_features, 128)  # Input layer to first hidden layer
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 64)
        self.fc4 = nn.Linear(64, 32)
        self.fc5 = nn.Linear(32, 1)             # Last hidden layer to output

    def forward(self, x):
        """Run a batch x through the network and return the output of the final linear layer."""
        x = torch.relu(self.fc1(x))  # Activation for first layer
        x = torch.relu(self.fc2(x))  # Activation for second layer
        x = torch.relu(self.fc3(x))  # Activation for third layer
        x = torch.relu(self.fc4(x))  # Activation for fourth layer
        x = self.fc5(x)              # Output layer (no activation: regression)
        return x


In [None]:
# Fix the seed before the model is created so the weight initialisation, and
# therefore the whole training run, is reproducible.
torch.manual_seed(42)

model = SimpleDNN()
criterion = nn.MSELoss()  # Mean Squared Error Loss
optimizer = optim.Adam(model.parameters(), lr=0.0001)

num_epochs = 500

# Training mode only needs to be set once: nothing inside the loop switches it off.
model.train()
for epoch in range(num_epochs):
    # One full-batch gradient step per epoch
    optimizer.zero_grad()  # Clear the gradients
    outputs = model(X_train_tensor)  # Forward pass
    loss = criterion(outputs, y_train_tensor)  # Calculate loss
    loss.backward()  # Backpropagation
    optimizer.step()  # Update weights

    # Print training progress every 100 epochs
    if (epoch + 1) % 100 == 0:
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')

In [None]:
# Score the trained network on the validation split (gradients are not needed for evaluation)
model.eval()
with torch.no_grad():
    val_outputs = model(X_val_tensor)
    val_loss = criterion(val_outputs, y_val_tensor)

print(f'Validation Loss (MSE): {val_loss.item():.4f}')

# Same for the test split
with torch.no_grad():
    test_outputs = model(X_test_tensor)
    test_loss = criterion(test_outputs, y_test_tensor)

print(f'Test Loss (MSE): {test_loss.item():.4f}')

# Flatten the (n, 1) tensors into 1-D numpy arrays once, then reuse them for both metrics
val_true = y_val_tensor.numpy()[:, 0]
val_pred = val_outputs.numpy()[:, 0]
print("\nPearson correlation coefficient, Validation: ", np.corrcoef(val_true, val_pred)[0,1])
val_r2 = r2_score(val_true, val_pred)
print(f'Validation R^2 Score: {val_r2:.4f}')


test_true = y_test_tensor.numpy()[:, 0]
test_pred = test_outputs.numpy()[:, 0]
print("\nPearson correlation coefficient, Test: ", np.corrcoef(test_true, test_pred)[0,1])
test_r2 = r2_score(test_true, test_pred)
print(f'Test R^2 Score: {test_r2:.4f}')

#### Great!

That's all for now - I really hope you enjoyed this tour of Pandas and that you found the exercises useful. 

There is still so much left unexplored. If you can dream it - you can probably do it with Pandas. 