In [None]:
# ! /usr/local/anaconda3/bin/python3 -m pip install missingno scikit-learn matplotlib seaborn numpy pandas tensorflow==2.12 blosc2==2.0.0 cython==0.29.21 FuzzyTM==1.0
# /usr/local/anaconda3/bin/python3 -m pip show tensorflow

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

from sklearn.metrics import r2_score

import tensorflow as tf
from tensorflow.keras import layers

# Notebook-wide pandas display settings: show at most 10 rows of any
# DataFrame and render floats with a single decimal place.
pd.set_option("display.max_rows", 10)
pd.set_option("display.float_format", "{:.1f}".format)
# tf.keras.backend.set_floatx('float32')


In [None]:
# Load the full episode dataset. The path is relative, so it is resolved
# against the kernel's working directory.
DATASET_CSV = 'Comedy_bang_bang_podcast_dataset - full_dataset-v15.csv'
train_df = pd.read_csv(filepath_or_buffer=DATASET_CSV)

# Preview the first five rows
train_df.head(5)

In [None]:
# Number of rows loaded from the CSV
train_df.shape[0]

In [None]:
# Drop columns that are not used as features and turn the publish date into a
# plain number so the later scaling / regression steps can consume it.
UNUSED_COLUMNS = [
    'year_elligible_for_best_of',
    'episode_title',
    'synopsis_and_segments',
    'fandom_wikia_suffix',
    'weight_flat',
    'weight_inverse',
    'weight_linear',
    'best_of_rank',
]
# errors='ignore' keeps this cell re-runnable: on a second execution the
# columns are already gone and a plain drop would raise KeyError.
train_df = train_df.drop(columns=UNUSED_COLUMNS, errors='ignore')

# Convert the date to seconds since the Unix epoch (a float); Timestamp values
# cannot be fed to StandardScaler later on, see:
# https://stackoverflow.com/questions/69282305/how-do-i-solve-this-error-typeerror-float-argument-must-be-a-string-or-a-num
# Only convert while the column is still non-numeric. Re-parsing the already
# converted float seconds would interpret them as nanoseconds and silently
# corrupt the feature.
# NOTE(review): assumes the CSV stores the date as text, not as a number -- confirm.
if not pd.api.types.is_numeric_dtype(train_df['date_episode_published']):
    published_at = pd.to_datetime(train_df['date_episode_published'])
    train_df['date_episode_published'] = published_at.apply(pd.Timestamp.timestamp)

# see if there are any null values in the data set
train_df.isnull().sum()

In [None]:

# Rows flagged 'ignored' or 'prediction' in data_set are excluded from the
# data used below; every other row is kept.
excluded_sets = ['ignored', 'prediction']
keep_row = ~train_df['data_set'].isin(excluded_sets)

# data_set was only needed for the filter above, so it is dropped together
# with the excluded rows.
train_df = train_df.loc[keep_row].drop(columns=['data_set'])

train_df.head()

In [None]:
# Final column layout: numeric features first, then the one-hot episode_type
# indicator columns, and the label (is_on_best_of_boolean) last.
# NOTE(review): 'episode_type_hoilday' looks like a misspelled category value
# that exists in the data next to 'holiday' -- confirm, and consider merging.
new_column_order = [
    'episode_number',
    'date_episode_published',
    'duration_in_minutes',
    'episode_type_anniversary',
    'episode_type_guest_host',
    'episode_type_hoilday',
    'episode_type_holiday',
    'episode_type_live',
    'episode_type_regular',
    'episode_type_special',
    'episode_type_takeover',
    'is_on_best_of_boolean',
]

# 1. one-hot encode episode_type
# 2. drop the raw guests/characters text column
#    TODO: run my other python function to manipulate the
#    guests_and_characters_from_wikipedia_semicolon_delimited column instead
# 3. reorder the columns; reindex creates an all-NaN column for any listed
#    name that does not exist and discards every column that is not listed
train_df = (
    pd.get_dummies(train_df, columns=['episode_type'])
    .drop(columns=['guests_and_characters_from_wikipedia_semicolon_delimited'])
    .reindex(columns=new_column_order)
)

train_df.head()

In [None]:
# NUMBER_OF_SPLITS = 5
# for i, new_df in enumerate(np.array_split(train_df,NUMBER_OF_SPLITS)):
#     with open(f"out{i}.csv","w") as fo:
#             fo.write(new_df.to_csv())

In [None]:
# Visual check for missing values: one column per feature, one row per record
msno.matrix(df=train_df, figsize=(10, 3))

All columns in the plot above are filled to the same height, so we conclude that the dataset has no missing values.

### Let's look at the distribution of the data

In [None]:
# Two stacked panels: all columns side by side (top) and the label alone (bottom).
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(20, 30))

# Top panel: every column on one shared axis, to see whether the features
# live on very different scales and therefore need scaling.
sns.boxplot(data=train_df, orient="v", ax=axes[0])
axes[0].set_title("Distribution of all columns (unscaled)")

# Bottom panel: distribution of the label only.
# The original passed orient="pH", which is not a valid seaborn orientation;
# a vertical box ("v") is what a y-only box plot draws.
sns.boxplot(data=train_df, y="is_on_best_of_boolean", orient="v", ax=axes[1])
axes[1].set_title("Distribution of is_on_best_of_boolean")

In the first figure, many outliers appear as individual dots. The features are on very different
scales, so the boxes of the smaller-valued features are barely visible; the data needs to be scaled.

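The second figure shows the distribution of the target, `is_on_best_of_boolean`.
**TODO:** the original remarks for this figure ("there are 3 outliers", "most of the feature values belong to the wine quality between 5 and 6") were written for a wine-quality dataset and need to be replaced with observations about this dataset.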

### Next, let's look at the correlation between features, so that highly correlated features can be removed

In [None]:
# Pairwise correlation between all columns (features and label).
corr_mat = train_df.corr()

# Boolean mask for the heatmap: cells set to True are hidden. Hide the strict
# upper triangle so each pair appears once (lower triangle + diagonal remain).
# The mask is built from the matrix shape, not from the correlation values --
# deriving it from the values would leave any exactly-zero correlation in the
# upper triangle unmasked.
mask = np.triu(np.ones(corr_mat.shape, dtype=bool), k=1)

fig, ax = plt.subplots(figsize=(20, 10))
# vmax=0.8 saturates the colour scale at 0.8
sns.heatmap(corr_mat, mask=mask, vmax=0.8, square=True, annot=True, ax=ax)
ax.set_title("Correlation between columns")

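**TODO:** the original observation here ("The density and residual sugar have high dependency.") refers to features of a wine-quality dataset that do not exist in this dataset; replace it with the strongly correlated feature pairs visible in the heatmap above.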

### Splitting the data into train and test and training the model

In [None]:
# Features are every column except the last one; the last column is the label.
# With the column order set earlier in the notebook the label is expected to
# be 'is_on_best_of_boolean'.
X, y = train_df.iloc[:, :-1], train_df.iloc[:, -1]

In [None]:
## Prepend a column of ones (the constant term of the linear regression) to the
## feature matrix. X becomes a plain NumPy array here.
## NOTE(review): re-running this cell prepends another column each time. The
## column also looks redundant: LinearRegression fits an intercept by default
## and a constant column carries no information after standard scaling -- confirm.
intercept_column = np.ones((X.shape[0], 1))
X = np.concatenate((intercept_column, X), axis=1)

In [None]:
# Inspect the feature matrix after the constant column was prepended
X

In [None]:
# Hold out part of the data for evaluation (train_test_split's default split
# proportions are used). A fixed random_state makes the split -- and therefore
# every score computed below -- reproducible across kernel restarts.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_STATE)

In [None]:
# (rows, columns) of the training split
X_train.shape

In [None]:
# (rows, columns) of the held-out test split
X_test.shape

### Scaling the data

In [None]:
# Standardise every feature to zero mean / unit variance.
# Inputs must be purely numeric; a Timestamp column fails here with
# "TypeError: float argument must be a string or a real number, not 'Timestamp'".
scaler = StandardScaler()

# Learn the mean/std from the training split only ...
X_train = scaler.fit_transform(X_train)
# ... and re-use those statistics for the test split. Calling fit_transform on
# the test split would refit the scaler on test data, so train and test would
# be scaled differently and test information would leak into preprocessing.
X_test = scaler.transform(X_test)

### Building the model

In [None]:
# Fit an ordinary least-squares model on the training split
# (fit returns the estimator itself, so construction and fitting can be chained)
lr = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out split
y_pred = lr.predict(X_test)

### Use of pipeline for model building

### Evaluating the model

In [None]:
# Coefficient of determination (R^2) of the predictions on the held-out split
r2_score(y_true=y_test, y_pred=y_pred)

### Analyzing the results with visualizations

In [None]:
# Predicted value against the true label for every held-out sample.
# The labels name the actual target of this notebook; the previous
# "Quality" wording did not match the data.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred, c='g')
ax.set_xlabel('True is_on_best_of_boolean')
ax.set_ylabel('Predicted is_on_best_of_boolean')
ax.set_title('Predicted vs. true is_on_best_of_boolean')
plt.show()