In [2]:
# General imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Visualization imports
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# Datetime imports
from datetime import datetime

# Stats and autocorrelation tools
import statsmodels.api
from statsmodels.tsa.stattools import acf

In [2]:
# Helper functions used for EDA
def get_info_and_stats(df):
    '''
    get_info_and_stats takes a DataFrame and prints, in this order:
      1. the output of df.info() (column dtypes and non-null counts),
      2. the fraction of null values per column (df.isna().mean()),
      3. a statistical description of the DataFrame (df.describe()).

    The three sections are separated by a line of '#' characters.
    Nothing is returned; the function only prints.
    '''
    separator = '\n#############################################\n'
    # df.info() writes its report itself and returns None, so it must not be
    # passed to print() (that would add a stray "None" line to the output).
    df.info()
    # The leading empty string makes print() emit the separator before the
    # null-fraction section as well.
    print('', df.isna().mean(), df.describe(), sep=separator)

def multi_frequency(df,vars):
    '''
    multi_frequency summarises missing data for a selection of columns.

    df   : the DataFrame to inspect
    vars : list of column names to include in the summary

    Returns a new DataFrame indexed by column name with two columns:
    'num_rows_missing' (count of null rows) and 'pct_rows_missing'
    (that count as a percentage of len(df)).
    '''
    missing_counts = df[vars].isnull().sum()
    missing_share = missing_counts * 100 / len(df)
    summary = pd.concat(
        [missing_counts, missing_share],
        axis=1,
        keys=['num_rows_missing', 'pct_rows_missing'],
    )
    return summary

def value_counts(df):
    '''
    value_counts prints, for every object-dtype column of df, the absolute
    value counts followed by the relative frequencies (null values included
    in the relative frequencies), then a separator line.
    Nothing is returned.
    '''
    divider = '\n###################################\n'
    object_columns = [name for name in df.columns if df[name].dtype == 'O']
    for name in object_columns:
        column = df[name]
        print(column.value_counts())
        print(column.value_counts(normalize=True, dropna=False))
        print(divider)

In [10]:
def plot_continuous_distribution(data: pd.DataFrame = None, column: str = None, height: int = 12):
  """Draw a seaborn distribution plot (histogram with KDE curve) of one column.

  data   : DataFrame holding the values
  column : name of the column plotted on the x axis
  height : figure height passed to seaborn; the aspect ratio is height / 5

  The plot is titled 'Distribution of <column>'. Nothing is returned.
  """
  grid = sns.displot(
      data,
      x=column,
      kde=True,
      height=height,
      aspect=height / 5,
      palette='colorblind',
  )
  # Bind the result so the cell does not echo the FacetGrid repr.
  _ = grid.set(title=f'Distribution of {column}')

def get_unique_values(data, column):
  """Print how many unique values `column` of `data` has, then its value counts.

  Nothing is returned; the function only prints.
  """
  values = data[column]
  counts = values.value_counts()
  distinct = len(values.unique())
  print(f"Column: {column} has {distinct} unique values\n")
  print(counts, '\n')

def plot_categorical_distribution(data: pd.DataFrame = None, column: str = None, height: int = 8, aspect: int = 2):
  """Draw a seaborn count plot of one categorical column.

  data   : DataFrame holding the values
  column : name of the column whose categories are counted on the x axis
  height : figure height passed to seaborn
  aspect : width / height ratio passed to seaborn

  The plot is titled 'Distribution of <column>' and the x tick labels are
  rotated by 90 degrees. Nothing is returned.
  """
  grid = sns.catplot(
      data=data,
      x=column,
      kind='count',
      height=height,
      aspect=aspect,
      palette='colorblind',
  )
  grid = grid.set(title=f'Distribution of {column}')
  grid.set_xticklabels(rotation=90)

# This is the code from the Cognizant project 1st task, kept as a private
# helper so that it no longer shares a name with the heatmap version below.
def _correlation_table(data: pd.DataFrame = None):
  """Return the correlation matrix of `data` as a colour-graded pandas Styler.

  The Styler is returned (not discarded) so that a notebook cell ending with
  this call actually renders the table.
  """
  # Use the `data` argument, not a notebook-level `df`.
  corr = data.corr()
  return corr.style.background_gradient(cmap='coolwarm')


# This is the same code, modified to plot a triangular heatmap
# (without repeating the mirrored results)
def correlation_plot(data: pd.DataFrame = None):
  """Plot the correlation matrix of `data` as a lower-triangular heatmap.

  data : DataFrame whose pairwise column correlations (data.corr()) are drawn.

  The figure is drawn on a new 11x9 matplotlib figure. Nothing is returned.
  """
  # Use the `data` argument, not a notebook-level `df`.
  corr = data.corr()
  fig, ax = plt.subplots(figsize=(11, 9))
  # Mask for the values of the heatmap that repeat themselves: the upper
  # triangle, diagonal included.
  mask = np.triu(np.ones_like(corr, dtype=bool))
  # Creating a diverging colormap
  cmap = sns.diverging_palette(200, 20, as_cmap=True)
  # vmax=.3 is carried over from the seaborn example this function is based on.
  # The heatmap is drawn explicitly on the axes created above.
  sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
              square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

'''
Parts of the code below were used to make the
modification to the function above
'''
#     sns.set_theme(style="white")
#     # Generate a large random dataset
#     rs = np.random.RandomState(33)
#     d = pd.DataFrame(data=rs.normal(size=(100, 26)),
#                      columns=list(ascii_letters[26:]))
#     # Compute the correlation matrix
#     corr = d.corr()
#     # Generate a mask for the upper triangle
#     mask = np.triu(np.ones_like(corr, dtype=bool))
#     # Set up the matplotlib figure
#     f, ax = plt.subplots(figsize=(11, 9))
#     # Generate a custom diverging colormap
#     cmap = sns.diverging_palette(230, 20, as_cmap=True)
#     # Draw the heatmap with the mask and correct aspect ratio
#     sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
#                 square=True, linewidths=.5, cbar_kws={"shrink": .5})


'''
This is an AUTOcorrelation plot function
'''
# (not a correlation plot function)

def auto_corr_plot(series, plot_pacf=False):
    '''
    Plot the autocorrelation of `series` with plotly and show the figure.

    series    : object with a dropna() method (e.g. a pandas Series);
                missing values are dropped before the computation
    plot_pacf : when False (default) the autocorrelation function (ACF) is
                plotted; when True the partial autocorrelation function
                (PACF) is plotted instead

    Each lag is drawn as a vertical stem with a marker, together with a
    shaded band built from the alpha=0.05 confidence interval, centred on
    zero. Nothing is returned; the figure is displayed with fig.show().
    '''
    values = series.dropna()
    if plot_pacf:
        # Only `acf` is imported at the top of the notebook, so `pacf` is
        # brought into scope here.
        from statsmodels.tsa.stattools import pacf
        coefficients, conf_int = pacf(values, alpha=0.05)
        title = 'Partial Autocorrelation (PACF)'
    else:
        coefficients, conf_int = acf(values, alpha=0.05)
        title = 'Autocorrelation (ACF)'

    n_lags = len(coefficients)
    lags = np.arange(n_lags)
    # Subtracting the coefficients centres the confidence band on zero.
    lower_y = conf_int[:, 0] - coefficients
    upper_y = conf_int[:, 1] - coefficients

    # Setting up the plot in plotly and the theme and shape of the plot
    fig = go.Figure()
    # One vertical stem per lag, from zero up to the coefficient.
    for lag in range(n_lags):
        fig.add_scatter(x=(lag, lag), y=(0, coefficients[lag]), mode='lines',
                        line_color='#3f3f3f')
    fig.add_scatter(x=lags, y=coefficients, mode='markers', marker_color='#1f77b4',
                    marker_size=12)
    # Invisible upper bound, then the lower bound filled up to it
    # ('tonexty' fills towards the previously added trace).
    fig.add_scatter(x=lags, y=upper_y, mode='lines', line_color='rgba(255,255,255,0)')
    fig.add_scatter(x=lags, y=lower_y, mode='lines', fillcolor='rgba(32, 146, 230,0.3)',
                    fill='tonexty', line_color='rgba(255,255,255,0)')
    fig.update_traces(showlegend=False)
    # Fit the x axis to the number of lags actually computed instead of a
    # fixed upper bound of 42.
    fig.update_xaxes(range=[-1, n_lags])
    fig.update_yaxes(zerolinecolor='#000000')
    # Setting up the title
    fig.update_layout(title=title)
    fig.show()
    
    
# Function to truncate a timestamp column with no specific frequency (datetime64[ns])
# to the hour: strftime formats each value with the minutes and seconds set to zero,
# and strptime then parses that string back into a datetime object using the same format

# Needs to be transformed from the code source to something more useful for me
def convert_timestamp_to_hourly(data: pd.DataFrame = None, column: str = None):
  """Return a copy of `data` in which `column` is truncated to the hour.

  data   : DataFrame containing the timestamp column
  column : name of the column to truncate; its values must offer strftime()

  Each value is formatted with minutes and seconds fixed at zero and then
  parsed back into a datetime. The input DataFrame is left untouched.
  """
  hour_format = '%Y-%m-%d %H:00:00'
  result = data.copy()
  truncated = []
  for stamp in result[column].tolist():
    truncated.append(datetime.strptime(stamp.strftime(hour_format), hour_format))
  result[column] = truncated
  return result

In [3]:
# Single working version of the hourly-truncation helper, replacing the two
# earlier attempts that shared this name.
def get_hourly_freq(df, column):
    '''
    get_hourly_freq takes a DataFrame and the name of a timestamp column,
    and returns a copy of the DataFrame in which every value of that column
    is truncated to the hour (minutes and seconds set to zero).

    The values of the column must offer strftime() (e.g. pandas Timestamps).
    The input DataFrame is not modified.
    '''
    hourly_format = '%Y-%m-%d %H:00:00'
    result = df.copy()
    # strftime must be called on each element (not on the list), and strptime
    # must receive the formatted string (not the original timestamp).
    result[column] = [
        datetime.strptime(stamp.strftime(hourly_format), hourly_format)
        for stamp in df[column].tolist()
    ]
    return result


In [14]:
# # Plotting with seaborn and rotating the x axis

# sns.set(rc={'figure.figsize':(17,10)})
# chart = sns.histplot(
#     data=df,
#     x='category',
#     palette='colorblind',
#     hue='category',
# )
# var = chart.set_xticklabels(chart.get_xticklabels(), rotation=45)

In [36]:
from category_encoders import *
import pandas as pd
from sklearn.datasets import load_boston

# Prepare some data: load the Boston housing bunch and split it by row position.
# NOTE(review): `load_boston` has been removed from recent scikit-learn
# releases (1.2+) — confirm the installed version still provides it.
bunch = load_boston()
# Targets: rows 0-249 go to the training split, rows 250-505 to the test split
# (a positional split, no shuffling).
y_train = bunch.target[0:250]
y_test = bunch.target[250:506]
# Features: same positional split, wrapped in DataFrames with named columns.
# Each DataFrame gets its own fresh index starting at 0.
X_train = pd.DataFrame(bunch.data[0:250], columns=bunch.feature_names)
X_test = pd.DataFrame(bunch.data[250:506], columns=bunch.feature_names)
# Debug prints kept for reference:
# print(bunch)
# print(y_train)
# print(y_test)
# Show both feature frames (plain-text output).
print(X_train)
print(X_test)

        CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0    0.14030  22.0   5.86   0.0  0.431  6.487  13.0  7.3967  7.0  330.0   
1    0.21409  22.0   5.86   0.0  0.431  6.438   8.9  7.3967  7.0  330.0   
2    0.08221  22.0   5.86   0.0  0.431  6.957   6.8  8.9067  7.0  330.0   
3    0.36894  22.0   5.86   0.0  0.431  8.259   8.4  8.9067  7.0  330.0   
4    0.04819  80.0   3.64   0.0  0.392  6.108  32.0  9.2203  1.0  315.0   
..       ...   ...    ...   ...    ...    ...   ...     ...  ...    ...   
251  0.06263   0.0  11.93   0.0  0.573  6.593  69.1  2.4786  1.0  273.0   
252  0.04527   0.0  11.93   0.0  0.573  6.120  76.7  2.2875  1.0  273.0   
253  0.06076   0.0  11.93   0.0  0.573  6.976  91.0  2.1675  1.0  273.0   
254  0.10959   0.0  11.93   0.0  0.573  6.794  89.3  2.3889  1.0  273.0   
255  0.04741   0.0  11.93   0.0  0.573  6.030  80.8  2.5050  1.0  273.0   

     PTRATIO       B  LSTAT  
0       19.1  396.28   5.90  
1       19.1  377.07   3.59  
2       1

In [32]:
# Use target encoding to encode two features treated as categorical: CHAS and RAD.
# The encoder is only configured here; it is fitted in the next cell.
# `TargetEncoder` comes from the `from category_encoders import *` star import.
enc = TargetEncoder(cols=['CHAS', 'RAD'])



In [33]:
# Transform the datasets.
# The encoder is fitted on the training features together with the training
# targets; the test features are then only transformed with the fitted encoder
# (y_test is never passed to it).
training_numeric_dataset = enc.fit_transform(X_train, y_train)
testing_numeric_dataset = enc.transform(X_test)
# Show both encoded frames (plain-text output).
print(training_numeric_dataset)
print(testing_numeric_dataset)

        CRIM    ZN  INDUS       CHAS    NOX     RM   AGE     DIS        RAD  \
0    0.00632  18.0   2.31  24.248261  0.538  6.575  65.2  4.0900  27.615762   
1    0.02731   0.0   7.07  24.248261  0.469  6.421  78.9  4.9671  26.833333   
2    0.02729   0.0   7.07  24.248261  0.469  7.185  61.1  4.9671  26.833333   
3    0.03237   0.0   2.18  24.248261  0.458  6.998  45.8  6.0622  28.533333   
4    0.06905   0.0   2.18  24.248261  0.458  7.147  54.2  6.0622  28.533333   
..       ...   ...    ...        ...    ...    ...   ...     ...        ...   
245  0.19133  22.0   5.86  24.248261  0.431  5.605  70.2  7.9549  21.949066   
246  0.33983  22.0   5.86  24.248261  0.431  6.108  34.9  8.0555  21.949066   
247  0.19657  22.0   5.86  24.248261  0.431  6.226  79.2  8.0555  21.949066   
248  0.16439  22.0   5.86  24.248261  0.431  6.433  49.1  7.8265  21.949066   
249  0.19073  22.0   5.86  24.248261  0.431  6.718  17.5  7.8265  21.949066   

       TAX  PTRATIO       B  LSTAT  
0    296.0    