In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.metrics import accuracy_score

import matplotlib as plt
import tensorflow as tf

In [2]:
# matplotlib
%matplotlib inline
from matplotlib import style
style.use('fivethirtyeight')


# Pandas and Numpy
import numpy as np
import pandas as pd
import pandas_profiling as pp

# Python SQL toolkit and Object Relational Mapper
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func
import psycopg2

ModuleNotFoundError: No module named 'pandas_profiling'

In [None]:
#import connection string (youll need to create this)
from config import conn

# create the engine
engine = create_engine(conn)
Base = automap_base()

# reflect an existing database into a new model
Base.prepare(engine, reflect=True)

# reflect the tables
#Recession_Indicator = Base.classes.recession_indicator_by_day
Recession_Indicator = Base.classes.us_recession
Historical_Price = Base.classes.historical_price_by_day

In [None]:
# Create function to conver the date
def convDate(elDate):
    return datetime.fromordinal(datetime(1900, 1, 1).toordinal() + elDate - 2)

In [None]:
# Create our session (link) from Python to the DB
session = Session(engine)

In [None]:
#create df from Historical_Price table

results = []

results = session.query(Historical_Price.date, \
                        Historical_Price.open, \
                        Historical_Price.high, \
                        Historical_Price.low, \
                        Historical_Price.close, \
                        Historical_Price.ticker)


price_df = pd.DataFrame(results).set_index(['date'])
price_df.head()

In [None]:
#create df from Recession_Indicator table
results = []

results = session.query(Recession_Indicator.date, \
                        Recession_Indicator.recession_indicator, \
                        Recession_Indicator.country)


#recession_df = pd.DataFrame(results).set_index(['country'])
recession_df = pd.DataFrame(results)

recession_df.head()

## Data Insights

In [None]:
# EDA on Historical Price Table, showing 64,708 observations and 4 characteristics
price_df.shape

In [None]:
# EDA to see label of each column
price_df.columns.values 

In [None]:
price_df.info()

- Data has only object values
- No variable column has null or missing values

## Summary Statistics

In [None]:
# EDA on Historical Price Table, beginning with basic statistics
price_df.describe()

### Key Observations
- dataset comprises of 64,708 observations and 4 chracteriestics
- out of which x is dependent variable and the remaining x are independent variables
- Data needs to be redone.  Not capturing mean, Standard deviation etc

In [None]:
# EDA on Historical Price Table, to get further info on the data showing that no variable column has null values.
price_df.info()

In [None]:
# EDA on Historical Price Table continued with pandas-profiling
pp.ProfileReport(price_df)

In [None]:
# EDA on Historical Recession Indicator, beginning with basic statistics
recession_df.describe()

In [None]:
# EDA on Historical Recession Indicator continued with pandas-profiling
pp.ProfileReport(recession_df)

## EDA Data Visualizations


In [None]:

# create the engine
engine = create_engine(conn)
Base = automap_base()

# reflect an existing database into a new model
Base.prepare(engine, reflect=True)

# reflect the tables
#Recession_Indicator = Base.classes.recession_indicator_by_day
Recession_Indicator = Base.classes.us_recession
Historical_Price = Base.classes.historical_price_by_day

In [None]:
X = recession_df.copy()
X = X.drop("recession_indicator", axis=1)
X = pd.get_dummies(X, columns=["country"])
X.head()

In [None]:
y_cols = recession_df["recession_indicator"] 

y = pd.DataFrame()

y["recession_indicator"] = y_cols
y.head()

In [None]:
y['recession_indicator'].value_counts()

In [None]:
# Create X_train, X_test, y_train, y_test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y_cols, random_state=1)

In [None]:
from sklearn.linear_model import LogisticRegression

# Define the logistic regression model
log_classifier = LogisticRegression(solver="lbfgs", max_iter=200)

# Train the model
log_classifier.fit(X_train,y_train)

# Evaluate the model
y_pred = log_classifier.predict(X_test)
print(f" Logistic regression model accuracy: {accuracy_score(y_test,y_pred):.3f}")



## Check for missing values

In [None]:
sns.heatmap( {INSERT DF HERE} .isnull(),cbar=False,yticklabels=False,cmap = 'viridis')

## Checking Correlation

In [None]:
plt.figure(figsize=(6,4))
sns.heatmap( {INSERT OUR DF HERE} .corr(),cmap='Blues',annot=False)

In [None]:
#Quality correlation matrix
k = #number of variables for heatmap
cols = {OUR DF HERE} .corr().nlargest(k, 'quality')['quality'].index
cm = {""}[cols].corr()
plt.figure(figsize=(10,6))
sns.heatmap(cm, annot=True, cmap = 'viridis')

## Checking for outliers

In [None]:
l = {INSERT DF HERE} .columns.values
number_of_columns=12
number_of_rows = len(l)-1/number_of_columns
plt.figure(figsize=(number_of_columns,5*number_of_rows))
for i in range(0,len(l)):
    plt.subplot(number_of_rows + 1,number_of_columns,i+1)
    sns.set_style('whitegrid')
    sns.boxplot(df[l[i]],color='red',orient='v')
    plt.tight_layout()

## Checking skewness

In [None]:
plt.figure(figsize=(2*number_of_columns,5*number_of_rows))
for i in range(0,len(l)):
    plt.subplot(number_of_rows + 1,number_of_columns,i+1)
    sns.distplot({INSERT OUR DF HERE}[l[i]],kde=True) 