In [None]:
from app import data_imports
from app import data_cleaning
from app import summary_stats   
from app import models
from app import visualisations

# Where is the dataset?

In [None]:
# Relative path to the CSV dataset; later cells check that it exists and load it.
filepath = 'examples/data_science_salaries.csv'

In [None]:
import os

# Guard clause: stop here if the dataset is missing, so that no later cell
# tries to work with a file that is not there.
if not os.path.isfile(filepath):
    raise SystemExit("File not found. Stopping execution.")

# ANSI escape codes: bold green text, then reset to the default style.
print('\033[92;1m' + 'File successfully found!' + '\033[0m')


In [None]:
# Load the dataset located at `filepath`; `df` is the frame every later cell works with.
df = data_imports.read_file(filepath)

# Hand the loaded data to the project's cleaner.
cleaner = data_cleaning.DataCleaner(df)

# NOTE(review): the return value of clean() is not assigned and later cells keep
# using `df` — presumably clean() modifies the data in place; confirm, otherwise
# the rest of the notebook runs on uncleaned data.
cleaner.clean()

# Print the log kept by the cleaner.
print(cleaner.get_log())

In [None]:
# Collect the column names of `df` and format them as one "- name" line each.
cols = df.columns.tolist()
cols_str = '\n'.join(f"- {col}" for col in cols)

# Heading first, then the bulleted list of column names.
print("The columns in the dataset are:\n")
print(cols_str)

# Are you interested in any specific variables?

It is not practical to consider all relationships at the same time. 

**You do not have to specify any variables at all.** In that case, the summary statistics will be computed for all variables at once. Note, however, that the modelling will be limited.

In [None]:
# Name of the column to use as the target, e.g. 'price'.
# Leave it as an empty string if you have no specific target in mind.
target_variable = ""

# Names of the other columns of interest,
# e.g. ['sqft_living', 'bedrooms', 'bathrooms'].
# Leave the list empty if you have no specific variables in mind.
other_variables_of_interest = []

In [None]:
# Resolve the user's selection into a consistent (target, other variables) pair.
#
# Reads:  df, target_variable, other_variables_of_interest
# Writes: target_variable, other_variables_of_interest
# Raises: KeyError if a requested column does not exist in `df`.

# Check the requested names first, so a typo is reported here with a clear
# message instead of surfacing as an opaque KeyError in a later cell.
_requested = ([target_variable] if target_variable else []) + list(other_variables_of_interest)
_unknown = [name for name in _requested if name not in df.columns]
if _unknown:
    raise KeyError(f"Columns not found in the dataset: {_unknown}")

if not target_variable and not other_variables_of_interest:
    # Nothing selected: both values stay empty and later cells use the whole dataset.
    print('\033[91;1m' + "Both target_variable and other_variables_of_interest are empty." + '\033[0m')
elif not target_variable:
    # Only the other variables were given: fall back to the first column as target.
    print('\033[91;1m' + "Target variable has not been specified. Proceeding with the first column in the dataset." + '\033[0m')
    target_variable = df.columns[0]
elif not other_variables_of_interest:
    # Only the target was given: every column becomes a candidate variable.
    other_variables_of_interest = df.columns.tolist()
    print('\033[91;1m' + "Other variables of interest have not been specified. Proceeding with the whole dataset." + '\033[0m')

# The target must not appear among the other variables. Filtering (rather than
# list.remove) drops every occurrence and leaves the list untouched when the
# target is absent.
other_variables_of_interest = [
    name for name in other_variables_of_interest if name != target_variable
]


In [None]:
# Compute summary statistics for the current selection.
#
# The subset of `df` depends on what was selected:
#   - target and other variables: those columns only
#   - target only:                the target column only
#   - nothing selected:           the whole dataset
if target_variable and other_variables_of_interest:
    summary_input = df[[target_variable] + other_variables_of_interest]
elif target_variable:
    summary_input = df[[target_variable]]
else:
    summary_input = df

# Handle the result the same way for every branch. An expression nested in an
# `if` block is not auto-displayed by the notebook, so a returned value has to
# be printed explicitly. If the helper only prints and returns nothing, skip
# the print to avoid a stray "None".
summary_output = summary_stats.summary_statistics(summary_input)
if summary_output is not None:
    print(summary_output)


In [None]:
# Create the project's regression model, then try to split, train and predict.
model = models.RegressionModel()

try:
    # The data is only split when both a target and at least one other variable are set.
    if target_variable and other_variables_of_interest:
        model.split_data(df=df, variables=other_variables_of_interest, target=target_variable)
    # NOTE(review): train() and predict() are called even when split_data() was
    # skipped above — confirm RegressionModel supports that; otherwise the only
    # visible effect is the error message printed in the except branch.
    model.train()
    # NOTE(review): the prediction input is hard-coded as three numeric strings —
    # presumably left over from a three-feature example; verify that it matches
    # the variables passed to split_data(). The return value is not stored or shown.
    model.predict(['6.28', '12.65', '18.45'])
except Exception as e:
    # Any failure is reported as plain text and does not stop the notebook.
    print(e)

In [None]:
# Build the project's visualiser from the full frame and the current variable
# selection, then call its plot_all() method.
visualiser = visualisations.Visualization(
    df,
    variables=other_variables_of_interest,
    target=target_variable,
)

visualiser.plot_all()