In [1]:
from app import data_imports
from app import data_cleaning
from app import summary_stats   
from app import models
from app import visualisations

# Where is the dataset?

In [2]:
# Relative path of the CSV dataset that the rest of the notebook analyses.
filepath = "examples/happiness_2019.csv"

In [3]:
# SYSTEM CODE
# Stop the run immediately when `filepath` does not point to an existing file.

import os

if not os.path.isfile(filepath):
    raise SystemExit("File not found. Stopping execution.")

# ANSI escape codes: bold bright-green text, followed by a reset.
print('\033[92;1m' + 'File successfully found!' + '\033[0m')


[92;1mFile successfully found![0m


In [4]:
# SYSTEM CODE
# Read the dataset, run the cleaner on it, and print the cleaner's log.

df = data_imports.read_file(filepath)
cleaner = data_cleaning.DataCleaner(df)

# NOTE(review): the return value of clean() is discarded and later cells keep
# using `df`; presumably DataCleaner modifies `df` in place — confirm.
cleaner.clean()

cleaning_log = cleaner.get_log()
print(cleaning_log)

na_handling: There are 0 missing values in the DataFrame.
na_handling: Removed 0 rows with missing values.
na_handling: Changed from (156, 9) to (156, 9). Replaced 0 missing values using strategy 'remove'.
remove_duplicates: Changed from (156, 9) to (156, 9). Removed 0 duplicate rows.


In [5]:
# SYSTEM CODE
# Print one bullet per column in the form "- <name> (<dtype>)".

cols = df.columns.tolist()

column_lines = []
for col in cols:
    column_lines.append(f"- {col} ({df[col].dtype})")
cols_str = '\n'.join(column_lines)

print("The columns in the dataset are:\n")
print(cols_str)

The columns in the dataset are:

- Overall rank (int64)
- Country or region (object)
- Score (float64)
- GDP per capita (float64)
- Social support (float64)
- Healthy life expectancy (float64)
- Freedom to make life choices (float64)
- Generosity (float64)
- Perceptions of corruption (float64)


# Are you interested in any specific variables?

It is not practical to consider all relationships at the same time. 

**You do not have to specify any variables at all.** In that case, the summary statistics are computed for all variables at once. Note, however, that the modelling will then be limited.

In [7]:
# Name of the column to use as the target; leave empty if there is none.
# e.g. target_variable = 'Score'
target_variable = ""

# Names of the other columns to focus on; may be left empty.
# e.g. other_variables_of_interest = ['GDP per capita', 'Social support']
other_variables_of_interest = list()

In [8]:
# SYSTEM CODE
# Turn the user's variable selection into a consistent state:
#   - both empty            -> leave both empty and only report it
#   - only other variables  -> use the first column of the dataset as target
#   - only target           -> use all columns of the dataset as other variables
# Afterwards the target never appears in other_variables_of_interest.

# Fail early, with a readable message, on names that are not columns of df;
# otherwise a typo only surfaces later as a KeyError when columns are selected.
requested_names = [target_variable] + list(other_variables_of_interest)
unknown_names = [name for name in requested_names if name and name not in df.columns]
if unknown_names:
    raise ValueError(
        f"Unknown column name(s): {unknown_names}. "
        f"Available columns: {df.columns.tolist()}"
    )

if not target_variable and not other_variables_of_interest:
    print('\033[91;1m' + "Both target_variable and other_variables_of_interest are empty." + '\033[0m')
elif not target_variable:
    print('\033[91;1m' + "Target variable has not been specified. Proceeding with the first column in the dataset." + '\033[0m')
    target_variable = df.columns[0]
elif not other_variables_of_interest:
    other_variables_of_interest = df.columns.tolist()
    print('\033[91;1m' + "Other variables of interest have not been specified. Proceeding with the whole dataset." + '\033[0m')

# Filter instead of list.remove(): remove() drops only the first occurrence,
# so a target listed twice would otherwise stay in the list.
other_variables_of_interest = [
    name for name in other_variables_of_interest if name != target_variable
]


[91;1mBoth target_variable and other_variables_of_interest are empty.[0m


In [None]:
# SYSTEM CODE
# Print numeric and categorical summary statistics for the selected columns,
# or for the whole dataset when no target variable has been chosen.

if target_variable and other_variables_of_interest:
    stats_input = df[[target_variable] + other_variables_of_interest]
elif target_variable:
    stats_input = df[[target_variable]]
else:
    stats_input = df

# Print in every case. The target-plus-variables case used to call the two
# functions without printing their results, unlike the other two cases.
print(summary_stats.numeric_statistics(stats_input))
print(summary_stats.categorical_statistics(stats_input))


In [None]:
# SYSTEM CODE
# Create the visualiser for the current variable selection and run plot_all().

visualiser = visualisations.Visualization(
    df,
    variables=other_variables_of_interest,
    target=target_variable,
)
visualiser.plot_all()

In [None]:
from app import relationship_detection

# Analyse the full DataFrame and print one line per entry of the returned mapping.
relationship_summary = relationship_detection.analyze_relationships(df)
for pair, details in relationship_summary.items():
    # Each value is a two-item sequence: the type of the pair and the measured value.
    type_pair, value = details
    first, second = pair[0], pair[1]
    print(f"Relationship between {first} and {second} ({type_pair}): {value}")

In [None]:
# Model to fit, e.g. 'random_forest', 'lasso' or 'ridge'.
model_type = ""

# Hyperparameter search method, e.g. 'grid_search', 'random_search',
# 'bayesian_optimization' or 'none' (e.g. for linear regression).
search_method = ""

# Column passed to the model as the target, e.g. target = 'Score'
target = ""

# Columns passed to the model as variables,
# e.g. modeling_variables = ['GDP per capita', 'Social support']
modeling_variables = list()

In [None]:
# SYSTEM CODE
# Build the regression model from the settings above, train it on a split of
# df, and print the value returned by evaluate().
from app import models

# NOTE(review): the same grid is passed whatever model_type is set — confirm
# that these keys are valid (or ignored) for models such as lasso and ridge.
search_space = {
    'n_estimators': [10, 20, 30, 150],
    'max_depth': [None, 10, 20, 30],
}

model = models.RegressionModel(
    model_type=model_type,
    hyperpar_grid=search_space,
    cv=10,
    n_iter=15,
    random_state=42,
)
model.split_data(df, variables=modeling_variables, target=target, test_size=0.2)
model.train(search_method=search_method)

mse = model.evaluate()
print(f"Mean Squared Error: {mse}")



In [None]:
# for custom predictions
# TODO: figure out imports and implement visualisations
custom_features = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]  # replace with your own values
model.predict(custom_features)