In [24]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn.metrics as metrics 
import statsmodels.api as sm
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectKBest, f_regression, f_classif
from sklearn.model_selection import train_test_split
from IPython.display import clear_output
from statsmodels.tsa.seasonal import seasonal_decompose
import re
import time, sys, os
from datetime import datetime

# TODO: fix the predictive modeling option and finish the help page; that should be all that remains.

### Functions
def delayprint(string1, delay=.01):
    """Print ``string1`` one character at a time, then end the line.

    Each character is written to ``sys.stdout`` and flushed right away so it
    becomes visible before the pause that follows it.

    Parameters
    ----------
    string1 : str
        Text to display.
    delay : float, optional
        Seconds to pause after each character. Defaults to 0.01, the value
        that was previously hard-coded. Zero or a negative value prints
        without pausing.
    """
    for c in string1:
        sys.stdout.write(c)
        sys.stdout.flush()
        # time.sleep rejects negative values, so treat them like "no delay".
        if delay > 0:
            time.sleep(delay)
    print()
# Credit to Andrew Walker https://stackoverflow.com/questions/9246076/how-to-print-one-character-at-a-time-on-one-line

In [25]:
def calcSeason(df, day, month, sale, period=12):
    """Plot an additive seasonal decomposition of the per-day average of ``sale``.

    The rows of ``df`` are averaged per (month, day) pair, each pair is mapped
    to a date in the current calendar year, and the averaged series is split
    into trend, seasonal and residual components. The original series and the
    three components are drawn as four stacked panels and shown.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    day : str
        Name of the day-of-month column (numeric values expected).
    month : str
        Name of the month column (numeric values expected).
    sale : str
        Name of the numeric column to average and decompose.
    period : int, optional
        Number of consecutive observations that make up one seasonal cycle,
        passed to ``seasonal_decompose``. Each observation here is one
        (month, day) pair. The default of 12 is the value that was previously
        hard-coded.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.

    Raises
    ------
    ValueError
        If fewer than ``2 * period`` valid (month, day) observations remain.
    """
    # Average only the requested column. Averaging the whole grouped frame
    # raises TypeError on pandas >= 2 as soon as any other column holds text.
    df_agg = df.groupby([month, day])[sale].mean().reset_index()

    current_year = datetime.now().year
    # errors='coerce' turns impossible dates (e.g. Feb 29 when the current
    # year is not a leap year) into NaT instead of aborting the whole report.
    df_agg['date'] = pd.to_datetime(
        {
            'year': current_year,
            'month': df_agg[month],
            'day': df_agg[day],
        },
        errors='coerce',
    )
    df_agg = df_agg.dropna(subset=['date']).reset_index(drop=True)

    # Fail early with an actionable message; the decomposition needs at
    # least two complete cycles of data.
    if len(df_agg) < 2 * period:
        raise ValueError(
            f"Seasonal decomposition with period={period} needs at least "
            f"{2 * period} valid (month, day) observations, but only "
            f"{len(df_agg)} were found. Check that the month and day columns "
            f"contain numeric calendar values."
        )

    result = seasonal_decompose(df_agg[sale], model='additive', period=period)

    # One row per panel: (values, legend label, colour, title, y-axis label).
    panels = [
        (df_agg[sale], 'Original', 'orange', 'Original Time Series', 'Target'),
        (result.trend, 'Trend', 'blue', 'Trend Component', 'Trend'),
        (result.seasonal, 'Seasonal', 'green', 'Seasonal Component', 'Seasonal'),
        (result.resid, 'Residuals', 'red', 'Residual Component', 'Residuals'),
    ]

    fig, axes = plt.subplots(4, 1, figsize=(18, 16))
    for ax, (values, label, color, title, ylabel) in zip(axes, panels):
        ax.plot(df_agg['date'], values, label=label, color=color)
        ax.set_title(title)
        ax.set_xlabel('Date')
        ax.set_ylabel(ylabel)
        ax.legend()

    fig.tight_layout()
    plt.show()

In [26]:
def selectData():
    """Ask the user for a CSV file and return its complete rows.

    The prompt is repeated until a file can be read, so a mistyped name no
    longer aborts the whole menu with a traceback.

    Returns
    -------
    pandas.DataFrame
        The file contents with every row that contains a missing value
        removed.
    """
    while True:
        # Strip stray spaces that commonly sneak in when a path is pasted.
        path = input("Please enter the data file you are using |EX: data.csv|:  ").strip()
        try:
            df = pd.read_csv(path)
        except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError) as err:
            print(f"Could not read '{path}' ({err}). Please try again.")
            continue
        return df.dropna()

In [27]:
def findDateColumns(df):
    """Return the column names of ``df`` that look date-related.

    A name qualifies when it contains "month", "day", "year" or "date",
    ignoring case. When nothing qualifies, a warning is printed and an empty
    list is returned.
    """
    date_pattern = r'\b(?:.*month.*|.*day.*|.*year.*|.*date.*)\b'
    matches = [
        column
        for column in df.columns.tolist()
        if re.search(date_pattern, column, re.IGNORECASE)
    ]
    if not matches:
        print("No date-like column detected, this data set may not be recommended for seasonality.")
    return matches

In [28]:
def findSalesColumns(df):
    """Return the column names of ``df`` that look sales-related.

    A name qualifies when it contains one of the keywords in the pattern
    below (sale, daily, money, earnings, profit, price, cost, cancel,
    booking), ignoring case. When nothing qualifies, a warning is printed and
    an empty list is returned.
    """
    looks_like_sales = re.compile(
        r'\b(?:.*sale.*|.*daily.*|.*money.*|.*earnings.*|.*profit.*|.*price.*|.*cost.*|.*cancel.*|.*booking.*)\b',
        re.IGNORECASE,
    ).search
    found = list(filter(looks_like_sales, df.columns.tolist()))
    if len(found) == 0:
        print("No sales-like column detected, this data set may not be recommended for seasonality.")
    return found

In [29]:
def findTargetColumns(df):
    """Return the column names of ``df`` that could serve as a target.

    Candidates are names containing sale, daily, money, earnings, profit,
    price, cost, cancel or booking, ignoring case. When there are none, a
    warning is printed and an empty list is returned.
    """
    keyword_regex = r'\b(?:.*sale.*|.*daily.*|.*money.*|.*earnings.*|.*profit.*|.*price.*|.*cost.*|.*cancel.*|.*booking.*)\b'
    candidates = [
        label
        for label in df.columns.tolist()
        if re.search(keyword_regex, label, re.IGNORECASE) is not None
    ]
    if not candidates:
        print("No target-like column detected, this data set may not be recommended for deciding factors.")
    return candidates

In [30]:
def findBestFeatures(df, target_column, k):
    """Rank the numeric features of ``df`` by their F-score against a target.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the target column and the candidate features.
    target_column : str
        Name of the column to score the features against.
    k : int
        Number of top features to return. Values larger than the number of
        usable features are reduced to that number.

    Returns
    -------
    pandas.DataFrame
        Columns "Features" and "Score", holding the ``k`` highest-scoring
        features in descending score order.

    Raises
    ------
    ValueError
        If ``df`` has no numeric or boolean column besides the target.
    """
    # f_regression works on numeric input only; a single text column would
    # otherwise make the fit fail, so restrict the candidates up front.
    x = df.drop(columns=[target_column]).select_dtypes(include=["number", "bool"])
    y = df[target_column]

    if x.shape[1] == 0:
        raise ValueError(
            f"No numeric feature columns are available to rank against '{target_column}'."
        )

    # Asking SelectKBest for more features than exist is rejected or warned
    # about depending on the scikit-learn version; cap k to stay predictable.
    k = min(k, x.shape[1])

    best_features = SelectKBest(score_func=f_regression, k=k)
    fit = best_features.fit(x, y)

    feature_scores = pd.DataFrame({"Features": x.columns, "Score": fit.scores_})

    topFeatures = feature_scores.nlargest(k, "Score")

    return topFeatures

In [31]:
def countFeatures(df, target_column):
    """Count the columns of ``df``, leaving out ``target_column`` when present."""
    total = len(df.columns)
    if target_column in df.columns:
        return total - 1
    return total

In [32]:
from sklearn.impute import SimpleImputer
def prediction(model, topFeatures, df):
    """Collect one value per top feature from the user and predict with ``model``.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    topFeatures : pandas.DataFrame
        Frame whose "Features" column lists the feature names to ask for, in
        the order the model expects them.
    df : pandas.DataFrame
        Data used only to show the observed min/max of each feature as a hint.

    Returns
    -------
    The value returned by ``model.predict`` for the single row of user input.
    """
    featureNames = topFeatures["Features"].tolist()
    userInputValues = {}
    for feature in featureNames:
        minVal = df[feature].min()
        maxVal = df[feature].max()
        print(f"| Recommended Value Range: {minVal}-{maxVal} |")
        # Keep asking until a usable number is given instead of crashing on
        # the first typo.
        while True:
            raw = input(f"Please enter value for {feature}: ")
            try:
                value = float(raw)
            except ValueError:
                print("Invalid input, please enter a number.")
                continue
            # float() accepts "nan" and "inf", which a model cannot use.
            if not np.isfinite(value):
                print("Invalid input, please enter a finite number.")
                continue
            userInputValues[feature] = value
            break

    # Every value is a validated float, so no imputation is needed. Passing
    # the DataFrame keeps the column names attached to the input row.
    xInput = pd.DataFrame([userInputValues], columns=featureNames)
    predicted = model.predict(xInput)

    return predicted

In [33]:
def exit():
    """Block until the user asks to leave the current screen, then clear it.

    The prompt reads "[e]xit", so both "e" and the full word "exit" are
    accepted (case-insensitive, surrounding spaces ignored). Any other answer
    prints an error and asks again.

    NOTE(review): this function shadows the built-in ``exit``; the name is
    kept because other cells call it.
    """
    while True:
        ans1 = input("Please type [e]xit to return to the main menu: ")
        if ans1.strip().lower() in ("e", "exit"):
            clear_output()
            return
        print("Invalid input, please try again.")

In [35]:
def _askColumn(df, prompt):
    """Prompt until the user names a column that exists in ``df``."""
    while True:
        raw = input(prompt)
        # Try the answer as typed first, then with surrounding spaces removed.
        for candidate in (raw, raw.strip()):
            if candidate in df.columns:
                return candidate
        print(f"Column '{raw.strip()}' was not found in the data, please try again.")


def _askInt(prompt, low, high):
    """Prompt until the user enters a whole number between ``low`` and ``high``."""
    while True:
        raw = input(prompt)
        try:
            value = int(raw)
        except ValueError:
            value = None
        if value is not None and low <= value <= high:
            return value
        print(f"Please enter a whole number between {low} and {high}.")


def main():
    """Run the interactive "Hotel Analytics Generator" text menu.

    The menu loops until option 5 is chosen and offers:

    1. Seasonality Report - load a CSV, ask for month/day/sales columns and
       plot the decomposition via ``calcSeason``.
    2. Predictive Model - load a CSV, fit a ``LinearRegression`` on the top
       (up to five) features for a chosen target and predict from user input.
    3. Deciding Factors - load a CSV and print the best-scoring features for
       a chosen target.
    4. Help Info - a sub-menu describing the options above.
    5. Exit.
    """
    ans = True
    while ans == True:
        print("Hotel Analytics Generator")
        print("----------------")
        delayprint("1. Seasonality Report")
        delayprint("2. Predictive Model") 
        delayprint("3. Deciding Factors")
        delayprint("4. Help Info")
        delayprint("5. Exit")
        print("----------------")
        print()
        option = input("Please enter the number beside the option you'd like to execute: ")

        if option == "1":
            clear_output()
            print("Seasonality Report")
            print("----------------")
            delayprint("Recommended testing sets (with .csv): | booking_set_1 | booking_set_2 |")
            df = selectData()
            delayprint("These are some recommended columns that may be needed:")
            delayprint(f"Possible sales columns: {findSalesColumns(df)}")
            delayprint(f"Possible date columns: {findDateColumns(df)}")
            # Validate each name up front so a typo re-prompts instead of
            # surfacing later as a KeyError.
            monthName = _askColumn(df, "Please enter the name of the month column of your data: ")
            dayName = _askColumn(df, "Please enter the name of the day of month column of your data: ")
            salesNum = _askColumn(df, "Please enter the name of a measureable daily/weekly/monthly sales value column: ")
            # calcSeason draws and shows the figure itself and returns
            # nothing, so no further plt.show call is needed here.
            calcSeason(df, dayName, monthName, salesNum)
            exit()

        elif option == "2": 
            clear_output()
            print("Predictive Model")
            print("----------------")
            delayprint("Recommended testing sets (with .csv): | booking_set_1 | booking_set_2 |")
            df = selectData()
            delayprint(f"Possible target columns: {findTargetColumns(df)}")
            target = _askColumn(df, "Please enter your desired target column: ")
            featureCount = countFeatures(df, target)
            if featureCount < 1:
                print("This data set has no feature columns besides the target.")
                exit()
                continue
            # Never ask for more features than the data set actually has.
            topFeatures = findBestFeatures(df, target, min(5, featureCount))
            x = df[topFeatures["Features"]]
            y = df[target]
            # The held-out 20% is not used below; the split is kept so the
            # model is trained on the same rows as before.
            x_train, _, y_train, _ = train_test_split(x, y, test_size=0.2, random_state=1)
            model = LinearRegression()
            model.fit(x_train, y_train)
            delayprint("DISCLAIMER: When entering numerical data, be sure to use 0 or 1 for most categories. (Please view data table for a general idea)")
            print(f"The prediction for {target} is: {prediction(model, topFeatures, df)}")
            exit()

        elif option == "3": 
            clear_output()
            print("Deciding Factors")
            print("----------------")
            delayprint("Recommended testing sets (with .csv): | booking_set_1 | booking_set_2 | is_canceled_1 | is_canceled_2|")
            df = selectData()
            delayprint(f"Possible target columns: {findTargetColumns(df)}")
            target = _askColumn(df, "Please enter your desired target column: ")
            featureCount = countFeatures(df, target)
            delayprint(f"There are {featureCount} features other than {target} in this dataframe.")
            if featureCount < 1:
                print("This data set has no feature columns besides the target.")
                exit()
                continue
            numOfF = _askInt(
                "Please enter the number of features you'd like to rank (recommended <= 15): ",
                1,
                featureCount,
            )
            print(findBestFeatures(df, target, numOfF))
            exit()

        elif option == "4":
            # The screen is cleared at the top of every pass, so an error
            # message has to be carried over and printed after the clear.
            helpNotice = ""
            inHelp = True
            while inHelp:
                clear_output()
                if helpNotice:
                    print()
                    print(helpNotice)
                    print()
                    helpNotice = ""
                print("Help Page (Descriptions)")
                print("----------------")
                delayprint("1. Seasonality Report")
                delayprint("2. Predictive Model") 
                delayprint("3. Deciding Factors")
                delayprint("4. Exit")
                helpAns = input("Please enter the number beside the option to get a comprehensive description of the option.")

                if helpAns == "1":
                    print()
                    delayprint("The seasonality report will ask for 4 inputs. ")
                    print("|Input 1 = data.csv| ")
                    print("|Input 2 = month_column_name|")
                    print("|Input 3 = day_of_month_column_name|")
                    print("|Input 4 = measure_of_daily_sales_column_name|")
                    print("It will then generate 4 graphs based on the information entered showing the original series plus trend, seasonal, and residual plots.")
                    exit()

                elif helpAns == "2":
                    print()
                    delayprint("The predictive model will ask for 7 inputs.")
                    print("|Input 1 = data.csv| ")
                    print("|Input 2 = target_column|")
                    print("|Input 3 = best_feature_value_1|")
                    print("|Input 4 = best_feature_value_2|")
                    print("|Input 5 = best_feature_value_3|")
                    print("|Input 6 = best_feature_value_4|")
                    print("|Input 7 = best_feature_value_5|")
                    print("It will then predict the value of the target column based off of the values the user has input based off of the top 5 recommended values.")
                    exit()

                elif helpAns == "3":
                    print()
                    delayprint("The deciding factors option assists the user in finding the best features to any specific target.")
                    print("|Input 1 = data.csv| ")
                    print("|Input 2 = target_column|")
                    print("|Input 3 = num_of_features_you_want| (Not recommended to go above 15)")
                    print("It will take the target input and rank the number of features selected in input 3.")
                    exit()

                elif helpAns == "4":
                    inHelp = False
                    delayprint("Exiting.......")
                    clear_output()

                else:
                    helpNotice = "Invalid input, please try again."

        elif option == "5":
            ans = False
            print()
            clear_output()
            delayprint("Exiting.......")
            time.sleep(1) 
            print("Goodbye!")
        else:
            clear_output()
            print()
            print("Invalid input, please try again.") 
            print()
    

# Start the interactive menu; this call blocks until the user picks "5. Exit".
main()

Exiting.......
Goodbye!
